1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw %s -o - | FileCheck %s
3
4define <16 x i16> @test_16xi16_perm_mask0(<16 x i16> %vec) {
5; CHECK-LABEL: test_16xi16_perm_mask0:
6; CHECK:       # %bb.0:
7; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
8; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0
9; CHECK-NEXT:    retq
10  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
11  ret <16 x i16> %res
12}
13define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
14; CHECK-LABEL: test_masked_16xi16_perm_mask0:
15; CHECK:       # %bb.0:
16; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
17; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
18; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
19; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
20; CHECK-NEXT:    retq
21  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
22  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
23  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
24  ret <16 x i16> %res
25}
26
27define <16 x i16> @test_masked_z_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %mask) {
28; CHECK-LABEL: test_masked_z_16xi16_perm_mask0:
29; CHECK:       # %bb.0:
30; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
31; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
32; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
33; CHECK-NEXT:    retq
34  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
35  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
36  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
37  ret <16 x i16> %res
38}
39define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
40; CHECK-LABEL: test_masked_16xi16_perm_mask1:
41; CHECK:       # %bb.0:
42; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
43; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
44; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
45; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
46; CHECK-NEXT:    retq
47  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
48  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
49  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
50  ret <16 x i16> %res
51}
52
53define <16 x i16> @test_masked_z_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %mask) {
54; CHECK-LABEL: test_masked_z_16xi16_perm_mask1:
55; CHECK:       # %bb.0:
56; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
57; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
58; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
59; CHECK-NEXT:    retq
60  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
61  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
62  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
63  ret <16 x i16> %res
64}
65define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
66; CHECK-LABEL: test_masked_16xi16_perm_mask2:
67; CHECK:       # %bb.0:
68; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
69; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
70; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
71; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
72; CHECK-NEXT:    retq
73  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
74  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
75  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
76  ret <16 x i16> %res
77}
78
79define <16 x i16> @test_masked_z_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %mask) {
80; CHECK-LABEL: test_masked_z_16xi16_perm_mask2:
81; CHECK:       # %bb.0:
82; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
83; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
84; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
85; CHECK-NEXT:    retq
86  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
87  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
88  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
89  ret <16 x i16> %res
90}
91define <16 x i16> @test_16xi16_perm_mask3(<16 x i16> %vec) {
92; CHECK-LABEL: test_16xi16_perm_mask3:
93; CHECK:       # %bb.0:
94; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
95; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0
96; CHECK-NEXT:    retq
97  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
98  ret <16 x i16> %res
99}
100define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
101; CHECK-LABEL: test_masked_16xi16_perm_mask3:
102; CHECK:       # %bb.0:
103; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
104; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
105; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
106; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
107; CHECK-NEXT:    retq
108  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
109  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
110  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
111  ret <16 x i16> %res
112}
113
114define <16 x i16> @test_masked_z_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %mask) {
115; CHECK-LABEL: test_masked_z_16xi16_perm_mask3:
116; CHECK:       # %bb.0:
117; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
118; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
119; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
120; CHECK-NEXT:    retq
121  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
122  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
123  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
124  ret <16 x i16> %res
125}
126define <16 x i16> @test_16xi16_perm_mem_mask0(<16 x i16>* %vp) {
127; CHECK-LABEL: test_16xi16_perm_mem_mask0:
128; CHECK:       # %bb.0:
129; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
130; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0
131; CHECK-NEXT:    retq
132  %vec = load <16 x i16>, <16 x i16>* %vp
133  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
134  ret <16 x i16> %res
135}
136define <16 x i16> @test_masked_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
137; CHECK-LABEL: test_masked_16xi16_perm_mem_mask0:
138; CHECK:       # %bb.0:
139; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
140; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
141; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
142; CHECK-NEXT:    retq
143  %vec = load <16 x i16>, <16 x i16>* %vp
144  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
145  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
146  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
147  ret <16 x i16> %res
148}
149
150define <16 x i16> @test_masked_z_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16> %mask) {
151; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask0:
152; CHECK:       # %bb.0:
153; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
154; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
155; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
156; CHECK-NEXT:    retq
157  %vec = load <16 x i16>, <16 x i16>* %vp
158  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
159  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
160  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
161  ret <16 x i16> %res
162}
163
164define <16 x i16> @test_masked_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
165; CHECK-LABEL: test_masked_16xi16_perm_mem_mask1:
166; CHECK:       # %bb.0:
167; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
168; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
169; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
170; CHECK-NEXT:    retq
171  %vec = load <16 x i16>, <16 x i16>* %vp
172  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
173  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
174  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
175  ret <16 x i16> %res
176}
177
178define <16 x i16> @test_masked_z_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16> %mask) {
179; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask1:
180; CHECK:       # %bb.0:
181; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
182; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
183; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
184; CHECK-NEXT:    retq
185  %vec = load <16 x i16>, <16 x i16>* %vp
186  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
187  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
188  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
189  ret <16 x i16> %res
190}
191
192define <16 x i16> @test_masked_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
193; CHECK-LABEL: test_masked_16xi16_perm_mem_mask2:
194; CHECK:       # %bb.0:
195; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
196; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
197; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
198; CHECK-NEXT:    retq
199  %vec = load <16 x i16>, <16 x i16>* %vp
200  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
201  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
202  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
203  ret <16 x i16> %res
204}
205
206define <16 x i16> @test_masked_z_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16> %mask) {
207; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask2:
208; CHECK:       # %bb.0:
209; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
210; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
211; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
212; CHECK-NEXT:    retq
213  %vec = load <16 x i16>, <16 x i16>* %vp
214  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
215  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
216  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
217  ret <16 x i16> %res
218}
219
220define <16 x i16> @test_16xi16_perm_mem_mask3(<16 x i16>* %vp) {
221; CHECK-LABEL: test_16xi16_perm_mem_mask3:
222; CHECK:       # %bb.0:
223; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
224; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0
225; CHECK-NEXT:    retq
226  %vec = load <16 x i16>, <16 x i16>* %vp
227  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
228  ret <16 x i16> %res
229}
230define <16 x i16> @test_masked_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
231; CHECK-LABEL: test_masked_16xi16_perm_mem_mask3:
232; CHECK:       # %bb.0:
233; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
234; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
235; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
236; CHECK-NEXT:    retq
237  %vec = load <16 x i16>, <16 x i16>* %vp
238  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
239  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
240  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
241  ret <16 x i16> %res
242}
243
244define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16> %mask) {
245; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask3:
246; CHECK:       # %bb.0:
247; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
248; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
249; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
250; CHECK-NEXT:    retq
251  %vec = load <16 x i16>, <16 x i16>* %vp
252  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
253  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
254  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
255  ret <16 x i16> %res
256}
257
258define <32 x i16> @test_32xi16_perm_mask0(<32 x i16> %vec) {
259; CHECK-LABEL: test_32xi16_perm_mask0:
260; CHECK:       # %bb.0:
261; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10]
262; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0
263; CHECK-NEXT:    retq
264  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
265  ret <32 x i16> %res
266}
267define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
268; CHECK-LABEL: test_masked_32xi16_perm_mask0:
269; CHECK:       # %bb.0:
270; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10]
271; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
272; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
273; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
274; CHECK-NEXT:    retq
275  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
276  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
277  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
278  ret <32 x i16> %res
279}
280
281define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %mask) {
282; CHECK-LABEL: test_masked_z_32xi16_perm_mask0:
283; CHECK:       # %bb.0:
284; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10]
285; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
286; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
287; CHECK-NEXT:    retq
288  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
289  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
290  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
291  ret <32 x i16> %res
292}
293define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
294; CHECK-LABEL: test_masked_32xi16_perm_mask1:
295; CHECK:       # %bb.0:
296; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16]
297; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
298; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
299; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
300; CHECK-NEXT:    retq
301  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
302  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
303  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
304  ret <32 x i16> %res
305}
306
307define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %mask) {
308; CHECK-LABEL: test_masked_z_32xi16_perm_mask1:
309; CHECK:       # %bb.0:
310; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16]
311; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
312; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
313; CHECK-NEXT:    retq
314  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
315  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
316  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
317  ret <32 x i16> %res
318}
319define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
320; CHECK-LABEL: test_masked_32xi16_perm_mask2:
321; CHECK:       # %bb.0:
322; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27]
323; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
324; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
325; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
326; CHECK-NEXT:    retq
327  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
328  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
329  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
330  ret <32 x i16> %res
331}
332
333define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %mask) {
334; CHECK-LABEL: test_masked_z_32xi16_perm_mask2:
335; CHECK:       # %bb.0:
336; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27]
337; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
338; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
339; CHECK-NEXT:    retq
340  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
341  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
342  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
343  ret <32 x i16> %res
344}
345define <32 x i16> @test_32xi16_perm_mask3(<32 x i16> %vec) {
346; CHECK-LABEL: test_32xi16_perm_mask3:
347; CHECK:       # %bb.0:
348; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4]
349; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0
350; CHECK-NEXT:    retq
351  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
352  ret <32 x i16> %res
353}
354define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
355; CHECK-LABEL: test_masked_32xi16_perm_mask3:
356; CHECK:       # %bb.0:
357; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4]
358; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
359; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
360; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
361; CHECK-NEXT:    retq
362  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
363  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
364  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
365  ret <32 x i16> %res
366}
367
368define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %mask) {
369; CHECK-LABEL: test_masked_z_32xi16_perm_mask3:
370; CHECK:       # %bb.0:
371; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4]
372; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
373; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
374; CHECK-NEXT:    retq
375  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
376  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
377  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
378  ret <32 x i16> %res
379}
380define <32 x i16> @test_32xi16_perm_mem_mask0(<32 x i16>* %vp) {
381; CHECK-LABEL: test_32xi16_perm_mem_mask0:
382; CHECK:       # %bb.0:
383; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12]
384; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0
385; CHECK-NEXT:    retq
386  %vec = load <32 x i16>, <32 x i16>* %vp
387  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
388  ret <32 x i16> %res
389}
390define <32 x i16> @test_masked_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
391; CHECK-LABEL: test_masked_32xi16_perm_mem_mask0:
392; CHECK:       # %bb.0:
393; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12]
394; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
395; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
396; CHECK-NEXT:    retq
397  %vec = load <32 x i16>, <32 x i16>* %vp
398  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
399  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
400  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
401  ret <32 x i16> %res
402}
403
404define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) {
405; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask0:
406; CHECK:       # %bb.0:
407; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12]
408; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
409; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
410; CHECK-NEXT:    retq
411  %vec = load <32 x i16>, <32 x i16>* %vp
412  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
413  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
414  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
415  ret <32 x i16> %res
416}
417
418define <32 x i16> @test_masked_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
419; CHECK-LABEL: test_masked_32xi16_perm_mem_mask1:
420; CHECK:       # %bb.0:
421; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6]
422; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
423; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
424; CHECK-NEXT:    retq
425  %vec = load <32 x i16>, <32 x i16>* %vp
426  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 31, i32 20, i32 2, i32 2, i32 23, i32 1, i32 0, i32 12, i32 16, i32 14, i32 15, i32 18, i32 21, i32 13, i32 11, i32 31, i32 8, i32 24, i32 13, i32 11, i32 2, i32 27, i32 22, i32 28, i32 14, i32 21, i32 3, i32 12, i32 6, i32 1, i32 30, i32 6>
427  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
428  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
429  ret <32 x i16> %res
430}
431
432define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) {
433; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask1:
434; CHECK:       # %bb.0:
435; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6]
436; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
437; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
438; CHECK-NEXT:    retq
439  %vec = load <32 x i16>, <32 x i16>* %vp
440  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 31, i32 20, i32 2, i32 2, i32 23, i32 1, i32 0, i32 12, i32 16, i32 14, i32 15, i32 18, i32 21, i32 13, i32 11, i32 31, i32 8, i32 24, i32 13, i32 11, i32 2, i32 27, i32 22, i32 28, i32 14, i32 21, i32 3, i32 12, i32 6, i32 1, i32 30, i32 6>
441  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
442  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
443  ret <32 x i16> %res
444}
445
446define <32 x i16> @test_masked_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
447; CHECK-LABEL: test_masked_32xi16_perm_mem_mask2:
448; CHECK:       # %bb.0:
449; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25]
450; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
451; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
452; CHECK-NEXT:    retq
453  %vec = load <32 x i16>, <32 x i16>* %vp
454  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 4, i32 6, i32 12, i32 17, i32 4, i32 31, i32 31, i32 4, i32 12, i32 21, i32 28, i32 15, i32 29, i32 10, i32 15, i32 15, i32 21, i32 6, i32 19, i32 7, i32 10, i32 30, i32 28, i32 26, i32 1, i32 4, i32 8, i32 25, i32 26, i32 18, i32 22, i32 25>
455  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
456  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
457  ret <32 x i16> %res
458}
459
460define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) {
461; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask2:
462; CHECK:       # %bb.0:
463; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25]
464; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
465; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
466; CHECK-NEXT:    retq
467  %vec = load <32 x i16>, <32 x i16>* %vp
468  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 4, i32 6, i32 12, i32 17, i32 4, i32 31, i32 31, i32 4, i32 12, i32 21, i32 28, i32 15, i32 29, i32 10, i32 15, i32 15, i32 21, i32 6, i32 19, i32 7, i32 10, i32 30, i32 28, i32 26, i32 1, i32 4, i32 8, i32 25, i32 26, i32 18, i32 22, i32 25>
469  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
470  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
471  ret <32 x i16> %res
472}
473
474define <32 x i16> @test_32xi16_perm_mem_mask3(<32 x i16>* %vp) {
475; CHECK-LABEL: test_32xi16_perm_mem_mask3:
476; CHECK:       # %bb.0:
477; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27]
478; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0
479; CHECK-NEXT:    retq
480  %vec = load <32 x i16>, <32 x i16>* %vp
481  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
482  ret <32 x i16> %res
483}
484define <32 x i16> @test_masked_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
485; CHECK-LABEL: test_masked_32xi16_perm_mem_mask3:
486; CHECK:       # %bb.0:
487; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27]
488; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
489; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
490; CHECK-NEXT:    retq
491  %vec = load <32 x i16>, <32 x i16>* %vp
492  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
493  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
494  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
495  ret <32 x i16> %res
496}
497
498define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) {
499; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask3:
500; CHECK:       # %bb.0:
501; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27]
502; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
503; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
504; CHECK-NEXT:    retq
505  %vec = load <32 x i16>, <32 x i16>* %vp
506  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
507  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
508  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
509  ret <32 x i16> %res
510}
511
512define <8 x i32> @test_8xi32_perm_mask0(<8 x i32> %vec) {
513; CHECK-LABEL: test_8xi32_perm_mask0:
514; CHECK:       # %bb.0:
515; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6]
516; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
517; CHECK-NEXT:    retq
518  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
519  ret <8 x i32> %res
520}
521define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
522; CHECK-LABEL: test_masked_8xi32_perm_mask0:
523; CHECK:       # %bb.0:
524; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6]
525; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
526; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
527; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
528; CHECK-NEXT:    retq
529  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
530  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
531  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
532  ret <8 x i32> %res
533}
534
535define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) {
536; CHECK-LABEL: test_masked_z_8xi32_perm_mask0:
537; CHECK:       # %bb.0:
538; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6]
539; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
540; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
541; CHECK-NEXT:    retq
542  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
543  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
544  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
545  ret <8 x i32> %res
546}
547define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
548; CHECK-LABEL: test_masked_8xi32_perm_mask1:
549; CHECK:       # %bb.0:
550; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3]
551; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
552; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
553; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
554; CHECK-NEXT:    retq
555  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
556  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
557  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
558  ret <8 x i32> %res
559}
560
561define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) {
562; CHECK-LABEL: test_masked_z_8xi32_perm_mask1:
563; CHECK:       # %bb.0:
564; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3]
565; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
566; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
567; CHECK-NEXT:    retq
568  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
569  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
570  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
571  ret <8 x i32> %res
572}
573define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
574; CHECK-LABEL: test_masked_8xi32_perm_mask2:
575; CHECK:       # %bb.0:
576; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4]
577; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
578; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
579; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
580; CHECK-NEXT:    retq
581  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
582  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
583  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
584  ret <8 x i32> %res
585}
586
587define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) {
588; CHECK-LABEL: test_masked_z_8xi32_perm_mask2:
589; CHECK:       # %bb.0:
590; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4]
591; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
592; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
593; CHECK-NEXT:    retq
594  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
595  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
596  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
597  ret <8 x i32> %res
598}
599define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) {
600; CHECK-LABEL: test_8xi32_perm_mask3:
601; CHECK:       # %bb.0:
602; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0]
603; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
604; CHECK-NEXT:    retq
605  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
606  ret <8 x i32> %res
607}
608define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
609; CHECK-LABEL: test_masked_8xi32_perm_mask3:
610; CHECK:       # %bb.0:
611; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0]
612; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
613; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
614; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
615; CHECK-NEXT:    retq
616  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
617  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
618  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
619  ret <8 x i32> %res
620}
621
622define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) {
623; CHECK-LABEL: test_masked_z_8xi32_perm_mask3:
624; CHECK:       # %bb.0:
625; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0]
626; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
627; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
628; CHECK-NEXT:    retq
629  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
630  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
631  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
632  ret <8 x i32> %res
633}
634define <8 x i32> @test_8xi32_perm_mem_mask0(<8 x i32>* %vp) {
635; CHECK-LABEL: test_8xi32_perm_mem_mask0:
636; CHECK:       # %bb.0:
637; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5]
638; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0
639; CHECK-NEXT:    retq
640  %vec = load <8 x i32>, <8 x i32>* %vp
641  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
642  ret <8 x i32> %res
643}
644define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
645; CHECK-LABEL: test_masked_8xi32_perm_mem_mask0:
646; CHECK:       # %bb.0:
647; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5]
648; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
649; CHECK-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1}
650; CHECK-NEXT:    retq
651  %vec = load <8 x i32>, <8 x i32>* %vp
652  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
653  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
654  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
655  ret <8 x i32> %res
656}
657
658define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %mask) {
659; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask0:
660; CHECK:       # %bb.0:
661; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5]
662; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
663; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
664; CHECK-NEXT:    retq
665  %vec = load <8 x i32>, <8 x i32>* %vp
666  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
667  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
668  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
669  ret <8 x i32> %res
670}
671
672define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
673; CHECK-LABEL: test_masked_8xi32_perm_mem_mask1:
674; CHECK:       # %bb.0:
675; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5]
676; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
677; CHECK-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1}
678; CHECK-NEXT:    retq
679  %vec = load <8 x i32>, <8 x i32>* %vp
680  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 6, i32 1, i32 7, i32 6, i32 7, i32 6, i32 5>
681  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
682  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
683  ret <8 x i32> %res
684}
685
686define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %mask) {
687; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask1:
688; CHECK:       # %bb.0:
689; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5]
690; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
691; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
692; CHECK-NEXT:    retq
693  %vec = load <8 x i32>, <8 x i32>* %vp
694  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 6, i32 1, i32 7, i32 6, i32 7, i32 6, i32 5>
695  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
696  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
697  ret <8 x i32> %res
698}
699
700define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
701; CHECK-LABEL: test_masked_8xi32_perm_mem_mask2:
702; CHECK:       # %bb.0:
703; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3]
704; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
705; CHECK-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1}
706; CHECK-NEXT:    retq
707  %vec = load <8 x i32>, <8 x i32>* %vp
708  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 4, i32 6, i32 1, i32 6, i32 3, i32 6, i32 3>
709  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
710  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
711  ret <8 x i32> %res
712}
713
714define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %mask) {
715; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask2:
716; CHECK:       # %bb.0:
717; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3]
718; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
719; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
720; CHECK-NEXT:    retq
721  %vec = load <8 x i32>, <8 x i32>* %vp
722  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 4, i32 6, i32 1, i32 6, i32 3, i32 6, i32 3>
723  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
724  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
725  ret <8 x i32> %res
726}
727
728define <8 x i32> @test_8xi32_perm_mem_mask3(<8 x i32>* %vp) {
729; CHECK-LABEL: test_8xi32_perm_mem_mask3:
730; CHECK:       # %bb.0:
731; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5]
732; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0
733; CHECK-NEXT:    retq
734  %vec = load <8 x i32>, <8 x i32>* %vp
735  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
736  ret <8 x i32> %res
737}
738define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
739; CHECK-LABEL: test_masked_8xi32_perm_mem_mask3:
740; CHECK:       # %bb.0:
741; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5]
742; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
743; CHECK-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1}
744; CHECK-NEXT:    retq
745  %vec = load <8 x i32>, <8 x i32>* %vp
746  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
747  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
748  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
749  ret <8 x i32> %res
750}
751
752define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %mask) {
753; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask3:
754; CHECK:       # %bb.0:
755; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5]
756; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
757; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
758; CHECK-NEXT:    retq
759  %vec = load <8 x i32>, <8 x i32>* %vp
760  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
761  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
762  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
763  ret <8 x i32> %res
764}
765
766define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec) {
767; CHECK-LABEL: test_16xi32_perm_mask0:
768; CHECK:       # %bb.0:
769; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7]
770; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
771; CHECK-NEXT:    retq
772  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
773  ret <16 x i32> %res
774}
775define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
776; CHECK-LABEL: test_masked_16xi32_perm_mask0:
777; CHECK:       # %bb.0:
778; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7]
779; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
780; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
781; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
782; CHECK-NEXT:    retq
783  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
784  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
785  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
786  ret <16 x i32> %res
787}
788
789define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
790; CHECK-LABEL: test_masked_z_16xi32_perm_mask0:
791; CHECK:       # %bb.0:
792; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7]
793; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
794; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
795; CHECK-NEXT:    retq
796  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
797  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
798  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
799  ret <16 x i32> %res
800}
801define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
802; CHECK-LABEL: test_masked_16xi32_perm_mask1:
803; CHECK:       # %bb.0:
804; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3]
805; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
806; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
807; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
808; CHECK-NEXT:    retq
809  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
810  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
811  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
812  ret <16 x i32> %res
813}
814
815define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) {
816; CHECK-LABEL: test_masked_z_16xi32_perm_mask1:
817; CHECK:       # %bb.0:
818; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3]
819; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
820; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
821; CHECK-NEXT:    retq
822  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
823  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
824  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
825  ret <16 x i32> %res
826}
827define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
828; CHECK-LABEL: test_masked_16xi32_perm_mask2:
829; CHECK:       # %bb.0:
830; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5]
831; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
832; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
833; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
834; CHECK-NEXT:    retq
835  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
836  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
837  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
838  ret <16 x i32> %res
839}
840
841define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) {
842; CHECK-LABEL: test_masked_z_16xi32_perm_mask2:
843; CHECK:       # %bb.0:
844; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5]
845; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
846; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
847; CHECK-NEXT:    retq
848  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
849  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
850  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
851  ret <16 x i32> %res
852}
853define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) {
854; CHECK-LABEL: test_16xi32_perm_mask3:
855; CHECK:       # %bb.0:
856; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12]
857; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
858; CHECK-NEXT:    retq
859  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
860  ret <16 x i32> %res
861}
862define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
863; CHECK-LABEL: test_masked_16xi32_perm_mask3:
864; CHECK:       # %bb.0:
865; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12]
866; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
867; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
868; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
869; CHECK-NEXT:    retq
870  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
871  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
872  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
873  ret <16 x i32> %res
874}
875
876define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) {
877; CHECK-LABEL: test_masked_z_16xi32_perm_mask3:
878; CHECK:       # %bb.0:
879; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12]
880; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
881; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
882; CHECK-NEXT:    retq
883  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
884  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
885  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
886  ret <16 x i32> %res
887}
888define <16 x i32> @test_16xi32_perm_mem_mask0(<16 x i32>* %vp) {
889; CHECK-LABEL: test_16xi32_perm_mem_mask0:
890; CHECK:       # %bb.0:
891; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6]
892; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0
893; CHECK-NEXT:    retq
894  %vec = load <16 x i32>, <16 x i32>* %vp
895  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
896  ret <16 x i32> %res
897}
898define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
899; CHECK-LABEL: test_masked_16xi32_perm_mem_mask0:
900; CHECK:       # %bb.0:
901; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6]
902; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
903; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
904; CHECK-NEXT:    retq
905  %vec = load <16 x i32>, <16 x i32>* %vp
906  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
907  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
908  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
909  ret <16 x i32> %res
910}
911
912define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) {
913; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask0:
914; CHECK:       # %bb.0:
915; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6]
916; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
917; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
918; CHECK-NEXT:    retq
919  %vec = load <16 x i32>, <16 x i32>* %vp
920  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
921  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
922  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
923  ret <16 x i32> %res
924}
925
926define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
927; CHECK-LABEL: test_masked_16xi32_perm_mem_mask1:
928; CHECK:       # %bb.0:
929; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3]
930; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
931; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
932; CHECK-NEXT:    retq
933  %vec = load <16 x i32>, <16 x i32>* %vp
934  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 5, i32 3, i32 4, i32 7, i32 15, i32 12, i32 4, i32 8, i32 11, i32 12, i32 7, i32 6, i32 12, i32 6, i32 3>
935  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
936  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
937  ret <16 x i32> %res
938}
939
940define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) {
941; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask1:
942; CHECK:       # %bb.0:
943; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3]
944; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
945; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
946; CHECK-NEXT:    retq
947  %vec = load <16 x i32>, <16 x i32>* %vp
948  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 5, i32 3, i32 4, i32 7, i32 15, i32 12, i32 4, i32 8, i32 11, i32 12, i32 7, i32 6, i32 12, i32 6, i32 3>
949  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
950  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
951  ret <16 x i32> %res
952}
953
954define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
955; CHECK-LABEL: test_masked_16xi32_perm_mem_mask2:
956; CHECK:       # %bb.0:
957; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2]
958; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
959; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
960; CHECK-NEXT:    retq
961  %vec = load <16 x i32>, <16 x i32>* %vp
962  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 14, i32 2, i32 7, i32 10, i32 7, i32 3, i32 0, i32 11, i32 9, i32 0, i32 4, i32 12, i32 10, i32 8, i32 2>
963  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
964  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
965  ret <16 x i32> %res
966}
967
968define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) {
969; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask2:
970; CHECK:       # %bb.0:
971; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2]
972; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
973; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
974; CHECK-NEXT:    retq
975  %vec = load <16 x i32>, <16 x i32>* %vp
976  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 14, i32 2, i32 7, i32 10, i32 7, i32 3, i32 0, i32 11, i32 9, i32 0, i32 4, i32 12, i32 10, i32 8, i32 2>
977  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
978  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
979  ret <16 x i32> %res
980}
981
982define <16 x i32> @test_16xi32_perm_mem_mask3(<16 x i32>* %vp) {
983; CHECK-LABEL: test_16xi32_perm_mem_mask3:
984; CHECK:       # %bb.0:
985; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1]
986; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0
987; CHECK-NEXT:    retq
988  %vec = load <16 x i32>, <16 x i32>* %vp
989  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
990  ret <16 x i32> %res
991}
992define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
993; CHECK-LABEL: test_masked_16xi32_perm_mem_mask3:
994; CHECK:       # %bb.0:
995; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1]
996; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
997; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
998; CHECK-NEXT:    retq
999  %vec = load <16 x i32>, <16 x i32>* %vp
1000  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
1001  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
1002  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
1003  ret <16 x i32> %res
1004}
1005
1006define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) {
1007; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask3:
1008; CHECK:       # %bb.0:
1009; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1]
1010; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
1011; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
1012; CHECK-NEXT:    retq
1013  %vec = load <16 x i32>, <16 x i32>* %vp
1014  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
1015  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
1016  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
1017  ret <16 x i32> %res
1018}
1019
1020define <4 x i64> @test_4xi64_perm_mask0(<4 x i64> %vec) {
1021; CHECK-LABEL: test_4xi64_perm_mask0:
1022; CHECK:       # %bb.0:
1023; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1]
1024; CHECK-NEXT:    retq
1025  %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
1026  ret <4 x i64> %res
1027}
1028define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1029; CHECK-LABEL: test_masked_4xi64_perm_mask0:
1030; CHECK:       # %bb.0:
1031; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
1032; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1]
1033; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
1034; CHECK-NEXT:    retq
1035  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
1036  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1037  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
1038  ret <4 x i64> %res
1039}
1040
1041define <4 x i64> @test_masked_z_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %mask) {
1042; CHECK-LABEL: test_masked_z_4xi64_perm_mask0:
1043; CHECK:       # %bb.0:
1044; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
1045; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1]
1046; CHECK-NEXT:    retq
1047  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
1048  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1049  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
1050  ret <4 x i64> %res
1051}
1052define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1053; CHECK-LABEL: test_masked_4xi64_perm_mask1:
1054; CHECK:       # %bb.0:
1055; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
1056; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3]
1057; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
1058; CHECK-NEXT:    retq
1059  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
1060  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1061  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
1062  ret <4 x i64> %res
1063}
1064
1065define <4 x i64> @test_masked_z_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %mask) {
1066; CHECK-LABEL: test_masked_z_4xi64_perm_mask1:
1067; CHECK:       # %bb.0:
1068; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
1069; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3]
1070; CHECK-NEXT:    retq
1071  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
1072  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1073  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
1074  ret <4 x i64> %res
1075}
1076define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1077; CHECK-LABEL: test_masked_4xi64_perm_mask2:
1078; CHECK:       # %bb.0:
1079; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
1080; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1]
1081; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
1082; CHECK-NEXT:    retq
1083  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
1084  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1085  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
1086  ret <4 x i64> %res
1087}
1088
1089define <4 x i64> @test_masked_z_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %mask) {
1090; CHECK-LABEL: test_masked_z_4xi64_perm_mask2:
1091; CHECK:       # %bb.0:
1092; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
1093; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1]
1094; CHECK-NEXT:    retq
1095  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
1096  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1097  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
1098  ret <4 x i64> %res
1099}
1100define <4 x i64> @test_4xi64_perm_mask3(<4 x i64> %vec) {
1101; CHECK-LABEL: test_4xi64_perm_mask3:
1102; CHECK:       # %bb.0:
1103; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
1104; CHECK-NEXT:    retq
1105  %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
1106  ret <4 x i64> %res
1107}
1108define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1109; CHECK-LABEL: test_masked_4xi64_perm_mask3:
1110; CHECK:       # %bb.0:
1111; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
1112; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3]
1113; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
1114; CHECK-NEXT:    retq
1115  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
1116  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1117  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
1118  ret <4 x i64> %res
1119}
1120
1121define <4 x i64> @test_masked_z_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %mask) {
1122; CHECK-LABEL: test_masked_z_4xi64_perm_mask3:
1123; CHECK:       # %bb.0:
1124; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
1125; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3]
1126; CHECK-NEXT:    retq
1127  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
1128  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1129  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
1130  ret <4 x i64> %res
1131}
1132define <4 x i64> @test_4xi64_perm_mem_mask0(<4 x i64>* %vp) {
1133; CHECK-LABEL: test_4xi64_perm_mem_mask0:
1134; CHECK:       # %bb.0:
1135; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,1,2,0]
1136; CHECK-NEXT:    retq
1137  %vec = load <4 x i64>, <4 x i64>* %vp
1138  %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
1139  ret <4 x i64> %res
1140}
1141define <4 x i64> @test_masked_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
1142; CHECK-LABEL: test_masked_4xi64_perm_mem_mask0:
1143; CHECK:       # %bb.0:
1144; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
1145; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0]
1146; CHECK-NEXT:    retq
1147  %vec = load <4 x i64>, <4 x i64>* %vp
1148  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
1149  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1150  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
1151  ret <4 x i64> %res
1152}
1153
1154define <4 x i64> @test_masked_z_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %mask) {
1155; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask0:
1156; CHECK:       # %bb.0:
1157; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
1158; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0]
1159; CHECK-NEXT:    retq
1160  %vec = load <4 x i64>, <4 x i64>* %vp
1161  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
1162  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1163  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
1164  ret <4 x i64> %res
1165}
1166
1167define <4 x i64> @test_masked_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
1168; CHECK-LABEL: test_masked_4xi64_perm_mem_mask1:
1169; CHECK:       # %bb.0:
1170; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
1171; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1]
1172; CHECK-NEXT:    retq
1173  %vec = load <4 x i64>, <4 x i64>* %vp
1174  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
1175  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1176  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
1177  ret <4 x i64> %res
1178}
1179
1180define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %mask) {
1181; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask1:
1182; CHECK:       # %bb.0:
1183; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
1184; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1]
1185; CHECK-NEXT:    retq
1186  %vec = load <4 x i64>, <4 x i64>* %vp
1187  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
1188  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1189  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
1190  ret <4 x i64> %res
1191}
1192
1193define <4 x i64> @test_masked_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
1194; CHECK-LABEL: test_masked_4xi64_perm_mem_mask2:
1195; CHECK:       # %bb.0:
1196; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
1197; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0]
1198; CHECK-NEXT:    retq
1199  %vec = load <4 x i64>, <4 x i64>* %vp
1200  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
1201  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1202  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
1203  ret <4 x i64> %res
1204}
1205
1206define <4 x i64> @test_masked_z_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %mask) {
1207; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask2:
1208; CHECK:       # %bb.0:
1209; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
1210; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0]
1211; CHECK-NEXT:    retq
1212  %vec = load <4 x i64>, <4 x i64>* %vp
1213  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
1214  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1215  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
1216  ret <4 x i64> %res
1217}
1218
1219define <4 x i64> @test_4xi64_perm_mem_mask3(<4 x i64>* %vp) {
1220; CHECK-LABEL: test_4xi64_perm_mem_mask3:
1221; CHECK:       # %bb.0:
1222; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,0,1,3]
1223; CHECK-NEXT:    retq
1224  %vec = load <4 x i64>, <4 x i64>* %vp
1225  %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
1226  ret <4 x i64> %res
1227}
1228define <4 x i64> @test_masked_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
1229; CHECK-LABEL: test_masked_4xi64_perm_mem_mask3:
1230; CHECK:       # %bb.0:
1231; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
1232; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3]
1233; CHECK-NEXT:    retq
1234  %vec = load <4 x i64>, <4 x i64>* %vp
1235  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
1236  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1237  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
1238  ret <4 x i64> %res
1239}
1240
1241define <4 x i64> @test_masked_z_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %mask) {
1242; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask3:
1243; CHECK:       # %bb.0:
1244; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
1245; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3]
1246; CHECK-NEXT:    retq
1247  %vec = load <4 x i64>, <4 x i64>* %vp
1248  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
1249  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1250  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
1251  ret <4 x i64> %res
1252}
1253
1254define <8 x i64> @test_8xi64_perm_mask0(<8 x i64> %vec) {
1255; CHECK-LABEL: test_8xi64_perm_mask0:
1256; CHECK:       # %bb.0:
1257; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6]
1258; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
1259; CHECK-NEXT:    retq
1260  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
1261  ret <8 x i64> %res
1262}
1263define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
1264; CHECK-LABEL: test_masked_8xi64_perm_mask0:
1265; CHECK:       # %bb.0:
1266; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6]
1267; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
1268; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
1269; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
1270; CHECK-NEXT:    retq
1271  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
1272  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1273  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
1274  ret <8 x i64> %res
1275}
1276
1277define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %mask) {
1278; CHECK-LABEL: test_masked_z_8xi64_perm_mask0:
1279; CHECK:       # %bb.0:
1280; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6]
1281; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
1282; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
1283; CHECK-NEXT:    retq
1284  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
1285  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1286  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
1287  ret <8 x i64> %res
1288}
1289define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
1290; CHECK-LABEL: test_masked_8xi64_perm_imm_mask1:
1291; CHECK:       # %bb.0:
1292; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
1293; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5]
1294; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
1295; CHECK-NEXT:    retq
1296  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
1297  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1298  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
1299  ret <8 x i64> %res
1300}
1301
1302define <8 x i64> @test_masked_z_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %mask) {
1303; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask1:
1304; CHECK:       # %bb.0:
1305; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
1306; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5]
1307; CHECK-NEXT:    retq
1308  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
1309  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1310  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
1311  ret <8 x i64> %res
1312}
1313define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
1314; CHECK-LABEL: test_masked_8xi64_perm_mask2:
1315; CHECK:       # %bb.0:
1316; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1]
1317; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
1318; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
1319; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
1320; CHECK-NEXT:    retq
1321  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
1322  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1323  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
1324  ret <8 x i64> %res
1325}
1326
1327define <8 x i64> @test_masked_z_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %mask) {
1328; CHECK-LABEL: test_masked_z_8xi64_perm_mask2:
1329; CHECK:       # %bb.0:
1330; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1]
1331; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
1332; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
1333; CHECK-NEXT:    retq
1334  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
1335  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1336  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
1337  ret <8 x i64> %res
1338}
1339define <8 x i64> @test_8xi64_perm_imm_mask3(<8 x i64> %vec) {
1340; CHECK-LABEL: test_8xi64_perm_imm_mask3:
1341; CHECK:       # %bb.0:
1342; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,1,3,1,7,5,7,5]
1343; CHECK-NEXT:    retq
1344  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
1345  ret <8 x i64> %res
1346}
1347define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
1348; CHECK-LABEL: test_masked_8xi64_perm_imm_mask3:
1349; CHECK:       # %bb.0:
1350; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
1351; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5]
1352; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
1353; CHECK-NEXT:    retq
1354  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
1355  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1356  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
1357  ret <8 x i64> %res
1358}
1359
1360define <8 x i64> @test_masked_z_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %mask) {
1361; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask3:
1362; CHECK:       # %bb.0:
1363; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
1364; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5]
1365; CHECK-NEXT:    retq
1366  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
1367  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1368  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
1369  ret <8 x i64> %res
1370}
1371define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
1372; CHECK-LABEL: test_masked_8xi64_perm_mask4:
1373; CHECK:       # %bb.0:
1374; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3]
1375; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
1376; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
1377; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
1378; CHECK-NEXT:    retq
1379  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
1380  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1381  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
1382  ret <8 x i64> %res
1383}
1384
1385define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %mask) {
1386; CHECK-LABEL: test_masked_z_8xi64_perm_mask4:
1387; CHECK:       # %bb.0:
1388; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3]
1389; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
1390; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
1391; CHECK-NEXT:    retq
1392  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
1393  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1394  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
1395  ret <8 x i64> %res
1396}
1397define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
1398; CHECK-LABEL: test_masked_8xi64_perm_imm_mask5:
1399; CHECK:       # %bb.0:
1400; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
1401; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4]
1402; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
1403; CHECK-NEXT:    retq
1404  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
1405  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1406  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
1407  ret <8 x i64> %res
1408}
1409
1410define <8 x i64> @test_masked_z_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %mask) {
1411; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask5:
1412; CHECK:       # %bb.0:
1413; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
1414; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
1415; CHECK-NEXT:    retq
1416  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
1417  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1418  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
1419  ret <8 x i64> %res
1420}
1421define <8 x i64> @test_8xi64_perm_mask6(<8 x i64> %vec) {
1422; CHECK-LABEL: test_8xi64_perm_mask6:
1423; CHECK:       # %bb.0:
1424; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7]
1425; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
1426; CHECK-NEXT:    retq
1427  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
1428  ret <8 x i64> %res
1429}
1430define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
1431; CHECK-LABEL: test_masked_8xi64_perm_mask6:
1432; CHECK:       # %bb.0:
1433; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7]
1434; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
1435; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
1436; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
1437; CHECK-NEXT:    retq
1438  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
1439  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1440  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
1441  ret <8 x i64> %res
1442}
1443
1444define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %mask) {
1445; CHECK-LABEL: test_masked_z_8xi64_perm_mask6:
1446; CHECK:       # %bb.0:
1447; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7]
1448; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
1449; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
1450; CHECK-NEXT:    retq
1451  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
1452  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1453  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
1454  ret <8 x i64> %res
1455}
1456define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
1457; CHECK-LABEL: test_masked_8xi64_perm_imm_mask7:
1458; CHECK:       # %bb.0:
1459; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
1460; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7]
1461; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
1462; CHECK-NEXT:    retq
1463  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
1464  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1465  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
1466  ret <8 x i64> %res
1467}
1468
1469define <8 x i64> @test_masked_z_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %mask) {
1470; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask7:
1471; CHECK:       # %bb.0:
1472; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
1473; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7]
1474; CHECK-NEXT:    retq
1475  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
1476  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1477  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
1478  ret <8 x i64> %res
1479}
1480define <8 x i64> @test_8xi64_perm_mem_mask0(<8 x i64>* %vp) {
1481; CHECK-LABEL: test_8xi64_perm_mem_mask0:
1482; CHECK:       # %bb.0:
1483; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3]
1484; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
1485; CHECK-NEXT:    retq
1486  %vec = load <8 x i64>, <8 x i64>* %vp
1487  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
1488  ret <8 x i64> %res
1489}
1490define <8 x i64> @test_masked_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
1491; CHECK-LABEL: test_masked_8xi64_perm_mem_mask0:
1492; CHECK:       # %bb.0:
1493; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3]
1494; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
1495; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
1496; CHECK-NEXT:    retq
1497  %vec = load <8 x i64>, <8 x i64>* %vp
1498  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
1499  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1500  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
1501  ret <8 x i64> %res
1502}
1503
1504define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %mask) {
1505; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask0:
1506; CHECK:       # %bb.0:
1507; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3]
1508; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
1509; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
1510; CHECK-NEXT:    retq
1511  %vec = load <8 x i64>, <8 x i64>* %vp
1512  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
1513  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1514  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
1515  ret <8 x i64> %res
1516}
1517
1518define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
1519; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
1520; CHECK:       # %bb.0:
1521; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
1522; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4]
1523; CHECK-NEXT:    retq
1524  %vec = load <8 x i64>, <8 x i64>* %vp
1525  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 0, i32 5, i32 5, i32 5, i32 4>
1526  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1527  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
1528  ret <8 x i64> %res
1529}
1530
1531define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %mask) {
1532; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
1533; CHECK:       # %bb.0:
1534; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
1535; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4]
1536; CHECK-NEXT:    retq
1537  %vec = load <8 x i64>, <8 x i64>* %vp
1538  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 0, i32 5, i32 5, i32 5, i32 4>
1539  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1540  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
1541  ret <8 x i64> %res
1542}
1543
1544define <8 x i64> @test_masked_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
1545; CHECK-LABEL: test_masked_8xi64_perm_mem_mask2:
1546; CHECK:       # %bb.0:
1547; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5]
1548; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
1549; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
1550; CHECK-NEXT:    retq
1551  %vec = load <8 x i64>, <8 x i64>* %vp
1552  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 1, i32 4, i32 1, i32 1, i32 5, i32 5>
1553  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1554  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
1555  ret <8 x i64> %res
1556}
1557
1558define <8 x i64> @test_masked_z_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %mask) {
1559; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask2:
1560; CHECK:       # %bb.0:
1561; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5]
1562; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
1563; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
1564; CHECK-NEXT:    retq
1565  %vec = load <8 x i64>, <8 x i64>* %vp
1566  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 1, i32 4, i32 1, i32 1, i32 5, i32 5>
1567  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1568  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
1569  ret <8 x i64> %res
1570}
1571
1572define <8 x i64> @test_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp) {
1573; CHECK-LABEL: test_8xi64_perm_imm_mem_mask3:
1574; CHECK:       # %bb.0:
1575; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5]
1576; CHECK-NEXT:    retq
1577  %vec = load <8 x i64>, <8 x i64>* %vp
1578  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
1579  ret <8 x i64> %res
1580}
1581define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
1582; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
1583; CHECK:       # %bb.0:
1584; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
1585; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5]
1586; CHECK-NEXT:    retq
1587  %vec = load <8 x i64>, <8 x i64>* %vp
1588  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
1589  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1590  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
1591  ret <8 x i64> %res
1592}
1593
1594define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %mask) {
1595; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
1596; CHECK:       # %bb.0:
1597; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
1598; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5]
1599; CHECK-NEXT:    retq
1600  %vec = load <8 x i64>, <8 x i64>* %vp
1601  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
1602  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1603  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
1604  ret <8 x i64> %res
1605}
1606
1607define <8 x i64> @test_masked_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
1608; CHECK-LABEL: test_masked_8xi64_perm_mem_mask4:
1609; CHECK:       # %bb.0:
1610; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6]
1611; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
1612; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
1613; CHECK-NEXT:    retq
1614  %vec = load <8 x i64>, <8 x i64>* %vp
1615  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 0, i32 7, i32 0, i32 3, i32 5, i32 0, i32 6>
1616  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1617  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
1618  ret <8 x i64> %res
1619}
1620
1621define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %mask) {
1622; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask4:
1623; CHECK:       # %bb.0:
1624; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6]
1625; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
1626; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
1627; CHECK-NEXT:    retq
1628  %vec = load <8 x i64>, <8 x i64>* %vp
1629  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 0, i32 7, i32 0, i32 3, i32 5, i32 0, i32 6>
1630  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1631  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
1632  ret <8 x i64> %res
1633}
1634
1635define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
1636; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
1637; CHECK:       # %bb.0:
1638; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
1639; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4]
1640; CHECK-NEXT:    retq
1641  %vec = load <8 x i64>, <8 x i64>* %vp
1642  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 0, i32 0, i32 7, i32 5, i32 4, i32 4>
1643  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1644  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
1645  ret <8 x i64> %res
1646}
1647
1648define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %mask) {
1649; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
1650; CHECK:       # %bb.0:
1651; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
1652; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4]
1653; CHECK-NEXT:    retq
1654  %vec = load <8 x i64>, <8 x i64>* %vp
1655  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 0, i32 0, i32 7, i32 5, i32 4, i32 4>
1656  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1657  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
1658  ret <8 x i64> %res
1659}
1660
1661define <8 x i64> @test_8xi64_perm_mem_mask6(<8 x i64>* %vp) {
1662; CHECK-LABEL: test_8xi64_perm_mem_mask6:
1663; CHECK:       # %bb.0:
1664; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6]
1665; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
1666; CHECK-NEXT:    retq
1667  %vec = load <8 x i64>, <8 x i64>* %vp
1668  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
1669  ret <8 x i64> %res
1670}
1671define <8 x i64> @test_masked_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
1672; CHECK-LABEL: test_masked_8xi64_perm_mem_mask6:
1673; CHECK:       # %bb.0:
1674; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6]
1675; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
1676; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
1677; CHECK-NEXT:    retq
1678  %vec = load <8 x i64>, <8 x i64>* %vp
1679  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
1680  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1681  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
1682  ret <8 x i64> %res
1683}
1684
1685define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %mask) {
1686; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask6:
1687; CHECK:       # %bb.0:
1688; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6]
1689; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
1690; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
1691; CHECK-NEXT:    retq
1692  %vec = load <8 x i64>, <8 x i64>* %vp
1693  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
1694  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1695  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
1696  ret <8 x i64> %res
1697}
1698
1699define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
1700; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
1701; CHECK:       # %bb.0:
1702; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
1703; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5]
1704; CHECK-NEXT:    retq
1705  %vec = load <8 x i64>, <8 x i64>* %vp
1706  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 1, i32 7, i32 4, i32 4, i32 5>
1707  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1708  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
1709  ret <8 x i64> %res
1710}
1711
1712define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %mask) {
1713; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
1714; CHECK:       # %bb.0:
1715; CHECK-NEXT:    vptestnmq %zmm0, %zmm0, %k1
1716; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5]
1717; CHECK-NEXT:    retq
1718  %vec = load <8 x i64>, <8 x i64>* %vp
1719  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 1, i32 7, i32 4, i32 4, i32 5>
1720  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1721  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
1722  ret <8 x i64> %res
1723}
1724
1725define <8 x float> @test_8xfloat_perm_mask0(<8 x float> %vec) {
1726; CHECK-LABEL: test_8xfloat_perm_mask0:
1727; CHECK:       # %bb.0:
1728; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4]
1729; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1730; CHECK-NEXT:    retq
1731  %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
1732  ret <8 x float> %res
1733}
1734define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
1735; CHECK-LABEL: test_masked_8xfloat_perm_mask0:
1736; CHECK:       # %bb.0:
1737; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4]
1738; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
1739; CHECK-NEXT:    vcmpeqps %ymm4, %ymm2, %k1
1740; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1}
1741; CHECK-NEXT:    vmovaps %ymm1, %ymm0
1742; CHECK-NEXT:    retq
1743  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
1744  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1745  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
1746  ret <8 x float> %res
1747}
1748
1749define <8 x float> @test_masked_z_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> %mask) {
1750; CHECK-LABEL: test_masked_z_8xfloat_perm_mask0:
1751; CHECK:       # %bb.0:
1752; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4]
1753; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
1754; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
1755; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
1756; CHECK-NEXT:    retq
1757  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
1758  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1759  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
1760  ret <8 x float> %res
1761}
1762define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
1763; CHECK-LABEL: test_masked_8xfloat_perm_mask1:
1764; CHECK:       # %bb.0:
1765; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1]
1766; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
1767; CHECK-NEXT:    vcmpeqps %ymm4, %ymm2, %k1
1768; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1}
1769; CHECK-NEXT:    vmovaps %ymm1, %ymm0
1770; CHECK-NEXT:    retq
1771  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1>
1772  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1773  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
1774  ret <8 x float> %res
1775}
1776
1777define <8 x float> @test_masked_z_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> %mask) {
1778; CHECK-LABEL: test_masked_z_8xfloat_perm_mask1:
1779; CHECK:       # %bb.0:
1780; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1]
1781; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
1782; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
1783; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
1784; CHECK-NEXT:    retq
1785  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1>
1786  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1787  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
1788  ret <8 x float> %res
1789}
1790define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
1791; CHECK-LABEL: test_masked_8xfloat_perm_mask2:
1792; CHECK:       # %bb.0:
1793; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5]
1794; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
1795; CHECK-NEXT:    vcmpeqps %ymm4, %ymm2, %k1
1796; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1}
1797; CHECK-NEXT:    vmovaps %ymm1, %ymm0
1798; CHECK-NEXT:    retq
1799  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5>
1800  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1801  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
1802  ret <8 x float> %res
1803}
1804
1805define <8 x float> @test_masked_z_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> %mask) {
1806; CHECK-LABEL: test_masked_z_8xfloat_perm_mask2:
1807; CHECK:       # %bb.0:
1808; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5]
1809; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
1810; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
1811; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
1812; CHECK-NEXT:    retq
1813  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5>
1814  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1815  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
1816  ret <8 x float> %res
1817}
1818define <8 x float> @test_8xfloat_perm_mask3(<8 x float> %vec) {
1819; CHECK-LABEL: test_8xfloat_perm_mask3:
1820; CHECK:       # %bb.0:
1821; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6]
1822; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1823; CHECK-NEXT:    retq
1824  %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
1825  ret <8 x float> %res
1826}
1827define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
1828; CHECK-LABEL: test_masked_8xfloat_perm_mask3:
1829; CHECK:       # %bb.0:
1830; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6]
1831; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
1832; CHECK-NEXT:    vcmpeqps %ymm4, %ymm2, %k1
1833; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1}
1834; CHECK-NEXT:    vmovaps %ymm1, %ymm0
1835; CHECK-NEXT:    retq
1836  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
1837  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1838  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
1839  ret <8 x float> %res
1840}
1841
1842define <8 x float> @test_masked_z_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> %mask) {
1843; CHECK-LABEL: test_masked_z_8xfloat_perm_mask3:
1844; CHECK:       # %bb.0:
1845; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6]
1846; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
1847; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
1848; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
1849; CHECK-NEXT:    retq
1850  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
1851  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1852  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
1853  ret <8 x float> %res
1854}
1855define <8 x float> @test_8xfloat_perm_mem_mask0(<8 x float>* %vp) {
1856; CHECK-LABEL: test_8xfloat_perm_mem_mask0:
1857; CHECK:       # %bb.0:
1858; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0]
1859; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0
1860; CHECK-NEXT:    retq
1861  %vec = load <8 x float>, <8 x float>* %vp
1862  %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
1863  ret <8 x float> %res
1864}
1865define <8 x float> @test_masked_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
1866; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask0:
1867; CHECK:       # %bb.0:
1868; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0]
1869; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
1870; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
1871; CHECK-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1}
1872; CHECK-NEXT:    retq
1873  %vec = load <8 x float>, <8 x float>* %vp
1874  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
1875  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1876  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
1877  ret <8 x float> %res
1878}
1879
1880define <8 x float> @test_masked_z_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x float> %mask) {
1881; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask0:
1882; CHECK:       # %bb.0:
1883; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0]
1884; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1885; CHECK-NEXT:    vcmpeqps %ymm2, %ymm0, %k1
1886; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
1887; CHECK-NEXT:    retq
1888  %vec = load <8 x float>, <8 x float>* %vp
1889  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
1890  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1891  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
1892  ret <8 x float> %res
1893}
1894
1895define <8 x float> @test_masked_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
1896; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask1:
1897; CHECK:       # %bb.0:
1898; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6]
1899; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
1900; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
1901; CHECK-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1}
1902; CHECK-NEXT:    retq
1903  %vec = load <8 x float>, <8 x float>* %vp
1904  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 4, i32 0, i32 6, i32 6, i32 6>
1905  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1906  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
1907  ret <8 x float> %res
1908}
1909
1910define <8 x float> @test_masked_z_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x float> %mask) {
1911; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask1:
1912; CHECK:       # %bb.0:
1913; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6]
1914; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1915; CHECK-NEXT:    vcmpeqps %ymm2, %ymm0, %k1
1916; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
1917; CHECK-NEXT:    retq
1918  %vec = load <8 x float>, <8 x float>* %vp
1919  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 4, i32 0, i32 6, i32 6, i32 6>
1920  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1921  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
1922  ret <8 x float> %res
1923}
1924
1925define <8 x float> @test_masked_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
1926; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask2:
1927; CHECK:       # %bb.0:
1928; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4]
1929; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
1930; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
1931; CHECK-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1}
1932; CHECK-NEXT:    retq
1933  %vec = load <8 x float>, <8 x float>* %vp
1934  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 1, i32 5, i32 6, i32 6, i32 2, i32 4>
1935  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1936  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
1937  ret <8 x float> %res
1938}
1939
1940define <8 x float> @test_masked_z_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x float> %mask) {
1941; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask2:
1942; CHECK:       # %bb.0:
1943; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4]
1944; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1945; CHECK-NEXT:    vcmpeqps %ymm2, %ymm0, %k1
1946; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
1947; CHECK-NEXT:    retq
1948  %vec = load <8 x float>, <8 x float>* %vp
1949  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 1, i32 5, i32 6, i32 6, i32 2, i32 4>
1950  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1951  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
1952  ret <8 x float> %res
1953}
1954
1955define <8 x float> @test_8xfloat_perm_mem_mask3(<8 x float>* %vp) {
1956; CHECK-LABEL: test_8xfloat_perm_mem_mask3:
1957; CHECK:       # %bb.0:
1958; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0]
1959; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0
1960; CHECK-NEXT:    retq
1961  %vec = load <8 x float>, <8 x float>* %vp
1962  %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
1963  ret <8 x float> %res
1964}
1965define <8 x float> @test_masked_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
1966; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask3:
1967; CHECK:       # %bb.0:
1968; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0]
1969; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
1970; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
1971; CHECK-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1}
1972; CHECK-NEXT:    retq
1973  %vec = load <8 x float>, <8 x float>* %vp
1974  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
1975  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1976  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
1977  ret <8 x float> %res
1978}
1979
1980define <8 x float> @test_masked_z_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x float> %mask) {
1981; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask3:
1982; CHECK:       # %bb.0:
1983; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0]
1984; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1985; CHECK-NEXT:    vcmpeqps %ymm2, %ymm0, %k1
1986; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
1987; CHECK-NEXT:    retq
1988  %vec = load <8 x float>, <8 x float>* %vp
1989  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
1990  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1991  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
1992  ret <8 x float> %res
1993}
1994
1995define <16 x float> @test_16xfloat_perm_mask0(<16 x float> %vec) {
1996; CHECK-LABEL: test_16xfloat_perm_mask0:
1997; CHECK:       # %bb.0:
1998; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7]
1999; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
2000; CHECK-NEXT:    retq
2001  %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
2002  ret <16 x float> %res
2003}
2004define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
2005; CHECK-LABEL: test_masked_16xfloat_perm_mask0:
2006; CHECK:       # %bb.0:
2007; CHECK-NEXT:    vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7]
2008; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
2009; CHECK-NEXT:    vcmpeqps %zmm4, %zmm2, %k1
2010; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
2011; CHECK-NEXT:    vmovaps %zmm1, %zmm0
2012; CHECK-NEXT:    retq
2013  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
2014  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
2015  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
2016  ret <16 x float> %res
2017}
2018
2019define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %mask) {
2020; CHECK-LABEL: test_masked_z_16xfloat_perm_mask0:
2021; CHECK:       # %bb.0:
2022; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7]
2023; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2024; CHECK-NEXT:    vcmpeqps %zmm3, %zmm1, %k1
2025; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
2026; CHECK-NEXT:    retq
2027  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
2028  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
2029  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
2030  ret <16 x float> %res
2031}
2032define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
2033; CHECK-LABEL: test_masked_16xfloat_perm_mask1:
2034; CHECK:       # %bb.0:
2035; CHECK-NEXT:    vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1]
2036; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
2037; CHECK-NEXT:    vcmpeqps %zmm4, %zmm2, %k1
2038; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
2039; CHECK-NEXT:    vmovaps %zmm1, %zmm0
2040; CHECK-NEXT:    retq
2041  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 4, i32 10, i32 4, i32 5, i32 8, i32 11, i32 2, i32 0, i32 10, i32 0, i32 0, i32 3, i32 10, i32 1>
2042  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
2043  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
2044  ret <16 x float> %res
2045}
2046
2047define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec, <16 x float> %mask) {
2048; CHECK-LABEL: test_masked_z_16xfloat_perm_mask1:
2049; CHECK:       # %bb.0:
2050; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1]
2051; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2052; CHECK-NEXT:    vcmpeqps %zmm3, %zmm1, %k1
2053; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
2054; CHECK-NEXT:    retq
2055  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 4, i32 10, i32 4, i32 5, i32 8, i32 11, i32 2, i32 0, i32 10, i32 0, i32 0, i32 3, i32 10, i32 1>
2056  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
2057  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
2058  ret <16 x float> %res
2059}
2060define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
2061; CHECK-LABEL: test_masked_16xfloat_perm_mask2:
2062; CHECK:       # %bb.0:
2063; CHECK-NEXT:    vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11]
2064; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
2065; CHECK-NEXT:    vcmpeqps %zmm4, %zmm2, %k1
2066; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
2067; CHECK-NEXT:    vmovaps %zmm1, %zmm0
2068; CHECK-NEXT:    retq
2069  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 15, i32 6, i32 14, i32 3, i32 6, i32 5, i32 2, i32 5, i32 15, i32 11, i32 6, i32 6, i32 4, i32 8, i32 11>
2070  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
2071  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
2072  ret <16 x float> %res
2073}
2074
2075define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %mask) {
2076; CHECK-LABEL: test_masked_z_16xfloat_perm_mask2:
2077; CHECK:       # %bb.0:
2078; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11]
2079; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2080; CHECK-NEXT:    vcmpeqps %zmm3, %zmm1, %k1
2081; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
2082; CHECK-NEXT:    retq
2083  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 15, i32 6, i32 14, i32 3, i32 6, i32 5, i32 2, i32 5, i32 15, i32 11, i32 6, i32 6, i32 4, i32 8, i32 11>
2084  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
2085  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
2086  ret <16 x float> %res
2087}
2088define <16 x float> @test_16xfloat_perm_mask3(<16 x float> %vec) {
2089; CHECK-LABEL: test_16xfloat_perm_mask3:
2090; CHECK:       # %bb.0:
2091; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3]
2092; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
2093; CHECK-NEXT:    retq
2094  %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
2095  ret <16 x float> %res
2096}
2097define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
2098; CHECK-LABEL: test_masked_16xfloat_perm_mask3:
2099; CHECK:       # %bb.0:
2100; CHECK-NEXT:    vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3]
2101; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
2102; CHECK-NEXT:    vcmpeqps %zmm4, %zmm2, %k1
2103; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
2104; CHECK-NEXT:    vmovaps %zmm1, %zmm0
2105; CHECK-NEXT:    retq
2106  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
2107  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
2108  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
2109  ret <16 x float> %res
2110}
2111
2112define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec, <16 x float> %mask) {
2113; CHECK-LABEL: test_masked_z_16xfloat_perm_mask3:
2114; CHECK:       # %bb.0:
2115; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3]
2116; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2117; CHECK-NEXT:    vcmpeqps %zmm3, %zmm1, %k1
2118; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
2119; CHECK-NEXT:    retq
2120  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
2121  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
2122  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
2123  ret <16 x float> %res
2124}
2125define <16 x float> @test_16xfloat_perm_mem_mask0(<16 x float>* %vp) {
2126; CHECK-LABEL: test_16xfloat_perm_mem_mask0:
2127; CHECK:       # %bb.0:
2128; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1]
2129; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0
2130; CHECK-NEXT:    retq
2131  %vec = load <16 x float>, <16 x float>* %vp
2132  %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
2133  ret <16 x float> %res
2134}
2135define <16 x float> @test_masked_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
2136; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask0:
2137; CHECK:       # %bb.0:
2138; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1]
2139; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2140; CHECK-NEXT:    vcmpeqps %zmm3, %zmm1, %k1
2141; CHECK-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1}
2142; CHECK-NEXT:    retq
2143  %vec = load <16 x float>, <16 x float>* %vp
2144  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
2145  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
2146  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
2147  ret <16 x float> %res
2148}
2149
2150define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x float> %mask) {
2151; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
2152; CHECK:       # %bb.0:
2153; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1]
2154; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
2155; CHECK-NEXT:    vcmpeqps %zmm2, %zmm0, %k1
2156; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
2157; CHECK-NEXT:    retq
2158  %vec = load <16 x float>, <16 x float>* %vp
2159  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
2160  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
2161  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
2162  ret <16 x float> %res
2163}
2164
2165define <16 x float> @test_masked_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
2166; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask1:
2167; CHECK:       # %bb.0:
2168; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4]
2169; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2170; CHECK-NEXT:    vcmpeqps %zmm3, %zmm1, %k1
2171; CHECK-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1}
2172; CHECK-NEXT:    retq
2173  %vec = load <16 x float>, <16 x float>* %vp
2174  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 4, i32 2, i32 3, i32 5, i32 11, i32 6, i32 4, i32 7, i32 6, i32 4, i32 14, i32 8, i32 15, i32 12, i32 9, i32 4>
2175  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
2176  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
2177  ret <16 x float> %res
2178}
2179
2180define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x float> %mask) {
2181; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask1:
2182; CHECK:       # %bb.0:
2183; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4]
2184; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
2185; CHECK-NEXT:    vcmpeqps %zmm2, %zmm0, %k1
2186; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
2187; CHECK-NEXT:    retq
2188  %vec = load <16 x float>, <16 x float>* %vp
2189  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 4, i32 2, i32 3, i32 5, i32 11, i32 6, i32 4, i32 7, i32 6, i32 4, i32 14, i32 8, i32 15, i32 12, i32 9, i32 4>
2190  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
2191  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
2192  ret <16 x float> %res
2193}
2194
2195define <16 x float> @test_masked_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
2196; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask2:
2197; CHECK:       # %bb.0:
2198; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5]
2199; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2200; CHECK-NEXT:    vcmpeqps %zmm3, %zmm1, %k1
2201; CHECK-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1}
2202; CHECK-NEXT:    retq
2203  %vec = load <16 x float>, <16 x float>* %vp
2204  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 11, i32 6, i32 7, i32 0, i32 11, i32 0, i32 10, i32 9, i32 12, i32 4, i32 10, i32 3, i32 8, i32 5>
2205  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
2206  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
2207  ret <16 x float> %res
2208}
2209
2210define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x float> %mask) {
2211; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
2212; CHECK:       # %bb.0:
2213; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5]
2214; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
2215; CHECK-NEXT:    vcmpeqps %zmm2, %zmm0, %k1
2216; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
2217; CHECK-NEXT:    retq
2218  %vec = load <16 x float>, <16 x float>* %vp
2219  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 11, i32 6, i32 7, i32 0, i32 11, i32 0, i32 10, i32 9, i32 12, i32 4, i32 10, i32 3, i32 8, i32 5>
2220  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
2221  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
2222  ret <16 x float> %res
2223}
2224
2225define <16 x float> @test_16xfloat_perm_mem_mask3(<16 x float>* %vp) {
2226; CHECK-LABEL: test_16xfloat_perm_mem_mask3:
2227; CHECK:       # %bb.0:
2228; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0]
2229; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0
2230; CHECK-NEXT:    retq
2231  %vec = load <16 x float>, <16 x float>* %vp
2232  %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
2233  ret <16 x float> %res
2234}
2235define <16 x float> @test_masked_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
2236; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask3:
2237; CHECK:       # %bb.0:
2238; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0]
2239; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2240; CHECK-NEXT:    vcmpeqps %zmm3, %zmm1, %k1
2241; CHECK-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1}
2242; CHECK-NEXT:    retq
2243  %vec = load <16 x float>, <16 x float>* %vp
2244  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
2245  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
2246  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
2247  ret <16 x float> %res
2248}
2249
2250define <16 x float> @test_masked_z_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x float> %mask) {
2251; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask3:
2252; CHECK:       # %bb.0:
2253; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0]
2254; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
2255; CHECK-NEXT:    vcmpeqps %zmm2, %zmm0, %k1
2256; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
2257; CHECK-NEXT:    retq
2258  %vec = load <16 x float>, <16 x float>* %vp
2259  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
2260  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
2261  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
2262  ret <16 x float> %res
2263}
2264
2265define <4 x double> @test_4xdouble_perm_mask0(<4 x double> %vec) {
2266; CHECK-LABEL: test_4xdouble_perm_mask0:
2267; CHECK:       # %bb.0:
2268; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,2]
2269; CHECK-NEXT:    retq
2270  %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
2271  ret <4 x double> %res
2272}
2273define <4 x double> @test_masked_4xdouble_perm_mask0(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
2274; CHECK-LABEL: test_masked_4xdouble_perm_mask0:
2275; CHECK:       # %bb.0:
2276; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
2277; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
2278; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2]
2279; CHECK-NEXT:    vmovapd %ymm1, %ymm0
2280; CHECK-NEXT:    retq
2281  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
2282  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
2283  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
2284  ret <4 x double> %res
2285}
2286
2287define <4 x double> @test_masked_z_4xdouble_perm_mask0(<4 x double> %vec, <4 x double> %mask) {
2288; CHECK-LABEL: test_masked_z_4xdouble_perm_mask0:
2289; CHECK:       # %bb.0:
2290; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
2291; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
2292; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2]
2293; CHECK-NEXT:    retq
2294  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
2295  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
2296  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
2297  ret <4 x double> %res
2298}
2299define <4 x double> @test_masked_4xdouble_perm_mask1(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
2300; CHECK-LABEL: test_masked_4xdouble_perm_mask1:
2301; CHECK:       # %bb.0:
2302; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
2303; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
2304; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0]
2305; CHECK-NEXT:    vmovapd %ymm1, %ymm0
2306; CHECK-NEXT:    retq
2307  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
2308  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
2309  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
2310  ret <4 x double> %res
2311}
2312
2313define <4 x double> @test_masked_z_4xdouble_perm_mask1(<4 x double> %vec, <4 x double> %mask) {
2314; CHECK-LABEL: test_masked_z_4xdouble_perm_mask1:
2315; CHECK:       # %bb.0:
2316; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
2317; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
2318; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0]
2319; CHECK-NEXT:    retq
2320  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
2321  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
2322  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
2323  ret <4 x double> %res
2324}
2325define <4 x double> @test_masked_4xdouble_perm_mask2(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
2326; CHECK-LABEL: test_masked_4xdouble_perm_mask2:
2327; CHECK:       # %bb.0:
2328; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
2329; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
2330; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1]
2331; CHECK-NEXT:    vmovapd %ymm1, %ymm0
2332; CHECK-NEXT:    retq
2333  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
2334  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
2335  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
2336  ret <4 x double> %res
2337}
2338
2339define <4 x double> @test_masked_z_4xdouble_perm_mask2(<4 x double> %vec, <4 x double> %mask) {
2340; CHECK-LABEL: test_masked_z_4xdouble_perm_mask2:
2341; CHECK:       # %bb.0:
2342; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
2343; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
2344; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1]
2345; CHECK-NEXT:    retq
2346  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
2347  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
2348  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
2349  ret <4 x double> %res
2350}
2351define <4 x double> @test_4xdouble_perm_mask3(<4 x double> %vec) {
2352; CHECK-LABEL: test_4xdouble_perm_mask3:
2353; CHECK:       # %bb.0:
2354; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,2]
2355; CHECK-NEXT:    retq
2356  %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
2357  ret <4 x double> %res
2358}
2359define <4 x double> @test_masked_4xdouble_perm_mask3(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
2360; CHECK-LABEL: test_masked_4xdouble_perm_mask3:
2361; CHECK:       # %bb.0:
2362; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
2363; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
2364; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2]
2365; CHECK-NEXT:    vmovapd %ymm1, %ymm0
2366; CHECK-NEXT:    retq
2367  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
2368  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
2369  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
2370  ret <4 x double> %res
2371}
2372
2373define <4 x double> @test_masked_z_4xdouble_perm_mask3(<4 x double> %vec, <4 x double> %mask) {
2374; CHECK-LABEL: test_masked_z_4xdouble_perm_mask3:
2375; CHECK:       # %bb.0:
2376; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
2377; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
2378; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2]
2379; CHECK-NEXT:    retq
2380  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
2381  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
2382  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
2383  ret <4 x double> %res
2384}
2385define <4 x double> @test_4xdouble_perm_mem_mask0(<4 x double>* %vp) {
2386; CHECK-LABEL: test_4xdouble_perm_mem_mask0:
2387; CHECK:       # %bb.0:
2388; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = mem[0,0,2,0]
2389; CHECK-NEXT:    retq
2390  %vec = load <4 x double>, <4 x double>* %vp
2391  %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
2392  ret <4 x double> %res
2393}
2394define <4 x double> @test_masked_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
2395; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask0:
2396; CHECK:       # %bb.0:
2397; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
2398; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
2399; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0]
2400; CHECK-NEXT:    retq
2401  %vec = load <4 x double>, <4 x double>* %vp
2402  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
2403  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
2404  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
2405  ret <4 x double> %res
2406}
2407
2408define <4 x double> @test_masked_z_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x double> %mask) {
2409; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
2410; CHECK:       # %bb.0:
2411; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
2412; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
2413; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0]
2414; CHECK-NEXT:    retq
2415  %vec = load <4 x double>, <4 x double>* %vp
2416  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
2417  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
2418  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
2419  ret <4 x double> %res
2420}
2421
2422define <4 x double> @test_masked_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
2423; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask1:
2424; CHECK:       # %bb.0:
2425; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
2426; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
2427; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2]
2428; CHECK-NEXT:    retq
2429  %vec = load <4 x double>, <4 x double>* %vp
2430  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 2>
2431  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
2432  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
2433  ret <4 x double> %res
2434}
2435
2436define <4 x double> @test_masked_z_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x double> %mask) {
2437; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
2438; CHECK:       # %bb.0:
2439; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
2440; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
2441; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2]
2442; CHECK-NEXT:    retq
2443  %vec = load <4 x double>, <4 x double>* %vp
2444  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 2>
2445  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
2446  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
2447  ret <4 x double> %res
2448}
2449
2450define <4 x double> @test_masked_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
2451; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask2:
2452; CHECK:       # %bb.0:
2453; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
2454; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
2455; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1]
2456; CHECK-NEXT:    retq
2457  %vec = load <4 x double>, <4 x double>* %vp
2458  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
2459  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
2460  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
2461  ret <4 x double> %res
2462}
2463
2464define <4 x double> @test_masked_z_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x double> %mask) {
2465; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
2466; CHECK:       # %bb.0:
2467; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
2468; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
2469; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1]
2470; CHECK-NEXT:    retq
2471  %vec = load <4 x double>, <4 x double>* %vp
2472  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
2473  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
2474  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
2475  ret <4 x double> %res
2476}
2477
2478define <4 x double> @test_4xdouble_perm_mem_mask3(<4 x double>* %vp) {
2479; CHECK-LABEL: test_4xdouble_perm_mem_mask3:
2480; CHECK:       # %bb.0:
2481; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = mem[3,2,3,2]
2482; CHECK-NEXT:    retq
2483  %vec = load <4 x double>, <4 x double>* %vp
2484  %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
2485  ret <4 x double> %res
2486}
2487define <4 x double> @test_masked_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
2488; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask3:
2489; CHECK:       # %bb.0:
2490; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
2491; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
2492; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2]
2493; CHECK-NEXT:    retq
2494  %vec = load <4 x double>, <4 x double>* %vp
2495  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
2496  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
2497  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
2498  ret <4 x double> %res
2499}
2500
2501define <4 x double> @test_masked_z_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x double> %mask) {
2502; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
2503; CHECK:       # %bb.0:
2504; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
2505; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
2506; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2]
2507; CHECK-NEXT:    retq
2508  %vec = load <4 x double>, <4 x double>* %vp
2509  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
2510  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
2511  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
2512  ret <4 x double> %res
2513}
2514
2515define <8 x double> @test_8xdouble_perm_mask0(<8 x double> %vec) {
2516; CHECK-LABEL: test_8xdouble_perm_mask0:
2517; CHECK:       # %bb.0:
2518; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4]
2519; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
2520; CHECK-NEXT:    retq
2521  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
2522  ret <8 x double> %res
2523}
2524define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
2525; CHECK-LABEL: test_masked_8xdouble_perm_mask0:
2526; CHECK:       # %bb.0:
2527; CHECK-NEXT:    vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4]
2528; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
2529; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm2, %k1
2530; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
2531; CHECK-NEXT:    vmovapd %zmm1, %zmm0
2532; CHECK-NEXT:    retq
2533  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
2534  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2535  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
2536  ret <8 x double> %res
2537}
2538
2539define <8 x double> @test_masked_z_8xdouble_perm_mask0(<8 x double> %vec, <8 x double> %mask) {
2540; CHECK-LABEL: test_masked_z_8xdouble_perm_mask0:
2541; CHECK:       # %bb.0:
2542; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4]
2543; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
2544; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm1, %k1
2545; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
2546; CHECK-NEXT:    retq
2547  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
2548  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2549  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
2550  ret <8 x double> %res
2551}
2552define <8 x double> @test_masked_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
2553; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask1:
2554; CHECK:       # %bb.0:
2555; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
2556; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
2557; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6]
2558; CHECK-NEXT:    vmovapd %zmm1, %zmm0
2559; CHECK-NEXT:    retq
2560  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
2561  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2562  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
2563  ret <8 x double> %res
2564}
2565
2566define <8 x double> @test_masked_z_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x double> %mask) {
2567; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask1:
2568; CHECK:       # %bb.0:
2569; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
2570; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
2571; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6]
2572; CHECK-NEXT:    retq
2573  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
2574  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2575  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
2576  ret <8 x double> %res
2577}
2578define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
2579; CHECK-LABEL: test_masked_8xdouble_perm_mask2:
2580; CHECK:       # %bb.0:
2581; CHECK-NEXT:    vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7]
2582; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
2583; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm2, %k1
2584; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
2585; CHECK-NEXT:    vmovapd %zmm1, %zmm0
2586; CHECK-NEXT:    retq
2587  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
2588  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2589  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
2590  ret <8 x double> %res
2591}
2592
2593define <8 x double> @test_masked_z_8xdouble_perm_mask2(<8 x double> %vec, <8 x double> %mask) {
2594; CHECK-LABEL: test_masked_z_8xdouble_perm_mask2:
2595; CHECK:       # %bb.0:
2596; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7]
2597; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
2598; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm1, %k1
2599; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
2600; CHECK-NEXT:    retq
2601  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
2602  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2603  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
2604  ret <8 x double> %res
2605}
2606define <8 x double> @test_8xdouble_perm_imm_mask3(<8 x double> %vec) {
2607; CHECK-LABEL: test_8xdouble_perm_imm_mask3:
2608; CHECK:       # %bb.0:
2609; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[1,3,3,0,5,7,7,4]
2610; CHECK-NEXT:    retq
2611  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
2612  ret <8 x double> %res
2613}
2614define <8 x double> @test_masked_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
2615; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask3:
2616; CHECK:       # %bb.0:
2617; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
2618; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
2619; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4]
2620; CHECK-NEXT:    vmovapd %zmm1, %zmm0
2621; CHECK-NEXT:    retq
2622  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
2623  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2624  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
2625  ret <8 x double> %res
2626}
2627
2628define <8 x double> @test_masked_z_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x double> %mask) {
2629; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask3:
2630; CHECK:       # %bb.0:
2631; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
2632; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
2633; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4]
2634; CHECK-NEXT:    retq
2635  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
2636  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2637  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
2638  ret <8 x double> %res
2639}
2640define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
2641; CHECK-LABEL: test_masked_8xdouble_perm_mask4:
2642; CHECK:       # %bb.0:
2643; CHECK-NEXT:    vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1]
2644; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
2645; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm2, %k1
2646; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
2647; CHECK-NEXT:    vmovapd %zmm1, %zmm0
2648; CHECK-NEXT:    retq
2649  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
2650  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2651  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
2652  ret <8 x double> %res
2653}
2654
2655define <8 x double> @test_masked_z_8xdouble_perm_mask4(<8 x double> %vec, <8 x double> %mask) {
2656; CHECK-LABEL: test_masked_z_8xdouble_perm_mask4:
2657; CHECK:       # %bb.0:
2658; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1]
2659; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
2660; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm1, %k1
2661; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
2662; CHECK-NEXT:    retq
2663  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
2664  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2665  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
2666  ret <8 x double> %res
2667}
2668define <8 x double> @test_masked_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
2669; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask5:
2670; CHECK:       # %bb.0:
2671; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
2672; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
2673; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7]
2674; CHECK-NEXT:    vmovapd %zmm1, %zmm0
2675; CHECK-NEXT:    retq
2676  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
2677  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2678  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
2679  ret <8 x double> %res
2680}
2681
2682define <8 x double> @test_masked_z_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x double> %mask) {
2683; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask5:
2684; CHECK:       # %bb.0:
2685; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
2686; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
2687; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7]
2688; CHECK-NEXT:    retq
2689  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
2690  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2691  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
2692  ret <8 x double> %res
2693}
2694define <8 x double> @test_8xdouble_perm_mask6(<8 x double> %vec) {
2695; CHECK-LABEL: test_8xdouble_perm_mask6:
2696; CHECK:       # %bb.0:
2697; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2]
2698; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
2699; CHECK-NEXT:    retq
2700  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
2701  ret <8 x double> %res
2702}
2703define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
2704; CHECK-LABEL: test_masked_8xdouble_perm_mask6:
2705; CHECK:       # %bb.0:
2706; CHECK-NEXT:    vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2]
2707; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
2708; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm2, %k1
2709; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
2710; CHECK-NEXT:    vmovapd %zmm1, %zmm0
2711; CHECK-NEXT:    retq
2712  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
2713  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2714  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
2715  ret <8 x double> %res
2716}
2717
2718define <8 x double> @test_masked_z_8xdouble_perm_mask6(<8 x double> %vec, <8 x double> %mask) {
2719; CHECK-LABEL: test_masked_z_8xdouble_perm_mask6:
2720; CHECK:       # %bb.0:
2721; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2]
2722; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
2723; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm1, %k1
2724; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
2725; CHECK-NEXT:    retq
2726  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
2727  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2728  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
2729  ret <8 x double> %res
2730}
2731define <8 x double> @test_masked_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
2732; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask7:
2733; CHECK:       # %bb.0:
2734; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
2735; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
2736; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6]
2737; CHECK-NEXT:    vmovapd %zmm1, %zmm0
2738; CHECK-NEXT:    retq
2739  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
2740  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2741  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
2742  ret <8 x double> %res
2743}
2744
2745define <8 x double> @test_masked_z_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x double> %mask) {
2746; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask7:
2747; CHECK:       # %bb.0:
2748; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
2749; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
2750; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6]
2751; CHECK-NEXT:    retq
2752  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
2753  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2754  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
2755  ret <8 x double> %res
2756}
2757define <8 x double> @test_8xdouble_perm_mem_mask0(<8 x double>* %vp) {
2758; CHECK-LABEL: test_8xdouble_perm_mem_mask0:
2759; CHECK:       # %bb.0:
2760; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1]
2761; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
2762; CHECK-NEXT:    retq
2763  %vec = load <8 x double>, <8 x double>* %vp
2764  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
2765  ret <8 x double> %res
2766}
2767define <8 x double> @test_masked_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
2768; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask0:
2769; CHECK:       # %bb.0:
2770; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1]
2771; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
2772; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm1, %k1
2773; CHECK-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1}
2774; CHECK-NEXT:    retq
2775  %vec = load <8 x double>, <8 x double>* %vp
2776  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
2777  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2778  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
2779  ret <8 x double> %res
2780}
2781
2782define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x double> %mask) {
2783; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
2784; CHECK:       # %bb.0:
2785; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1]
2786; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
2787; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm0, %k1
2788; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
2789; CHECK-NEXT:    retq
2790  %vec = load <8 x double>, <8 x double>* %vp
2791  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
2792  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2793  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
2794  ret <8 x double> %res
2795}
2796
2797define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
2798; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask1:
2799; CHECK:       # %bb.0:
2800; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
2801; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
2802; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7]
2803; CHECK-NEXT:    retq
2804  %vec = load <8 x double>, <8 x double>* %vp
2805  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 2, i32 0, i32 3, i32 4, i32 6, i32 4, i32 7>
2806  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2807  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
2808  ret <8 x double> %res
2809}
2810
2811define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x double> %mask) {
2812; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1:
2813; CHECK:       # %bb.0:
2814; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
2815; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
2816; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7]
2817; CHECK-NEXT:    retq
2818  %vec = load <8 x double>, <8 x double>* %vp
2819  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 2, i32 0, i32 3, i32 4, i32 6, i32 4, i32 7>
2820  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2821  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
2822  ret <8 x double> %res
2823}
2824
2825define <8 x double> @test_masked_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
2826; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask2:
2827; CHECK:       # %bb.0:
2828; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5]
2829; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
2830; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm1, %k1
2831; CHECK-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1}
2832; CHECK-NEXT:    retq
2833  %vec = load <8 x double>, <8 x double>* %vp
2834  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 6, i32 7, i32 2, i32 7, i32 7, i32 6, i32 2, i32 5>
2835  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2836  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
2837  ret <8 x double> %res
2838}
2839
2840define <8 x double> @test_masked_z_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x double> %mask) {
2841; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
2842; CHECK:       # %bb.0:
2843; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5]
2844; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
2845; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm0, %k1
2846; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
2847; CHECK-NEXT:    retq
2848  %vec = load <8 x double>, <8 x double>* %vp
2849  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 6, i32 7, i32 2, i32 7, i32 7, i32 6, i32 2, i32 5>
2850  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2851  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
2852  ret <8 x double> %res
2853}
2854
2855define <8 x double> @test_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp) {
2856; CHECK-LABEL: test_8xdouble_perm_imm_mem_mask3:
2857; CHECK:       # %bb.0:
2858; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4]
2859; CHECK-NEXT:    retq
2860  %vec = load <8 x double>, <8 x double>* %vp
2861  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
2862  ret <8 x double> %res
2863}
2864define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
2865; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask3:
2866; CHECK:       # %bb.0:
2867; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
2868; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
2869; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4]
2870; CHECK-NEXT:    retq
2871  %vec = load <8 x double>, <8 x double>* %vp
2872  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
2873  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2874  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
2875  ret <8 x double> %res
2876}
2877
2878define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x double> %mask) {
2879; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3:
2880; CHECK:       # %bb.0:
2881; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
2882; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
2883; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4]
2884; CHECK-NEXT:    retq
2885  %vec = load <8 x double>, <8 x double>* %vp
2886  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
2887  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2888  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
2889  ret <8 x double> %res
2890}
2891
2892define <8 x double> @test_masked_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
2893; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask4:
2894; CHECK:       # %bb.0:
2895; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0]
2896; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
2897; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm1, %k1
2898; CHECK-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1}
2899; CHECK-NEXT:    retq
2900  %vec = load <8 x double>, <8 x double>* %vp
2901  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 5, i32 6, i32 0, i32 6, i32 0>
2902  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2903  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
2904  ret <8 x double> %res
2905}
2906
2907define <8 x double> @test_masked_z_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x double> %mask) {
2908; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask4:
2909; CHECK:       # %bb.0:
2910; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0]
2911; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
2912; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm0, %k1
2913; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
2914; CHECK-NEXT:    retq
2915  %vec = load <8 x double>, <8 x double>* %vp
2916  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 5, i32 6, i32 0, i32 6, i32 0>
2917  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2918  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
2919  ret <8 x double> %res
2920}
2921
2922define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
2923; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask5:
2924; CHECK:       # %bb.0:
2925; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
2926; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
2927; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7]
2928; CHECK-NEXT:    retq
2929  %vec = load <8 x double>, <8 x double>* %vp
2930  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 3, i32 6, i32 6, i32 6, i32 7>
2931  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2932  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
2933  ret <8 x double> %res
2934}
2935
2936define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x double> %mask) {
2937; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5:
2938; CHECK:       # %bb.0:
2939; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
2940; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
2941; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7]
2942; CHECK-NEXT:    retq
2943  %vec = load <8 x double>, <8 x double>* %vp
2944  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 3, i32 6, i32 6, i32 6, i32 7>
2945  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2946  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
2947  ret <8 x double> %res
2948}
2949
2950define <8 x double> @test_8xdouble_perm_mem_mask6(<8 x double>* %vp) {
2951; CHECK-LABEL: test_8xdouble_perm_mem_mask6:
2952; CHECK:       # %bb.0:
2953; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5]
2954; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
2955; CHECK-NEXT:    retq
2956  %vec = load <8 x double>, <8 x double>* %vp
2957  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
2958  ret <8 x double> %res
2959}
2960define <8 x double> @test_masked_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
2961; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask6:
2962; CHECK:       # %bb.0:
2963; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5]
2964; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
2965; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm1, %k1
2966; CHECK-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1}
2967; CHECK-NEXT:    retq
2968  %vec = load <8 x double>, <8 x double>* %vp
2969  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
2970  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2971  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
2972  ret <8 x double> %res
2973}
2974
2975define <8 x double> @test_masked_z_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x double> %mask) {
2976; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask6:
2977; CHECK:       # %bb.0:
2978; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5]
2979; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
2980; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm0, %k1
2981; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
2982; CHECK-NEXT:    retq
2983  %vec = load <8 x double>, <8 x double>* %vp
2984  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
2985  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
2986  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
2987  ret <8 x double> %res
2988}
2989
2990define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
2991; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask7:
2992; CHECK:       # %bb.0:
2993; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
2994; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
2995; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4]
2996; CHECK-NEXT:    retq
2997  %vec = load <8 x double>, <8 x double>* %vp
2998  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
2999  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
3000  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
3001  ret <8 x double> %res
3002}
3003
3004define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x double> %mask) {
3005; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7:
3006; CHECK:       # %bb.0:
3007; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
3008; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
3009; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4]
3010; CHECK-NEXT:    retq
3011  %vec = load <8 x double>, <8 x double>* %vp
3012  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
3013  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
3014  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
3015  ret <8 x double> %res
3016}
3017
3018