1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw %s -o - | FileCheck %s
3
; Unmasked single-source byte shuffle of the <16 x i8> argument %vec with a
; constant index vector. The FileCheck assertions expect a single vpshufb that
; reads and writes xmm0.
4define <16 x i8> @test_16xi8_perm_mask0(<16 x i8> %vec) {
5; CHECK-LABEL: test_16xi8_perm_mask0:
6; CHECK:       # %bb.0:
7; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
8; CHECK-NEXT:    retq
9  %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
10  ret <16 x i8> %res
11}
; Merge-masked byte shuffle: lanes whose %mask byte is zero receive the shuffled
; byte of %vec, every other lane keeps the byte from %vec2. The assertions
; expect vptestnmb to build %k1, a %k1-masked vpshufb into xmm1 (which holds
; %vec2), and a final copy of xmm1 into the return register xmm0.
12define <16 x i8> @test_masked_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
13; CHECK-LABEL: test_masked_16xi8_perm_mask0:
14; CHECK:       # %bb.0:
15; CHECK-NEXT:    vptestnmb %xmm2, %xmm2, %k1
16; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
17; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
18; CHECK-NEXT:    retq
19  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
20  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
21  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
22  ret <16 x i8> %res
23}
24
; Zero-masked byte shuffle: lanes whose %mask byte is zero receive the shuffled
; byte of %vec, every other lane is set to zero. The assertions expect
; vptestnmb to build %k1 followed by vpshufb on xmm0 with {%k1} {z}.
25define <16 x i8> @test_masked_z_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %mask) {
26; CHECK-LABEL: test_masked_z_16xi8_perm_mask0:
27; CHECK:       # %bb.0:
28; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
29; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
30; CHECK-NEXT:    retq
31  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
32  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
33  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
34  ret <16 x i8> %res
35}
; Merge-masked byte shuffle with a second constant index pattern: lanes whose
; %mask byte is zero receive the shuffled byte of %vec, every other lane keeps
; the byte from %vec2. The assertions expect vptestnmb to build %k1, a
; %k1-masked vpshufb into xmm1, and a copy of xmm1 into xmm0.
36define <16 x i8> @test_masked_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
37; CHECK-LABEL: test_masked_16xi8_perm_mask1:
38; CHECK:       # %bb.0:
39; CHECK-NEXT:    vptestnmb %xmm2, %xmm2, %k1
40; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
41; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
42; CHECK-NEXT:    retq
43  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
44  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
45  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
46  ret <16 x i8> %res
47}
48
; Zero-masked byte shuffle with a second constant index pattern: lanes whose
; %mask byte is zero receive the shuffled byte of %vec, every other lane is set
; to zero. The assertions expect vptestnmb followed by vpshufb on xmm0 with
; {%k1} {z}.
49define <16 x i8> @test_masked_z_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %mask) {
50; CHECK-LABEL: test_masked_z_16xi8_perm_mask1:
51; CHECK:       # %bb.0:
52; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
53; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
54; CHECK-NEXT:    retq
55  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
56  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
57  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
58  ret <16 x i8> %res
59}
; Merge-masked byte shuffle with a third constant index pattern: lanes whose
; %mask byte is zero receive the shuffled byte of %vec, every other lane keeps
; the byte from %vec2. The assertions expect vptestnmb to build %k1, a
; %k1-masked vpshufb into xmm1, and a copy of xmm1 into xmm0.
60define <16 x i8> @test_masked_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
61; CHECK-LABEL: test_masked_16xi8_perm_mask2:
62; CHECK:       # %bb.0:
63; CHECK-NEXT:    vptestnmb %xmm2, %xmm2, %k1
64; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
65; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
66; CHECK-NEXT:    retq
67  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
68  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
69  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
70  ret <16 x i8> %res
71}
72
; Zero-masked byte shuffle with a third constant index pattern: lanes whose
; %mask byte is zero receive the shuffled byte of %vec, every other lane is set
; to zero. The assertions expect vptestnmb followed by vpshufb on xmm0 with
; {%k1} {z}.
73define <16 x i8> @test_masked_z_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %mask) {
74; CHECK-LABEL: test_masked_z_16xi8_perm_mask2:
75; CHECK:       # %bb.0:
76; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
77; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
78; CHECK-NEXT:    retq
79  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
80  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
81  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
82  ret <16 x i8> %res
83}
; Unmasked single-source byte shuffle of %vec with a fourth constant index
; pattern. The FileCheck assertions expect a single vpshufb that reads and
; writes xmm0.
84define <16 x i8> @test_16xi8_perm_mask3(<16 x i8> %vec) {
85; CHECK-LABEL: test_16xi8_perm_mask3:
86; CHECK:       # %bb.0:
87; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
88; CHECK-NEXT:    retq
89  %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
90  ret <16 x i8> %res
91}
; Merge-masked byte shuffle with a fourth constant index pattern: lanes whose
; %mask byte is zero receive the shuffled byte of %vec, every other lane keeps
; the byte from %vec2. The assertions expect vptestnmb to build %k1, a
; %k1-masked vpshufb into xmm1, and a copy of xmm1 into xmm0.
92define <16 x i8> @test_masked_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
93; CHECK-LABEL: test_masked_16xi8_perm_mask3:
94; CHECK:       # %bb.0:
95; CHECK-NEXT:    vptestnmb %xmm2, %xmm2, %k1
96; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
97; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
98; CHECK-NEXT:    retq
99  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
100  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
101  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
102  ret <16 x i8> %res
103}
104
; Zero-masked byte shuffle with a fourth constant index pattern: lanes whose
; %mask byte is zero receive the shuffled byte of %vec, every other lane is set
; to zero. The assertions expect vptestnmb followed by vpshufb on xmm0 with
; {%k1} {z}.
105define <16 x i8> @test_masked_z_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %mask) {
106; CHECK-LABEL: test_masked_z_16xi8_perm_mask3:
107; CHECK:       # %bb.0:
108; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
109; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
110; CHECK-NEXT:    retq
111  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
112  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
113  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
114  ret <16 x i8> %res
115}
; Unmasked byte shuffle whose <16 x i8> source is loaded from the pointer %vp.
; The assertions expect the load as a separate vmovdqa into xmm0, followed by a
; vpshufb that reads and writes xmm0.
116define <16 x i8> @test_16xi8_perm_mem_mask0(<16 x i8>* %vp) {
117; CHECK-LABEL: test_16xi8_perm_mem_mask0:
118; CHECK:       # %bb.0:
119; CHECK-NEXT:    vmovdqa (%rdi), %xmm0
120; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
121; CHECK-NEXT:    retq
122  %vec = load <16 x i8>, <16 x i8>* %vp
123  %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
124  ret <16 x i8> %res
125}
; Merge-masked byte shuffle of a vector loaded from %vp: lanes whose %mask byte
; is zero receive the shuffled byte, every other lane keeps the byte from %vec2.
; The assertions expect a vmovdqa load into xmm2, vptestnmb building %k1, and a
; %k1-masked vpshufb that writes straight into xmm0 (which holds %vec2), so no
; trailing register copy is expected.
126define <16 x i8> @test_masked_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
127; CHECK-LABEL: test_masked_16xi8_perm_mem_mask0:
128; CHECK:       # %bb.0:
129; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
130; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
131; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
132; CHECK-NEXT:    retq
133  %vec = load <16 x i8>, <16 x i8>* %vp
134  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
135  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
136  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
137  ret <16 x i8> %res
138}
139
; Zero-masked byte shuffle of a vector loaded from %vp: lanes whose %mask byte
; is zero receive the shuffled byte, every other lane is set to zero. The
; assertions expect a vmovdqa load into xmm1, vptestnmb building %k1 from xmm0,
; and vpshufb into xmm0 with {%k1} {z}.
140define <16 x i8> @test_masked_z_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %mask) {
141; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask0:
142; CHECK:       # %bb.0:
143; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
144; CHECK-NEXT:    vptestnmb %xmm0, %xmm0, %k1
145; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
146; CHECK-NEXT:    retq
147  %vec = load <16 x i8>, <16 x i8>* %vp
148  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
149  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
150  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
151  ret <16 x i8> %res
152}
153
; Merge-masked byte shuffle of a vector loaded from %vp, second index pattern:
; lanes whose %mask byte is zero receive the shuffled byte, every other lane
; keeps the byte from %vec2. The assertions expect a vmovdqa load into xmm2,
; vptestnmb building %k1, and a %k1-masked vpshufb writing xmm0.
154define <16 x i8> @test_masked_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
155; CHECK-LABEL: test_masked_16xi8_perm_mem_mask1:
156; CHECK:       # %bb.0:
157; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
158; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
159; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
160; CHECK-NEXT:    retq
161  %vec = load <16 x i8>, <16 x i8>* %vp
162  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
163  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
164  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
165  ret <16 x i8> %res
166}
167
; Zero-masked byte shuffle of a vector loaded from %vp, second index pattern:
; lanes whose %mask byte is zero receive the shuffled byte, every other lane is
; set to zero. The assertions expect a vmovdqa load into xmm1, vptestnmb
; building %k1, and vpshufb into xmm0 with {%k1} {z}.
168define <16 x i8> @test_masked_z_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %mask) {
169; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask1:
170; CHECK:       # %bb.0:
171; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
172; CHECK-NEXT:    vptestnmb %xmm0, %xmm0, %k1
173; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
174; CHECK-NEXT:    retq
175  %vec = load <16 x i8>, <16 x i8>* %vp
176  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
177  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
178  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
179  ret <16 x i8> %res
180}
181
; Merge-masked byte shuffle of a vector loaded from %vp, third index pattern:
; lanes whose %mask byte is zero receive the shuffled byte, every other lane
; keeps the byte from %vec2. The assertions expect a vmovdqa load into xmm2,
; vptestnmb building %k1, and a %k1-masked vpshufb writing xmm0.
182define <16 x i8> @test_masked_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
183; CHECK-LABEL: test_masked_16xi8_perm_mem_mask2:
184; CHECK:       # %bb.0:
185; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
186; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
187; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
188; CHECK-NEXT:    retq
189  %vec = load <16 x i8>, <16 x i8>* %vp
190  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
191  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
192  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
193  ret <16 x i8> %res
194}
195
; Zero-masked byte shuffle of a vector loaded from %vp, third index pattern:
; lanes whose %mask byte is zero receive the shuffled byte, every other lane is
; set to zero. The assertions expect a vmovdqa load into xmm1, vptestnmb
; building %k1, and vpshufb into xmm0 with {%k1} {z}.
196define <16 x i8> @test_masked_z_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %mask) {
197; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask2:
198; CHECK:       # %bb.0:
199; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
200; CHECK-NEXT:    vptestnmb %xmm0, %xmm0, %k1
201; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
202; CHECK-NEXT:    retq
203  %vec = load <16 x i8>, <16 x i8>* %vp
204  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
205  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
206  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
207  ret <16 x i8> %res
208}
209
; Unmasked byte shuffle of a <16 x i8> vector loaded from %vp, fourth index
; pattern. The assertions expect a vmovdqa load into xmm0 followed by a vpshufb
; that reads and writes xmm0.
210define <16 x i8> @test_16xi8_perm_mem_mask3(<16 x i8>* %vp) {
211; CHECK-LABEL: test_16xi8_perm_mem_mask3:
212; CHECK:       # %bb.0:
213; CHECK-NEXT:    vmovdqa (%rdi), %xmm0
214; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
215; CHECK-NEXT:    retq
216  %vec = load <16 x i8>, <16 x i8>* %vp
217  %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
218  ret <16 x i8> %res
219}
; Merge-masked byte shuffle of a vector loaded from %vp, fourth index pattern:
; lanes whose %mask byte is zero receive the shuffled byte, every other lane
; keeps the byte from %vec2. The assertions expect a vmovdqa load into xmm2,
; vptestnmb building %k1, and a %k1-masked vpshufb writing xmm0.
220define <16 x i8> @test_masked_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
221; CHECK-LABEL: test_masked_16xi8_perm_mem_mask3:
222; CHECK:       # %bb.0:
223; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
224; CHECK-NEXT:    vptestnmb %xmm1, %xmm1, %k1
225; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
226; CHECK-NEXT:    retq
227  %vec = load <16 x i8>, <16 x i8>* %vp
228  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
229  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
230  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
231  ret <16 x i8> %res
232}
233
; Zero-masked byte shuffle of a vector loaded from %vp, fourth index pattern:
; lanes whose %mask byte is zero receive the shuffled byte, every other lane is
; set to zero. The assertions expect a vmovdqa load into xmm1, vptestnmb
; building %k1, and vpshufb into xmm0 with {%k1} {z}.
234define <16 x i8> @test_masked_z_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %mask) {
235; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask3:
236; CHECK:       # %bb.0:
237; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
238; CHECK-NEXT:    vptestnmb %xmm0, %xmm0, %k1
239; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
240; CHECK-NEXT:    retq
241  %vec = load <16 x i8>, <16 x i8>* %vp
242  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
243  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
244  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
245  ret <16 x i8> %res
246}
247
; Unmasked single-source byte shuffle of the <32 x i8> argument %vec. The index
; vector never crosses a 16-byte boundary: result bytes 0-15 read source bytes
; 0-15 and result bytes 16-31 read source bytes 16-31. The assertions expect a
; single vpshufb that reads and writes ymm0.
248define <32 x i8> @test_32xi8_perm_mask0(<32 x i8> %vec) {
249; CHECK-LABEL: test_32xi8_perm_mask0:
250; CHECK:       # %bb.0:
251; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21]
252; CHECK-NEXT:    retq
253  %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
254  ret <32 x i8> %res
255}
; Merge-masked <32 x i8> byte shuffle: lanes whose %mask byte is zero receive
; the shuffled byte of %vec, every other lane keeps the byte from %vec2. The
; index vector never crosses a 16-byte boundary. The assertions expect
; vptestnmb building %k1, a %k1-masked vpshufb into ymm1, and a copy of ymm1
; into ymm0.
256define <32 x i8> @test_masked_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
257; CHECK-LABEL: test_masked_32xi8_perm_mask0:
258; CHECK:       # %bb.0:
259; CHECK-NEXT:    vptestnmb %ymm2, %ymm2, %k1
260; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21]
261; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
262; CHECK-NEXT:    retq
263  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
264  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
265  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
266  ret <32 x i8> %res
267}
268
; Zero-masked <32 x i8> byte shuffle: lanes whose %mask byte is zero receive the
; shuffled byte of %vec, every other lane is set to zero. The index vector
; never crosses a 16-byte boundary. The assertions expect vptestnmb building
; %k1 followed by vpshufb on ymm0 with {%k1} {z}.
269define <32 x i8> @test_masked_z_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %mask) {
270; CHECK-LABEL: test_masked_z_32xi8_perm_mask0:
271; CHECK:       # %bb.0:
272; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
273; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21]
274; CHECK-NEXT:    retq
275  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
276  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
277  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
278  ret <32 x i8> %res
279}
; Merge-masked <32 x i8> byte shuffle, second index pattern: lanes whose %mask
; byte is zero receive the shuffled byte of %vec, every other lane keeps the
; byte from %vec2. The index vector never crosses a 16-byte boundary. The
; assertions expect vptestnmb building %k1, a %k1-masked vpshufb into ymm1, and
; a copy of ymm1 into ymm0.
280define <32 x i8> @test_masked_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
281; CHECK-LABEL: test_masked_32xi8_perm_mask1:
282; CHECK:       # %bb.0:
283; CHECK-NEXT:    vptestnmb %ymm2, %ymm2, %k1
284; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24]
285; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
286; CHECK-NEXT:    retq
287  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
288  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
289  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
290  ret <32 x i8> %res
291}
292
; Zero-masked <32 x i8> byte shuffle, second index pattern: lanes whose %mask
; byte is zero receive the shuffled byte of %vec, every other lane is set to
; zero. The index vector never crosses a 16-byte boundary. The assertions
; expect vptestnmb building %k1 followed by vpshufb on ymm0 with {%k1} {z}.
293define <32 x i8> @test_masked_z_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %mask) {
294; CHECK-LABEL: test_masked_z_32xi8_perm_mask1:
295; CHECK:       # %bb.0:
296; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
297; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24]
298; CHECK-NEXT:    retq
299  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
300  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
301  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
302  ret <32 x i8> %res
303}
; Merge-masked <32 x i8> byte shuffle, third index pattern: lanes whose %mask
; byte is zero receive the shuffled byte of %vec, every other lane keeps the
; byte from %vec2. The index vector never crosses a 16-byte boundary. The
; assertions expect vptestnmb building %k1, a %k1-masked vpshufb into ymm1, and
; a copy of ymm1 into ymm0.
304define <32 x i8> @test_masked_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
305; CHECK-LABEL: test_masked_32xi8_perm_mask2:
306; CHECK:       # %bb.0:
307; CHECK-NEXT:    vptestnmb %ymm2, %ymm2, %k1
308; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29]
309; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
310; CHECK-NEXT:    retq
311  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
312  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
313  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
314  ret <32 x i8> %res
315}
316
; Zero-masked <32 x i8> byte shuffle, third index pattern: lanes whose %mask
; byte is zero receive the shuffled byte of %vec, every other lane is set to
; zero. The index vector never crosses a 16-byte boundary. The assertions
; expect vptestnmb building %k1 followed by vpshufb on ymm0 with {%k1} {z}.
317define <32 x i8> @test_masked_z_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %mask) {
318; CHECK-LABEL: test_masked_z_32xi8_perm_mask2:
319; CHECK:       # %bb.0:
320; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
321; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29]
322; CHECK-NEXT:    retq
323  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
324  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
325  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
326  ret <32 x i8> %res
327}
; Unmasked single-source <32 x i8> byte shuffle, fourth index pattern. The index
; vector never crosses a 16-byte boundary. The assertions expect a single
; vpshufb that reads and writes ymm0.
328define <32 x i8> @test_32xi8_perm_mask3(<32 x i8> %vec) {
329; CHECK-LABEL: test_32xi8_perm_mask3:
330; CHECK:       # %bb.0:
331; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18]
332; CHECK-NEXT:    retq
333  %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
334  ret <32 x i8> %res
335}
; Merge-masked <32 x i8> byte shuffle, fourth index pattern: lanes whose %mask
; byte is zero receive the shuffled byte of %vec, every other lane keeps the
; byte from %vec2. The index vector never crosses a 16-byte boundary. The
; assertions expect vptestnmb building %k1, a %k1-masked vpshufb into ymm1, and
; a copy of ymm1 into ymm0.
336define <32 x i8> @test_masked_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
337; CHECK-LABEL: test_masked_32xi8_perm_mask3:
338; CHECK:       # %bb.0:
339; CHECK-NEXT:    vptestnmb %ymm2, %ymm2, %k1
340; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18]
341; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
342; CHECK-NEXT:    retq
343  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
344  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
345  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
346  ret <32 x i8> %res
347}
348
; Zero-masked <32 x i8> byte shuffle, fourth index pattern: lanes whose %mask
; byte is zero receive the shuffled byte of %vec, every other lane is set to
; zero. The index vector never crosses a 16-byte boundary. The assertions
; expect vptestnmb building %k1 followed by vpshufb on ymm0 with {%k1} {z}.
349define <32 x i8> @test_masked_z_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %mask) {
350; CHECK-LABEL: test_masked_z_32xi8_perm_mask3:
351; CHECK:       # %bb.0:
352; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
353; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18]
354; CHECK-NEXT:    retq
355  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
356  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
357  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
358  ret <32 x i8> %res
359}
; Unmasked byte shuffle of a <32 x i8> vector loaded from %vp. The index vector
; never crosses a 16-byte boundary. The assertions expect a vmovdqa load into
; ymm0 followed by a vpshufb that reads and writes ymm0.
360define <32 x i8> @test_32xi8_perm_mem_mask0(<32 x i8>* %vp) {
361; CHECK-LABEL: test_32xi8_perm_mem_mask0:
362; CHECK:       # %bb.0:
363; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
364; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22]
365; CHECK-NEXT:    retq
366  %vec = load <32 x i8>, <32 x i8>* %vp
367  %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
368  ret <32 x i8> %res
369}
; Merge-masked byte shuffle of a <32 x i8> vector loaded from %vp: lanes whose
; %mask byte is zero receive the shuffled byte, every other lane keeps the byte
; from %vec2. The index vector never crosses a 16-byte boundary. The assertions
; expect a vmovdqa load into ymm2, vptestnmb building %k1, and a %k1-masked
; vpshufb writing ymm0.
370define <32 x i8> @test_masked_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
371; CHECK-LABEL: test_masked_32xi8_perm_mem_mask0:
372; CHECK:       # %bb.0:
373; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
374; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
375; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22]
376; CHECK-NEXT:    retq
377  %vec = load <32 x i8>, <32 x i8>* %vp
378  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
379  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
380  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
381  ret <32 x i8> %res
382}
383
; Zero-masked byte shuffle of a <32 x i8> vector loaded from %vp: lanes whose
; %mask byte is zero receive the shuffled byte, every other lane is set to
; zero. The index vector never crosses a 16-byte boundary. The assertions
; expect a vmovdqa load into ymm1, vptestnmb building %k1, and vpshufb into
; ymm0 with {%k1} {z}.
384define <32 x i8> @test_masked_z_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %mask) {
385; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask0:
386; CHECK:       # %bb.0:
387; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
388; CHECK-NEXT:    vptestnmb %ymm0, %ymm0, %k1
389; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22]
390; CHECK-NEXT:    retq
391  %vec = load <32 x i8>, <32 x i8>* %vp
392  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
393  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
394  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
395  ret <32 x i8> %res
396}
397
; Merge-masked byte shuffle of a <32 x i8> vector loaded from %vp, second index
; pattern: lanes whose %mask byte is zero receive the shuffled byte, every
; other lane keeps the byte from %vec2. The index vector never crosses a
; 16-byte boundary. The assertions expect a vmovdqa load into ymm2, vptestnmb
; building %k1, and a %k1-masked vpshufb writing ymm0.
398define <32 x i8> @test_masked_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
399; CHECK-LABEL: test_masked_32xi8_perm_mem_mask1:
400; CHECK:       # %bb.0:
401; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
402; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
403; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19]
404; CHECK-NEXT:    retq
405  %vec = load <32 x i8>, <32 x i8>* %vp
406  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 15, i32 10, i32 1, i32 1, i32 11, i32 0, i32 0, i32 6, i32 8, i32 7, i32 7, i32 9, i32 10, i32 6, i32 5, i32 15, i32 20, i32 28, i32 22, i32 21, i32 17, i32 29, i32 27, i32 30, i32 23, i32 26, i32 17, i32 22, i32 19, i32 16, i32 31, i32 19>
407  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
408  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
409  ret <32 x i8> %res
410}
411
; Zero-masked byte shuffle of a <32 x i8> vector loaded from %vp, second index
; pattern: lanes whose %mask byte is zero receive the shuffled byte, every
; other lane is set to zero. The index vector never crosses a 16-byte boundary.
; The assertions expect a vmovdqa load into ymm1, vptestnmb building %k1, and
; vpshufb into ymm0 with {%k1} {z}.
412define <32 x i8> @test_masked_z_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %mask) {
413; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask1:
414; CHECK:       # %bb.0:
415; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
416; CHECK-NEXT:    vptestnmb %ymm0, %ymm0, %k1
417; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19]
418; CHECK-NEXT:    retq
419  %vec = load <32 x i8>, <32 x i8>* %vp
420  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 15, i32 10, i32 1, i32 1, i32 11, i32 0, i32 0, i32 6, i32 8, i32 7, i32 7, i32 9, i32 10, i32 6, i32 5, i32 15, i32 20, i32 28, i32 22, i32 21, i32 17, i32 29, i32 27, i32 30, i32 23, i32 26, i32 17, i32 22, i32 19, i32 16, i32 31, i32 19>
421  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
422  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
423  ret <32 x i8> %res
424}
425
; Merge-masked byte shuffle of a <32 x i8> vector loaded from %vp, third index
; pattern: lanes whose %mask byte is zero receive the shuffled byte, every
; other lane keeps the byte from %vec2. The index vector never crosses a
; 16-byte boundary. The assertions expect a vmovdqa load into ymm2, vptestnmb
; building %k1, and a %k1-masked vpshufb writing ymm0.
426define <32 x i8> @test_masked_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
427; CHECK-LABEL: test_masked_32xi8_perm_mem_mask2:
428; CHECK:       # %bb.0:
429; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
430; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
431; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28]
432; CHECK-NEXT:    retq
433  %vec = load <32 x i8>, <32 x i8>* %vp
434  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 6, i32 8, i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7, i32 14, i32 5, i32 7, i32 7, i32 26, i32 19, i32 25, i32 19, i32 21, i32 31, i32 30, i32 29, i32 16, i32 18, i32 20, i32 28, i32 29, i32 25, i32 27, i32 28>
435  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
436  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
437  ret <32 x i8> %res
438}
439
; Zero-masked byte shuffle of a <32 x i8> vector loaded from %vp, third index
; pattern: lanes whose %mask byte is zero receive the shuffled byte, every
; other lane is set to zero. The index vector never crosses a 16-byte boundary.
; The assertions expect a vmovdqa load into ymm1, vptestnmb building %k1, and
; vpshufb into ymm0 with {%k1} {z}.
440define <32 x i8> @test_masked_z_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %mask) {
441; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask2:
442; CHECK:       # %bb.0:
443; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
444; CHECK-NEXT:    vptestnmb %ymm0, %ymm0, %k1
445; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28]
446; CHECK-NEXT:    retq
447  %vec = load <32 x i8>, <32 x i8>* %vp
448  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 6, i32 8, i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7, i32 14, i32 5, i32 7, i32 7, i32 26, i32 19, i32 25, i32 19, i32 21, i32 31, i32 30, i32 29, i32 16, i32 18, i32 20, i32 28, i32 29, i32 25, i32 27, i32 28>
449  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
450  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
451  ret <32 x i8> %res
452}
453
; Unmasked byte shuffle of a <32 x i8> vector loaded from %vp, fourth index
; pattern. The index vector never crosses a 16-byte boundary. The assertions
; expect a vmovdqa load into ymm0 followed by a vpshufb that reads and writes
; ymm0.
454define <32 x i8> @test_32xi8_perm_mem_mask3(<32 x i8>* %vp) {
455; CHECK-LABEL: test_32xi8_perm_mem_mask3:
456; CHECK:       # %bb.0:
457; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
458; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29]
459; CHECK-NEXT:    retq
460  %vec = load <32 x i8>, <32 x i8>* %vp
461  %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
462  ret <32 x i8> %res
463}
; Merge-masked byte shuffle of a <32 x i8> vector loaded from %vp, fourth index
; pattern: lanes whose %mask byte is zero receive the shuffled byte, every
; other lane keeps the byte from %vec2. The index vector never crosses a
; 16-byte boundary. The assertions expect a vmovdqa load into ymm2, vptestnmb
; building %k1, and a %k1-masked vpshufb writing ymm0.
464define <32 x i8> @test_masked_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
465; CHECK-LABEL: test_masked_32xi8_perm_mem_mask3:
466; CHECK:       # %bb.0:
467; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
468; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k1
469; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29]
470; CHECK-NEXT:    retq
471  %vec = load <32 x i8>, <32 x i8>* %vp
472  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
473  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
474  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
475  ret <32 x i8> %res
476}
477
; Zero-masked byte shuffle of a <32 x i8> vector loaded from %vp, fourth index
; pattern: lanes whose %mask byte is zero receive the shuffled byte, every
; other lane is set to zero. The index vector never crosses a 16-byte boundary.
; The assertions expect a vmovdqa load into ymm1, vptestnmb building %k1, and
; vpshufb into ymm0 with {%k1} {z}.
478define <32 x i8> @test_masked_z_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %mask) {
479; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask3:
480; CHECK:       # %bb.0:
481; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
482; CHECK-NEXT:    vptestnmb %ymm0, %ymm0, %k1
483; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29]
484; CHECK-NEXT:    retq
485  %vec = load <32 x i8>, <32 x i8>* %vp
486  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
487  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
488  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
489  ret <32 x i8> %res
490}
491
; Unmasked single-source byte shuffle of the <64 x i8> argument %vec. The index
; vector never crosses a 16-byte boundary: each group of 16 result bytes reads
; only from the matching 16 source bytes (0-15, 16-31, 32-47, 48-63). The
; assertions expect a single vpshufb that reads and writes zmm0.
492define <64 x i8> @test_64xi8_perm_mask0(<64 x i8> %vec) {
493; CHECK-LABEL: test_64xi8_perm_mask0:
494; CHECK:       # %bb.0:
495; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62]
496; CHECK-NEXT:    retq
497  %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
498  ret <64 x i8> %res
499}
; Merge-masked constant byte shuffle of %vec: a result byte is the shuffled
; byte where the matching %mask byte is zero, otherwise the %vec2 byte.
define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_64xi8_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmb %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62]
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %perm = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
  %mask_is_zero = icmp eq <64 x i8> %mask, zeroinitializer
  %out = select <64 x i1> %mask_is_zero, <64 x i8> %perm, <64 x i8> %vec2
  ret <64 x i8> %out
}
512
; Zero-masked constant byte shuffle of %vec: a result byte is the shuffled
; byte where the matching %mask byte is zero, and 0 otherwise.
define <64 x i8> @test_masked_z_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_z_64xi8_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62]
; CHECK-NEXT:    retq
  %perm = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
  %mask_is_zero = icmp eq <64 x i8> %mask, zeroinitializer
  %out = select <64 x i1> %mask_is_zero, <64 x i8> %perm, <64 x i8> zeroinitializer
  ret <64 x i8> %out
}
; Merge-masked constant byte shuffle of %vec: a result byte is the shuffled
; byte where the matching %mask byte is zero, otherwise the %vec2 byte.
define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_64xi8_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmb %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49]
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %perm = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
  %mask_is_zero = icmp eq <64 x i8> %mask, zeroinitializer
  %out = select <64 x i1> %mask_is_zero, <64 x i8> %perm, <64 x i8> %vec2
  ret <64 x i8> %out
}
536
; Zero-masked constant byte shuffle of %vec: a result byte is the shuffled
; byte where the matching %mask byte is zero, and 0 otherwise.
define <64 x i8> @test_masked_z_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_z_64xi8_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49]
; CHECK-NEXT:    retq
  %perm = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
  %mask_is_zero = icmp eq <64 x i8> %mask, zeroinitializer
  %out = select <64 x i1> %mask_is_zero, <64 x i8> %perm, <64 x i8> zeroinitializer
  ret <64 x i8> %out
}
; Merge-masked constant byte shuffle of %vec: a result byte is the shuffled
; byte where the matching %mask byte is zero, otherwise the %vec2 byte.
define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_64xi8_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmb %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60]
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %perm = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
  %mask_is_zero = icmp eq <64 x i8> %mask, zeroinitializer
  %out = select <64 x i1> %mask_is_zero, <64 x i8> %perm, <64 x i8> %vec2
  ret <64 x i8> %out
}
560
; Zero-masked constant byte shuffle of %vec: a result byte is the shuffled
; byte where the matching %mask byte is zero, and 0 otherwise.
define <64 x i8> @test_masked_z_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_z_64xi8_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60]
; CHECK-NEXT:    retq
  %perm = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
  %mask_is_zero = icmp eq <64 x i8> %mask, zeroinitializer
  %out = select <64 x i1> %mask_is_zero, <64 x i8> %perm, <64 x i8> zeroinitializer
  ret <64 x i8> %out
}
; Unmasked constant byte shuffle of a <64 x i8> register argument; the
; expected lowering is a single vpshufb.
define <64 x i8> @test_64xi8_perm_mask3(<64 x i8> %vec) {
; CHECK-LABEL: test_64xi8_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61]
; CHECK-NEXT:    retq
  %perm = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
  ret <64 x i8> %perm
}
; Merge-masked constant byte shuffle of %vec: a result byte is the shuffled
; byte where the matching %mask byte is zero, otherwise the %vec2 byte.
define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_64xi8_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmb %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61]
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %perm = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
  %mask_is_zero = icmp eq <64 x i8> %mask, zeroinitializer
  %out = select <64 x i1> %mask_is_zero, <64 x i8> %perm, <64 x i8> %vec2
  ret <64 x i8> %out
}
592
; Zero-masked constant byte shuffle of %vec: a result byte is the shuffled
; byte where the matching %mask byte is zero, and 0 otherwise.
define <64 x i8> @test_masked_z_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_z_64xi8_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61]
; CHECK-NEXT:    retq
  %perm = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
  %mask_is_zero = icmp eq <64 x i8> %mask, zeroinitializer
  %out = select <64 x i1> %mask_is_zero, <64 x i8> %perm, <64 x i8> zeroinitializer
  ret <64 x i8> %out
}
; Unmasked constant byte shuffle of a <64 x i8> vector loaded from %vp; the
; expected code is a vector load followed by a single vpshufb.
define <64 x i8> @test_64xi8_perm_mem_mask0(<64 x i8>* %vp) {
; CHECK-LABEL: test_64xi8_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58]
; CHECK-NEXT:    retq
  %loaded = load <64 x i8>, <64 x i8>* %vp
  %perm = shufflevector <64 x i8> %loaded, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
  ret <64 x i8> %perm
}
; Merge-masked constant byte shuffle of a <64 x i8> vector loaded from %vp:
; a result byte is the shuffled byte where the matching %mask byte is zero,
; otherwise the %vec2 byte.
define <64 x i8> @test_masked_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_64xi8_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58]
; CHECK-NEXT:    retq
  %loaded = load <64 x i8>, <64 x i8>* %vp
  %perm = shufflevector <64 x i8> %loaded, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
  %mask_is_zero = icmp eq <64 x i8> %mask, zeroinitializer
  %out = select <64 x i1> %mask_is_zero, <64 x i8> %perm, <64 x i8> %vec2
  ret <64 x i8> %out
}
627
; Zero-masked constant byte shuffle of a <64 x i8> vector loaded from %vp:
; a result byte is the shuffled byte where the matching %mask byte is zero,
; and 0 otherwise.
define <64 x i8> @test_masked_z_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
; CHECK-NEXT:    vptestnmb %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58]
; CHECK-NEXT:    retq
  %loaded = load <64 x i8>, <64 x i8>* %vp
  %perm = shufflevector <64 x i8> %loaded, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
  %mask_is_zero = icmp eq <64 x i8> %mask, zeroinitializer
  %out = select <64 x i1> %mask_is_zero, <64 x i8> %perm, <64 x i8> zeroinitializer
  ret <64 x i8> %out
}
641
; Merge-masked constant byte shuffle of a <64 x i8> vector loaded from %vp:
; a result byte is the shuffled byte where the matching %mask byte is zero,
; otherwise the %vec2 byte.
define <64 x i8> @test_masked_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_64xi8_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49]
; CHECK-NEXT:    retq
  %loaded = load <64 x i8>, <64 x i8>* %vp
  %perm = shufflevector <64 x i8> %loaded, <64 x i8> undef, <64 x i32> <i32 15, i32 6, i32 14, i32 7, i32 5, i32 1, i32 14, i32 12, i32 5, i32 7, i32 5, i32 0, i32 0, i32 5, i32 3, i32 8, i32 19, i32 19, i32 26, i32 27, i32 20, i32 29, i32 20, i32 21, i32 27, i32 16, i32 30, i32 17, i32 23, i32 27, i32 16, i32 28, i32 47, i32 39, i32 33, i32 33, i32 33, i32 44, i32 38, i32 46, i32 39, i32 33, i32 38, i32 44, i32 45, i32 32, i32 34, i32 39, i32 50, i32 61, i32 62, i32 53, i32 54, i32 56, i32 52, i32 56, i32 51, i32 52, i32 55, i32 57, i32 56, i32 52, i32 51, i32 49>
  %mask_is_zero = icmp eq <64 x i8> %mask, zeroinitializer
  %out = select <64 x i1> %mask_is_zero, <64 x i8> %perm, <64 x i8> %vec2
  ret <64 x i8> %out
}
655
; Zero-masked constant byte shuffle of a <64 x i8> vector loaded from %vp:
; a result byte is the shuffled byte where the matching %mask byte is zero,
; and 0 otherwise.
define <64 x i8> @test_masked_z_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
; CHECK-NEXT:    vptestnmb %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49]
; CHECK-NEXT:    retq
  %loaded = load <64 x i8>, <64 x i8>* %vp
  %perm = shufflevector <64 x i8> %loaded, <64 x i8> undef, <64 x i32> <i32 15, i32 6, i32 14, i32 7, i32 5, i32 1, i32 14, i32 12, i32 5, i32 7, i32 5, i32 0, i32 0, i32 5, i32 3, i32 8, i32 19, i32 19, i32 26, i32 27, i32 20, i32 29, i32 20, i32 21, i32 27, i32 16, i32 30, i32 17, i32 23, i32 27, i32 16, i32 28, i32 47, i32 39, i32 33, i32 33, i32 33, i32 44, i32 38, i32 46, i32 39, i32 33, i32 38, i32 44, i32 45, i32 32, i32 34, i32 39, i32 50, i32 61, i32 62, i32 53, i32 54, i32 56, i32 52, i32 56, i32 51, i32 52, i32 55, i32 57, i32 56, i32 52, i32 51, i32 49>
  %mask_is_zero = icmp eq <64 x i8> %mask, zeroinitializer
  %out = select <64 x i1> %mask_is_zero, <64 x i8> %perm, <64 x i8> zeroinitializer
  ret <64 x i8> %out
}
669
; Merge-masked constant byte shuffle of a <64 x i8> vector loaded from %vp:
; a result byte is the shuffled byte where the matching %mask byte is zero,
; otherwise the %vec2 byte.
define <64 x i8> @test_masked_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_64xi8_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61]
; CHECK-NEXT:    retq
  %loaded = load <64 x i8>, <64 x i8>* %vp
  %perm = shufflevector <64 x i8> %loaded, <64 x i8> undef, <64 x i32> <i32 12, i32 1, i32 11, i32 3, i32 4, i32 11, i32 10, i32 11, i32 8, i32 13, i32 1, i32 10, i32 1, i32 11, i32 5, i32 10, i32 27, i32 26, i32 19, i32 29, i32 19, i32 24, i32 26, i32 19, i32 26, i32 20, i32 18, i32 28, i32 24, i32 21, i32 25, i32 16, i32 34, i32 38, i32 47, i32 40, i32 33, i32 44, i32 44, i32 44, i32 41, i32 43, i32 35, i32 43, i32 45, i32 44, i32 37, i32 41, i32 58, i32 62, i32 49, i32 61, i32 56, i32 53, i32 55, i32 48, i32 51, i32 58, i32 58, i32 55, i32 63, i32 55, i32 53, i32 61>
  %mask_is_zero = icmp eq <64 x i8> %mask, zeroinitializer
  %out = select <64 x i1> %mask_is_zero, <64 x i8> %perm, <64 x i8> %vec2
  ret <64 x i8> %out
}
683
; Zero-masked constant byte shuffle of a <64 x i8> vector loaded from %vp:
; a result byte is the shuffled byte where the matching %mask byte is zero,
; and 0 otherwise.
define <64 x i8> @test_masked_z_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
; CHECK-NEXT:    vptestnmb %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61]
; CHECK-NEXT:    retq
  %loaded = load <64 x i8>, <64 x i8>* %vp
  %perm = shufflevector <64 x i8> %loaded, <64 x i8> undef, <64 x i32> <i32 12, i32 1, i32 11, i32 3, i32 4, i32 11, i32 10, i32 11, i32 8, i32 13, i32 1, i32 10, i32 1, i32 11, i32 5, i32 10, i32 27, i32 26, i32 19, i32 29, i32 19, i32 24, i32 26, i32 19, i32 26, i32 20, i32 18, i32 28, i32 24, i32 21, i32 25, i32 16, i32 34, i32 38, i32 47, i32 40, i32 33, i32 44, i32 44, i32 44, i32 41, i32 43, i32 35, i32 43, i32 45, i32 44, i32 37, i32 41, i32 58, i32 62, i32 49, i32 61, i32 56, i32 53, i32 55, i32 48, i32 51, i32 58, i32 58, i32 55, i32 63, i32 55, i32 53, i32 61>
  %mask_is_zero = icmp eq <64 x i8> %mask, zeroinitializer
  %out = select <64 x i1> %mask_is_zero, <64 x i8> %perm, <64 x i8> zeroinitializer
  ret <64 x i8> %out
}
697
; Unmasked constant byte shuffle of a <64 x i8> vector loaded from %vp; the
; expected code is a vector load followed by a single vpshufb.
define <64 x i8> @test_64xi8_perm_mem_mask3(<64 x i8>* %vp) {
; CHECK-LABEL: test_64xi8_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60]
; CHECK-NEXT:    retq
  %loaded = load <64 x i8>, <64 x i8>* %vp
  %perm = shufflevector <64 x i8> %loaded, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
  ret <64 x i8> %perm
}
; Merge-masked constant byte shuffle of a <64 x i8> vector loaded from %vp:
; a result byte is the shuffled byte where the matching %mask byte is zero,
; otherwise the %vec2 byte.
define <64 x i8> @test_masked_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_64xi8_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT:    vptestnmb %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60]
; CHECK-NEXT:    retq
  %loaded = load <64 x i8>, <64 x i8>* %vp
  %perm = shufflevector <64 x i8> %loaded, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
  %mask_is_zero = icmp eq <64 x i8> %mask, zeroinitializer
  %out = select <64 x i1> %mask_is_zero, <64 x i8> %perm, <64 x i8> %vec2
  ret <64 x i8> %out
}
721
; Zero-masked constant byte shuffle of a <64 x i8> vector loaded from %vp:
; a result byte is the shuffled byte where the matching %mask byte is zero,
; and 0 otherwise.
define <64 x i8> @test_masked_z_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %mask) {
; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
; CHECK-NEXT:    vptestnmb %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60]
; CHECK-NEXT:    retq
  %loaded = load <64 x i8>, <64 x i8>* %vp
  %perm = shufflevector <64 x i8> %loaded, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
  %mask_is_zero = icmp eq <64 x i8> %mask, zeroinitializer
  %out = select <64 x i1> %mask_is_zero, <64 x i8> %perm, <64 x i8> zeroinitializer
  ret <64 x i8> %out
}
735
; Unmasked shuffle that reorders elements 4..7 of a <8 x i16> vector and
; leaves elements 0..3 in place; the expected lowering is vpshufhw.
define <8 x i16> @test_8xi16_perm_high_mask0(<8 x i16> %vec) {
; CHECK-LABEL: test_8xi16_perm_high_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,6]
; CHECK-NEXT:    retq
  %perm = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
  ret <8 x i16> %perm
}
; Merge-masked shuffle of elements 4..7: a result element is the shuffled
; element where the matching %mask element is zero, otherwise the %vec2 one.
define <8 x i16> @test_masked_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_high_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6]
; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
; CHECK-NEXT:    retq
  %perm = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
  %mask_is_zero = icmp eq <8 x i16> %mask, zeroinitializer
  %out = select <8 x i1> %mask_is_zero, <8 x i16> %perm, <8 x i16> %vec2
  ret <8 x i16> %out
}
756
; Zero-masked shuffle of elements 4..7: a result element is the shuffled
; element where the matching %mask element is zero, and 0 otherwise.
define <8 x i16> @test_masked_z_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6]
; CHECK-NEXT:    retq
  %perm = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
  %mask_is_zero = icmp eq <8 x i16> %mask, zeroinitializer
  %out = select <8 x i1> %mask_is_zero, <8 x i16> %perm, <8 x i16> zeroinitializer
  ret <8 x i16> %out
}
; Merge-masked shuffle of elements 0..3 (elements 4..7 stay in place): a
; result element is the shuffled element where the matching %mask element is
; zero, otherwise the %vec2 one.
define <8 x i16> @test_masked_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_low_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7]
; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
; CHECK-NEXT:    retq
  %perm = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
  %mask_is_zero = icmp eq <8 x i16> %mask, zeroinitializer
  %out = select <8 x i1> %mask_is_zero, <8 x i16> %perm, <8 x i16> %vec2
  ret <8 x i16> %out
}
780
; Zero-masked shuffle of elements 0..3 (elements 4..7 stay in place): a
; result element is the shuffled element where the matching %mask element is
; zero, and 0 otherwise.
define <8 x i16> @test_masked_z_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7]
; CHECK-NEXT:    retq
  %perm = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
  %mask_is_zero = icmp eq <8 x i16> %mask, zeroinitializer
  %out = select <8 x i1> %mask_is_zero, <8 x i16> %perm, <8 x i16> zeroinitializer
  ret <8 x i16> %out
}
; Merge-masked shuffle of elements 4..7: a result element is the shuffled
; element where the matching %mask element is zero, otherwise the %vec2 one.
define <8 x i16> @test_masked_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_high_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5]
; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
; CHECK-NEXT:    retq
  %perm = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
  %mask_is_zero = icmp eq <8 x i16> %mask, zeroinitializer
  %out = select <8 x i1> %mask_is_zero, <8 x i16> %perm, <8 x i16> %vec2
  ret <8 x i16> %out
}
804
; Zero-masked shuffle of elements 4..7: a result element is the shuffled
; element where the matching %mask element is zero, and 0 otherwise.
define <8 x i16> @test_masked_z_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5]
; CHECK-NEXT:    retq
  %perm = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
  %mask_is_zero = icmp eq <8 x i16> %mask, zeroinitializer
  %out = select <8 x i1> %mask_is_zero, <8 x i16> %perm, <8 x i16> zeroinitializer
  ret <8 x i16> %out
}
; Unmasked shuffle that reorders elements 0..3 of a <8 x i16> vector and
; leaves elements 4..7 in place; the expected lowering is vpshuflw.
define <8 x i16> @test_8xi16_perm_low_mask3(<8 x i16> %vec) {
; CHECK-LABEL: test_8xi16_perm_low_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,1,1,4,5,6,7]
; CHECK-NEXT:    retq
  %perm = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %perm
}
; Merge-masked shuffle of elements 0..3 (elements 4..7 stay in place): a
; result element is the shuffled element where the matching %mask element is
; zero, otherwise the %vec2 one.
define <8 x i16> @test_masked_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_low_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7]
; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
; CHECK-NEXT:    retq
  %perm = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
  %mask_is_zero = icmp eq <8 x i16> %mask, zeroinitializer
  %out = select <8 x i1> %mask_is_zero, <8 x i16> %perm, <8 x i16> %vec2
  ret <8 x i16> %out
}
836
; Zero-masked shuffle of elements 0..3 (elements 4..7 stay in place): a
; result element is the shuffled element where the matching %mask element is
; zero, and 0 otherwise.
define <8 x i16> @test_masked_z_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7]
; CHECK-NEXT:    retq
  %perm = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
  %mask_is_zero = icmp eq <8 x i16> %mask, zeroinitializer
  %out = select <8 x i1> %mask_is_zero, <8 x i16> %perm, <8 x i16> zeroinitializer
  ret <8 x i16> %out
}
; Merge-masked shuffle of elements 4..7: a result element is the shuffled
; element where the matching %mask element is zero, otherwise the %vec2 one.
define <8 x i16> @test_masked_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_high_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6]
; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
; CHECK-NEXT:    retq
  %perm = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
  %mask_is_zero = icmp eq <8 x i16> %mask, zeroinitializer
  %out = select <8 x i1> %mask_is_zero, <8 x i16> %perm, <8 x i16> %vec2
  ret <8 x i16> %out
}
860
; Zero-masked shuffle of elements 4..7: a result element is the shuffled
; element where the matching %mask element is zero, and 0 otherwise.
define <8 x i16> @test_masked_z_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6]
; CHECK-NEXT:    retq
  %perm = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
  %mask_is_zero = icmp eq <8 x i16> %mask, zeroinitializer
  %out = select <8 x i1> %mask_is_zero, <8 x i16> %perm, <8 x i16> zeroinitializer
  ret <8 x i16> %out
}
; Merge-masked shuffle of elements 0..3 (elements 4..7 stay in place): a
; result element is the shuffled element where the matching %mask element is
; zero, otherwise the %vec2 one.
define <8 x i16> @test_masked_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_low_mask5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7]
; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
; CHECK-NEXT:    retq
  %perm = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
  %mask_is_zero = icmp eq <8 x i16> %mask, zeroinitializer
  %out = select <8 x i1> %mask_is_zero, <8 x i16> %perm, <8 x i16> %vec2
  ret <8 x i16> %out
}
884
; Zero-masked shuffle of elements 0..3 (elements 4..7 stay in place): a
; result element is the shuffled element where the matching %mask element is
; zero, and 0 otherwise.
define <8 x i16> @test_masked_z_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7]
; CHECK-NEXT:    retq
  %perm = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
  %mask_is_zero = icmp eq <8 x i16> %mask, zeroinitializer
  %out = select <8 x i1> %mask_is_zero, <8 x i16> %perm, <8 x i16> zeroinitializer
  ret <8 x i16> %out
}
; Unmasked shuffle that reorders elements 4..7 of a <8 x i16> vector and
; leaves elements 0..3 in place; the expected lowering is vpshufhw.
define <8 x i16> @test_8xi16_perm_high_mask6(<8 x i16> %vec) {
; CHECK-LABEL: test_8xi16_perm_high_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,5]
; CHECK-NEXT:    retq
  %perm = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
  ret <8 x i16> %perm
}
; Merge-masked <8 x i16> shuffle of the high four words (order 6,5,6,5): lanes whose
; %mask element is zero take the shuffled word, the others keep %vec2.
; Expected codegen: vpshufhw writes into the %vec2 register under {%k1}, then a move.
define <8 x i16> @test_masked_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_high_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5]
; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}
916
; Zero-masked <8 x i16> shuffle of the high four words (order 6,5,6,5): lanes whose
; %mask element is zero take the shuffled word, every other lane becomes zero.
; Expected codegen: a single vpshufhw with {%k1} {z}.
define <8 x i16> @test_masked_z_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
; Merge-masked <8 x i16> shuffle of the low four words (order 1,0,2,0): lanes whose
; %mask element is zero take the shuffled word, the others keep %vec2.
; Expected codegen: vpshuflw writes into the %vec2 register under {%k1}, then a move.
define <8 x i16> @test_masked_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_low_mask7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7]
; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}
940
; Zero-masked <8 x i16> shuffle of the low four words (order 1,0,2,0): lanes whose
; %mask element is zero take the shuffled word, every other lane becomes zero.
; Expected codegen: a single vpshuflw with {%k1} {z}.
define <8 x i16> @test_masked_z_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
; Unmasked <8 x i16> shuffle of the high four words (7,7,4,6) applied to a vector
; loaded from %vp; the load is expected to fold into vpshufhw as a memory operand.
define <8 x i16> @test_8xi16_perm_high_mem_mask0(<8 x i16>* %vp) {
; CHECK-LABEL: test_8xi16_perm_high_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,7,4,6]
; CHECK-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %vp
  %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
  ret <8 x i16> %res
}
; Merge-masked high-word shuffle (7,7,4,6) of a vector loaded from %vp: lanes whose
; %mask element is zero take the shuffled word, the others keep %vec2.
; Expected codegen: the load folds into vpshufhw, which merges into xmm0 under {%k1}.
define <8 x i16> @test_masked_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6]
; CHECK-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %vp
  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}
973
; Zero-masked high-word shuffle (7,7,4,6) of a vector loaded from %vp: lanes whose
; %mask element is zero take the shuffled word, every other lane becomes zero.
; Expected codegen: the load folds into a single vpshufhw with {%k1} {z}.
define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6]
; CHECK-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %vp
  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
986
; Merge-masked low-word shuffle (1,3,3,2) of a vector loaded from %vp: lanes whose
; %mask element is zero take the shuffled word, the others keep %vec2.
; Expected codegen: the load folds into vpshuflw, which merges into xmm0 under {%k1}.
define <8 x i16> @test_masked_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7]
; CHECK-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %vp
  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}
999
; Zero-masked low-word shuffle (1,3,3,2) of a vector loaded from %vp: lanes whose
; %mask element is zero take the shuffled word, every other lane becomes zero.
; Expected codegen: the load folds into a single vpshuflw with {%k1} {z}.
define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7]
; CHECK-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %vp
  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
1012
; Merge-masked high-word shuffle (6,6,5,7) of a vector loaded from %vp: lanes whose
; %mask element is zero take the shuffled word, the others keep %vec2.
; Expected codegen: the load folds into vpshufhw, which merges into xmm0 under {%k1}.
define <8 x i16> @test_masked_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7]
; CHECK-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %vp
  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 6, i32 5, i32 7>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}
1025
; Zero-masked high-word shuffle (6,6,5,7) of a vector loaded from %vp: lanes whose
; %mask element is zero take the shuffled word, every other lane becomes zero.
; Expected codegen: the load folds into a single vpshufhw with {%k1} {z}.
define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7]
; CHECK-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %vp
  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 6, i32 5, i32 7>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
1038
; Unmasked <8 x i16> shuffle of the low four words (3,1,2,0) applied to a vector
; loaded from %vp; the load is expected to fold into vpshuflw as a memory operand.
define <8 x i16> @test_8xi16_perm_low_mem_mask3(<8 x i16>* %vp) {
; CHECK-LABEL: test_8xi16_perm_low_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 = mem[3,1,2,0,4,5,6,7]
; CHECK-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %vp
  %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %res
}
; Merge-masked low-word shuffle (3,1,2,0) of a vector loaded from %vp: lanes whose
; %mask element is zero take the shuffled word, the others keep %vec2.
; Expected codegen: the load folds into vpshuflw, which merges into xmm0 under {%k1}.
define <8 x i16> @test_masked_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7]
; CHECK-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %vp
  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}
1060
; Zero-masked low-word shuffle (3,1,2,0) of a vector loaded from %vp: lanes whose
; %mask element is zero take the shuffled word, every other lane becomes zero.
; Expected codegen: the load folds into a single vpshuflw with {%k1} {z}.
define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7]
; CHECK-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %vp
  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
1073
; Merge-masked high-word shuffle (7,6,7,5) of a vector loaded from %vp: lanes whose
; %mask element is zero take the shuffled word, the others keep %vec2.
; Expected codegen: the load folds into vpshufhw, which merges into xmm0 under {%k1}.
define <8 x i16> @test_masked_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5]
; CHECK-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %vp
  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 7, i32 5>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}
1086
; Zero-masked high-word shuffle (7,6,7,5) of a vector loaded from %vp: lanes whose
; %mask element is zero take the shuffled word, every other lane becomes zero.
; Expected codegen: the load folds into a single vpshufhw with {%k1} {z}.
define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5]
; CHECK-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %vp
  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 7, i32 5>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
1099
; Merge-masked low-word shuffle (2,1,3,2) of a vector loaded from %vp: lanes whose
; %mask element is zero take the shuffled word, the others keep %vec2.
; Expected codegen: the load folds into vpshuflw, which merges into xmm0 under {%k1}.
define <8 x i16> @test_masked_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7]
; CHECK-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %vp
  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}
1112
; Zero-masked low-word shuffle (2,1,3,2) of a vector loaded from %vp: lanes whose
; %mask element is zero take the shuffled word, every other lane becomes zero.
; Expected codegen: the load folds into a single vpshuflw with {%k1} {z}.
define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7]
; CHECK-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %vp
  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
1125
; Unmasked <8 x i16> shuffle of the high four words (7,4,4,4) applied to a vector
; loaded from %vp; the load is expected to fold into vpshufhw as a memory operand.
define <8 x i16> @test_8xi16_perm_high_mem_mask6(<8 x i16>* %vp) {
; CHECK-LABEL: test_8xi16_perm_high_mem_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,4,4,4]
; CHECK-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %vp
  %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
  ret <8 x i16> %res
}
; Merge-masked high-word shuffle (7,4,4,4) of a vector loaded from %vp: lanes whose
; %mask element is zero take the shuffled word, the others keep %vec2.
; Expected codegen: the load folds into vpshufhw, which merges into xmm0 under {%k1}.
define <8 x i16> @test_masked_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4]
; CHECK-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %vp
  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}
1147
; Zero-masked high-word shuffle (7,4,4,4) of a vector loaded from %vp: lanes whose
; %mask element is zero take the shuffled word, every other lane becomes zero.
; Expected codegen: the load folds into a single vpshufhw with {%k1} {z}.
define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4]
; CHECK-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %vp
  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
1160
; Merge-masked low-word shuffle (0,3,3,1) of a vector loaded from %vp: lanes whose
; %mask element is zero take the shuffled word, the others keep %vec2.
; Expected codegen: the load folds into vpshuflw, which merges into xmm0 under {%k1}.
define <8 x i16> @test_masked_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7]
; CHECK-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %vp
  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 1, i32 4, i32 5, i32 6, i32 7>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}
1173
; Zero-masked low-word shuffle (0,3,3,1) of a vector loaded from %vp: lanes whose
; %mask element is zero take the shuffled word, every other lane becomes zero.
; Expected codegen: the load folds into a single vpshuflw with {%k1} {z}.
define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7]
; CHECK-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %vp
  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 1, i32 4, i32 5, i32 6, i32 7>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
1186
; Unmasked <16 x i16> shuffle that applies the same high-word order (4,4,6,4) within
; each group of eight words and keeps the low four words of each group in place;
; expected to lower to a single ymm vpshufhw.
define <16 x i16> @test_16xi16_perm_high_mask0(<16 x i16> %vec) {
; CHECK-LABEL: test_16xi16_perm_high_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12]
; CHECK-NEXT:    retq
  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
  ret <16 x i16> %res
}
; Merge-masked <16 x i16> high-word shuffle (4,4,6,4 per eight-word group): lanes
; whose %mask element is zero take the shuffled word, the others keep %vec2.
; Expected codegen: vpshufhw writes into the %vec2 register under {%k1}, then a move.
define <16 x i16> @test_masked_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_high_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12]
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}
1207
; Zero-masked <16 x i16> high-word shuffle (4,4,6,4 per eight-word group): lanes
; whose %mask element is zero take the shuffled word, every other lane becomes zero.
; Expected codegen: a single ymm vpshufhw with {%k1} {z}.
define <16 x i16> @test_masked_z_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12]
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
; Merge-masked <16 x i16> low-word shuffle (0,2,3,2 per eight-word group): lanes
; whose %mask element is zero take the shuffled word, the others keep %vec2.
; Expected codegen: vpshuflw writes into the %vec2 register under {%k1}, then a move.
define <16 x i16> @test_masked_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_low_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15]
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}
1231
; Zero-masked <16 x i16> low-word shuffle (0,2,3,2 per eight-word group): lanes
; whose %mask element is zero take the shuffled word, every other lane becomes zero.
; Expected codegen: a single ymm vpshuflw with {%k1} {z}.
define <16 x i16> @test_masked_z_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15]
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
; Merge-masked <16 x i16> high-word shuffle (7,5,5,5 per eight-word group): lanes
; whose %mask element is zero take the shuffled word, the others keep %vec2.
; Expected codegen: vpshufhw writes into the %vec2 register under {%k1}, then a move.
define <16 x i16> @test_masked_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_high_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13]
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}
1255
; Zero-masked <16 x i16> high-word shuffle (7,5,5,5 per eight-word group): lanes
; whose %mask element is zero take the shuffled word, every other lane becomes zero.
; Expected codegen: a single ymm vpshufhw with {%k1} {z}.
define <16 x i16> @test_masked_z_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13]
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
; Unmasked <16 x i16> shuffle that applies the same low-word order (3,2,3,2) within
; each group of eight words and keeps the high four words of each group in place;
; expected to lower to a single ymm vpshuflw.
define <16 x i16> @test_16xi16_perm_low_mask3(<16 x i16> %vec) {
; CHECK-LABEL: test_16xi16_perm_low_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15]
; CHECK-NEXT:    retq
  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %res
}
; Merge-masked <16 x i16> low-word shuffle (3,2,3,2 per eight-word group): lanes
; whose %mask element is zero take the shuffled word, the others keep %vec2.
; Expected codegen: vpshuflw writes into the %vec2 register under {%k1}, then a move.
define <16 x i16> @test_masked_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_low_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15]
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}
1287
; Zero-masked <16 x i16> low-word shuffle (3,2,3,2 per eight-word group): lanes
; whose %mask element is zero take the shuffled word, every other lane becomes zero.
; Expected codegen: a single ymm vpshuflw with {%k1} {z}.
define <16 x i16> @test_masked_z_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15]
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
; Merge-masked <16 x i16> high-word shuffle (6,7,4,7 per eight-word group): lanes
; whose %mask element is zero take the shuffled word, the others keep %vec2.
; Expected codegen: vpshufhw writes into the %vec2 register under {%k1}, then a move.
define <16 x i16> @test_masked_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_high_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}
1311
; Zero-masked <16 x i16> high-word shuffle (6,7,4,7 per eight-word group): lanes
; whose %mask element is zero take the shuffled word, every other lane becomes zero.
; Expected codegen: a single ymm vpshufhw with {%k1} {z}.
define <16 x i16> @test_masked_z_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
; Merge-masked <16 x i16> low-word shuffle (3,3,3,0 per eight-word group): lanes
; whose %mask element is zero take the shuffled word, the others keep %vec2.
; Expected codegen: vpshuflw writes into the %vec2 register under {%k1}, then a move.
define <16 x i16> @test_masked_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_low_mask5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15]
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}
1335
; Zero-masked <16 x i16> low-word shuffle (3,3,3,0 per eight-word group): lanes
; whose %mask element is zero take the shuffled word, every other lane becomes zero.
; Expected codegen: a single ymm vpshuflw with {%k1} {z}.
define <16 x i16> @test_masked_z_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15]
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
; Unmasked <16 x i16> shuffle that applies the same high-word order (6,7,6,5) within
; each group of eight words and keeps the low four words of each group in place;
; expected to lower to a single ymm vpshufhw.
define <16 x i16> @test_16xi16_perm_high_mask6(<16 x i16> %vec) {
; CHECK-LABEL: test_16xi16_perm_high_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13]
; CHECK-NEXT:    retq
  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
  ret <16 x i16> %res
}
; Merge-masked <16 x i16> high-word shuffle (6,7,6,5 per eight-word group): lanes
; whose %mask element is zero take the shuffled word, the others keep %vec2.
; Expected codegen: vpshufhw writes into the %vec2 register under {%k1}, then a move.
define <16 x i16> @test_masked_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_high_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13]
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}
1367
; Zero-masked <16 x i16> high-word shuffle (6,7,6,5 per eight-word group): lanes
; whose %mask element is zero take the shuffled word, every other lane becomes zero.
; Expected codegen: a single ymm vpshufhw with {%k1} {z}.
define <16 x i16> @test_masked_z_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13]
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
; Merge-masked <16 x i16> low-word shuffle (3,2,1,2 per eight-word group): lanes
; whose %mask element is zero take the shuffled word, the others keep %vec2.
; Expected codegen: vpshuflw writes into the %vec2 register under {%k1}, then a move.
define <16 x i16> @test_masked_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_low_mask7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15]
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}
1391
; Zero-masked <16 x i16> low-word shuffle (3,2,1,2 per eight-word group): lanes
; whose %mask element is zero take the shuffled word, every other lane becomes zero.
; Expected codegen: a single ymm vpshuflw with {%k1} {z}.
define <16 x i16> @test_masked_z_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15]
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
; Unmasked <16 x i16> high-word shuffle (5,6,4,7 per eight-word group) applied to a
; vector loaded from %vp; the load is expected to fold into a ymm vpshufhw.
define <16 x i16> @test_16xi16_perm_high_mem_mask0(<16 x i16>* %vp) {
; CHECK-LABEL: test_16xi16_perm_high_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
  ret <16 x i16> %res
}
; Merge-masked <16 x i16> high-word shuffle (5,6,4,7 per eight-word group) of a vector
; loaded from %vp: lanes whose %mask element is zero take the shuffled word, the
; others keep %vec2. Expected codegen: the load folds into vpshufhw under {%k1}.
define <16 x i16> @test_masked_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}
1424
; Zero-masked <16 x i16> high-word shuffle (5,6,4,7 per eight-word group) of a vector
; loaded from %vp: lanes whose %mask element is zero take the shuffled word, every
; other lane becomes zero. Expected codegen: the load folds into vpshufhw {%k1} {z}.
define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
1437
; Merge-masked <16 x i16> low-word shuffle (1,3,3,0 per eight-word group) of a vector
; loaded from %vp: lanes whose %mask element is zero take the shuffled word, the
; others keep %vec2. Expected codegen: the load folds into vpshuflw under {%k1}.
define <16 x i16> @test_masked_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15]
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}
1450
; Zero-masked <16 x i16> low-word shuffle (1,3,3,0 per eight-word group) of a vector
; loaded from %vp: lanes whose %mask element is zero take the shuffled word, every
; other lane becomes zero. Expected codegen: the load folds into vpshuflw {%k1} {z}.
define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15]
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
1463
; Merge-masked <16 x i16> high-word shuffle (5,6,5,6 per eight-word group) of a vector
; loaded from %vp: lanes whose %mask element is zero take the shuffled word, the
; others keep %vec2. Expected codegen: the load folds into vpshufhw under {%k1}.
define <16 x i16> @test_masked_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14]
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 13, i32 14>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}
1476
; Zero-masking test: permute the high four words of each 128-bit lane of the
; <16 x i16> loaded from %vp; lanes whose %mask word is zero take the permuted
; value, the others become zero. The load should fold into one masked vpshufhw.
define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14]
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 13, i32 14>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}

; Unmasked test: permute the low four words of each 128-bit lane of the
; <16 x i16> loaded from %vp. The load should fold into a single vpshuflw.
define <16 x i16> @test_16xi16_perm_low_mem_mask3(<16 x i16>* %vp) {
; CHECK-LABEL: test_16xi16_perm_low_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15]
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %res
}
; Merge-masking test: permute the low four words of each 128-bit lane of the
; <16 x i16> loaded from %vp; lanes whose %mask word is zero take the permuted
; value, the others keep %vec2. The load should fold into one masked vpshuflw.
define <16 x i16> @test_masked_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15]
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

; Zero-masking test: permute the low four words of each 128-bit lane of the
; <16 x i16> loaded from %vp; lanes whose %mask word is zero take the permuted
; value, the others become zero. The load should fold into one masked vpshuflw.
define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15]
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}

; Merge-masking test: permute the high four words of each 128-bit lane of the
; <16 x i16> loaded from %vp; lanes whose %mask word is zero take the permuted
; value, the others keep %vec2. The load should fold into one masked vpshufhw.
define <16 x i16> @test_masked_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15]
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 14, i32 15>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

; Zero-masking test: permute the high four words of each 128-bit lane of the
; <16 x i16> loaded from %vp; lanes whose %mask word is zero take the permuted
; value, the others become zero. The load should fold into one masked vpshufhw.
define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15]
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 14, i32 15>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}

; Merge-masking test: permute the low four words of each 128-bit lane of the
; <16 x i16> loaded from %vp; lanes whose %mask word is zero take the permuted
; value, the others keep %vec2. The load should fold into one masked vpshuflw.
define <16 x i16> @test_masked_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15]
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

; Zero-masking test: permute the low four words of each 128-bit lane of the
; <16 x i16> loaded from %vp; lanes whose %mask word is zero take the permuted
; value, the others become zero. The load should fold into one masked vpshuflw.
define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15]
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}

; Unmasked test: permute the high four words of each 128-bit lane of the
; <16 x i16> loaded from %vp. The load should fold into a single vpshufhw.
define <16 x i16> @test_16xi16_perm_high_mem_mask6(<16 x i16>* %vp) {
; CHECK-LABEL: test_16xi16_perm_high_mem_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13]
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
  ret <16 x i16> %res
}
; Merge-masking test: permute the high four words of each 128-bit lane of the
; <16 x i16> loaded from %vp; lanes whose %mask word is zero take the permuted
; value, the others keep %vec2. The load should fold into one masked vpshufhw.
define <16 x i16> @test_masked_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13]
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

; Zero-masking test: permute the high four words of each 128-bit lane of the
; <16 x i16> loaded from %vp; lanes whose %mask word is zero take the permuted
; value, the others become zero. The load should fold into one masked vpshufhw.
define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13]
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}

; Merge-masking test: permute the low four words of each 128-bit lane of the
; <16 x i16> loaded from %vp; lanes whose %mask word is zero take the permuted
; value, the others keep %vec2. The load should fold into one masked vpshuflw.
define <16 x i16> @test_masked_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15]
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

; Zero-masking test: permute the low four words of each 128-bit lane of the
; <16 x i16> loaded from %vp; lanes whose %mask word is zero take the permuted
; value, the others become zero. The load should fold into one masked vpshuflw.
define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15]
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}

; Unmasked test: permute the high four words of each 128-bit lane of the
; <32 x i16> register operand with a single vpshufhw.
define <32 x i16> @test_32xi16_perm_high_mask0(<32 x i16> %vec) {
; CHECK-LABEL: test_32xi16_perm_high_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28]
; CHECK-NEXT:    retq
  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
  ret <32 x i16> %res
}
; Merge-masking test: permute the high four words of each 128-bit lane of the
; <32 x i16> register operand; lanes whose %mask word is zero take the permuted
; value, the others keep %vec2. The masked vpshufhw targets %vec2's register,
; which is then moved into the return register.
define <32 x i16> @test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_high_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28]
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}

; Zero-masking test: permute the high four words of each 128-bit lane of the
; <32 x i16> register operand; lanes whose %mask word is zero take the permuted
; value, the others become zero, using one masked vpshufhw.
define <32 x i16> @test_masked_z_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28]
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}
; Merge-masking test: permute the low four words of each 128-bit lane of the
; <32 x i16> register operand; lanes whose %mask word is zero take the permuted
; value, the others keep %vec2. The masked vpshuflw targets %vec2's register,
; which is then moved into the return register.
define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_low_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31]
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}

; Zero-masking test: permute the low four words of each 128-bit lane of the
; <32 x i16> register operand; lanes whose %mask word is zero take the permuted
; value, the others become zero, using one masked vpshuflw.
define <32 x i16> @test_masked_z_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31]
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}
; Merge-masking test: permute the high four words of each 128-bit lane of the
; <32 x i16> register operand; lanes whose %mask word is zero take the permuted
; value, the others keep %vec2. The masked vpshufhw targets %vec2's register,
; which is then moved into the return register.
define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_high_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31]
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}

; Zero-masking test: permute the high four words of each 128-bit lane of the
; <32 x i16> register operand; lanes whose %mask word is zero take the permuted
; value, the others become zero, using one masked vpshufhw.
define <32 x i16> @test_masked_z_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31]
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}
; Unmasked test: permute the low four words of each 128-bit lane of the
; <32 x i16> register operand with a single vpshuflw.
define <32 x i16> @test_32xi16_perm_low_mask3(<32 x i16> %vec) {
; CHECK-LABEL: test_32xi16_perm_low_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31]
; CHECK-NEXT:    retq
  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i16> %res
}
; Merge-masking test: permute the low four words of each 128-bit lane of the
; <32 x i16> register operand; lanes whose %mask word is zero take the permuted
; value, the others keep %vec2. The masked vpshuflw targets %vec2's register,
; which is then moved into the return register.
define <32 x i16> @test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_low_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31]
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}

; Zero-masking test: permute the low four words of each 128-bit lane of the
; <32 x i16> register operand; lanes whose %mask word is zero take the permuted
; value, the others become zero, using one masked vpshuflw.
define <32 x i16> @test_masked_z_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31]
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}
; Merge-masking test: permute the high four words of each 128-bit lane of the
; <32 x i16> register operand; lanes whose %mask word is zero take the permuted
; value, the others keep %vec2. The masked vpshufhw targets %vec2's register,
; which is then moved into the return register.
define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_high_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30]
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}

; Zero-masking test: permute the high four words of each 128-bit lane of the
; <32 x i16> register operand; lanes whose %mask word is zero take the permuted
; value, the others become zero, using one masked vpshufhw.
define <32 x i16> @test_masked_z_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30]
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}
; Merge-masking test: permute the low four words of each 128-bit lane of the
; <32 x i16> register operand; lanes whose %mask word is zero take the permuted
; value, the others keep %vec2. The masked vpshuflw targets %vec2's register,
; which is then moved into the return register.
define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_low_mask5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31]
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}

; Zero-masking test: permute the low four words of each 128-bit lane of the
; <32 x i16> register operand; lanes whose %mask word is zero take the permuted
; value, the others become zero, using one masked vpshuflw.
define <32 x i16> @test_masked_z_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31]
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}
; Unmasked test: permute the high four words of each 128-bit lane of the
; <32 x i16> register operand with a single vpshufhw.
define <32 x i16> @test_32xi16_perm_high_mask6(<32 x i16> %vec) {
; CHECK-LABEL: test_32xi16_perm_high_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30]
; CHECK-NEXT:    retq
  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
  ret <32 x i16> %res
}
; Merge-masking test: permute the high four words of each 128-bit lane of the
; <32 x i16> register operand; lanes whose %mask word is zero take the permuted
; value, the others keep %vec2. The masked vpshufhw targets %vec2's register,
; which is then moved into the return register.
define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_high_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30]
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}

; Zero-masking test: permute the high four words of each 128-bit lane of the
; <32 x i16> register operand; lanes whose %mask word is zero take the permuted
; value, the others become zero, using one masked vpshufhw.
define <32 x i16> @test_masked_z_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30]
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}
; Merge-masking test: permute the low four words of each 128-bit lane of the
; <32 x i16> register operand; lanes whose %mask word is zero take the permuted
; value, the others keep %vec2. The masked vpshuflw targets %vec2's register,
; which is then moved into the return register.
define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_low_mask7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31]
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}

; Zero-masking test: permute the low four words of each 128-bit lane of the
; <32 x i16> register operand; lanes whose %mask word is zero take the permuted
; value, the others become zero, using one masked vpshuflw.
define <32 x i16> @test_masked_z_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31]
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}
; Unmasked test: permute the high four words of each 128-bit lane of the
; <32 x i16> loaded from %vp. The load should fold into a single vpshufhw.
define <32 x i16> @test_32xi16_perm_high_mem_mask0(<32 x i16>* %vp) {
; CHECK-LABEL: test_32xi16_perm_high_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30]
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %vp
  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
  ret <32 x i16> %res
}
; Merge-masking test: permute the high four words of each 128-bit lane of the
; <32 x i16> loaded from %vp; lanes whose %mask word is zero take the permuted
; value, the others keep %vec2. The load should fold into one masked vpshufhw.
define <32 x i16> @test_masked_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30]
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}

; Zero-masking test: permute the high four words of each 128-bit lane of the
; <32 x i16> loaded from %vp; lanes whose %mask word is zero take the permuted
; value, the others become zero. The load should fold into one masked vpshufhw.
define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30]
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}

; Merge-masking test: permute the low four words of each 128-bit lane of the
; <32 x i16> loaded from %vp; lanes whose %mask word is zero take the permuted
; value, the others keep %vec2. The load should fold into one masked vpshuflw.
define <32 x i16> @test_masked_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31]
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 11, i32 11, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 19, i32 19, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 27, i32 27, i32 28, i32 29, i32 30, i32 31>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}

; Zero-masking test: permute the low four words of each 128-bit lane of the
; <32 x i16> loaded from %vp; lanes whose %mask word is zero take the permuted
; value, the others become zero. The load should fold into one masked vpshuflw.
define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31]
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 11, i32 11, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 19, i32 19, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 27, i32 27, i32 28, i32 29, i32 30, i32 31>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}

; Merge-masking test: permute the high four words of each 128-bit lane of the
; <32 x i16> loaded from %vp; lanes whose %mask word is zero take the permuted
; value, the others keep %vec2. The load should fold into one masked vpshufhw.
define <32 x i16> @test_masked_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28]
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 7, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 15, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 23, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 31, i32 30, i32 28>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}

; Zero-masked high-word permute (4,7,6,4 in every 128-bit lane) of a <32 x i16> loaded from %vp.
; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
; Expected lowering: vptestnmw into %k1, then one zero-masked vpshufhw with the load folded.
define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28]
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 7, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 15, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 23, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 31, i32 30, i32 28>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}

; Unmasked low-word permute (2,2,0,3 in every 128-bit lane) of a <32 x i16> loaded from %vp.
; Expected lowering: a single vpshuflw with the load folded as its memory operand.
define <32 x i16> @test_32xi16_perm_low_mem_mask3(<32 x i16>* %vp) {
; CHECK-LABEL: test_32xi16_perm_low_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31]
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %vp
  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i16> %res
}
; Merge-masked low-word permute (2,2,0,3 in every 128-bit lane) of a <32 x i16> loaded from %vp.
; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
; Expected lowering: vptestnmw into %k1, then one merge-masked vpshuflw with the load folded.
define <32 x i16> @test_masked_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31]
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}

; Zero-masked low-word permute (2,2,0,3 in every 128-bit lane) of a <32 x i16> loaded from %vp.
; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
; Expected lowering: vptestnmw into %k1, then one zero-masked vpshuflw with the load folded.
define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31]
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}

; Merge-masked high-word permute (7,4,6,5 in every 128-bit lane) of a <32 x i16> loaded from %vp.
; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
; Expected lowering: vptestnmw into %k1, then one merge-masked vpshufhw with the load folded.
define <32 x i16> @test_masked_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29]
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 14, i32 13, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 22, i32 21, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 30, i32 29>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}

; Zero-masked high-word permute (7,4,6,5 in every 128-bit lane) of a <32 x i16> loaded from %vp.
; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
; Expected lowering: vptestnmw into %k1, then one zero-masked vpshufhw with the load folded.
define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29]
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 14, i32 13, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 22, i32 21, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 30, i32 29>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}

; Merge-masked low-word permute (0,1,0,1 in every 128-bit lane) of a <32 x i16> loaded from %vp.
; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
; The recorded output performs the shuffle as an unmasked dword vpshufd and applies the word
; mask with a separate vmovdqu16. NOTE(review): possibly a missed single-instruction masked
; vpshuflw lowering -- confirm before relying on this sequence.
define <32 x i16> @test_masked_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15]
; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT:    vmovdqu16 %zmm2, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 16, i32 17, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 24, i32 25, i32 28, i32 29, i32 30, i32 31>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}

; Zero-masked low-word permute (0,1,0,1 in every 128-bit lane) of a <32 x i16> loaded from %vp.
; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
; The recorded output performs the shuffle as an unmasked dword vpshufd and applies the word
; mask with a separate zeroing vmovdqu16. NOTE(review): possibly a missed single-instruction
; masked vpshuflw lowering -- confirm before relying on this sequence.
define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15]
; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 16, i32 17, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 24, i32 25, i32 28, i32 29, i32 30, i32 31>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}

; Unmasked high-word permute (6,5,6,6 in every 128-bit lane) of a <32 x i16> loaded from %vp.
; Expected lowering: a single vpshufhw with the load folded as its memory operand.
define <32 x i16> @test_32xi16_perm_high_mem_mask6(<32 x i16>* %vp) {
; CHECK-LABEL: test_32xi16_perm_high_mem_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30]
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %vp
  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
  ret <32 x i16> %res
}
; Merge-masked high-word permute (6,5,6,6 in every 128-bit lane) of a <32 x i16> loaded from %vp.
; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
; Expected lowering: vptestnmw into %k1, then one merge-masked vpshufhw with the load folded.
define <32 x i16> @test_masked_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30]
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}

; Zero-masked high-word permute (6,5,6,6 in every 128-bit lane) of a <32 x i16> loaded from %vp.
; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
; Expected lowering: vptestnmw into %k1, then one zero-masked vpshufhw with the load folded.
define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30]
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}

; Merge-masked low-word permute (3,1,3,0 in every 128-bit lane) of a <32 x i16> loaded from %vp.
; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
; Expected lowering: vptestnmw into %k1, then one merge-masked vpshuflw with the load folded.
define <32 x i16> @test_masked_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31]
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 1, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 17, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 25, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}

; Zero-masked low-word permute (3,1,3,0 in every 128-bit lane) of a <32 x i16> loaded from %vp.
; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
; Expected lowering: vptestnmw into %k1, then one zero-masked vpshuflw with the load folded.
define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmw %zmm0, %zmm0, %k1
; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31]
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 1, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 17, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 25, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}

; Unmasked permute (2,3,3,0) of a <4 x i32> held in a register.
; Expected lowering: a single vpermilps on xmm0.
define <4 x i32> @test_4xi32_perm_mask0(<4 x i32> %vec) {
; CHECK-LABEL: test_4xi32_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,3,0]
; CHECK-NEXT:    retq
  %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
  ret <4 x i32> %res
}
; Merge-masked permute (2,3,3,0) of a <4 x i32> register operand.
; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
; Expected lowering: vptestnmd into %k1, masked vpshufd into xmm1, then a copy to xmm0.
define <4 x i32> @test_masked_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_4xi32_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0]
; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
  ret <4 x i32> %res
}

; Zero-masked permute (2,3,3,0) of a <4 x i32> register operand.
; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
; Expected lowering: vptestnmd into %k1, then one zero-masked vpshufd.
define <4 x i32> @test_masked_z_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_4xi32_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
  ret <4 x i32> %res
}
; Merge-masked permute (1,0,2,0) of a <4 x i32> register operand.
; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
; Expected lowering: vptestnmd into %k1, masked vpshufd into xmm1, then a copy to xmm0.
define <4 x i32> @test_masked_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_4xi32_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0]
; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
  ret <4 x i32> %res
}

; Zero-masked permute (1,0,2,0) of a <4 x i32> register operand.
; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
; Expected lowering: vptestnmd into %k1, then one zero-masked vpshufd.
define <4 x i32> @test_masked_z_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_4xi32_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
  ret <4 x i32> %res
}
; Merge-masked permute (3,0,1,0) of a <4 x i32> register operand.
; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
; Expected lowering: vptestnmd into %k1, masked vpshufd into xmm1, then a copy to xmm0.
define <4 x i32> @test_masked_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_4xi32_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0]
; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
  ret <4 x i32> %res
}

; Zero-masked permute (3,0,1,0) of a <4 x i32> register operand.
; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
; Expected lowering: vptestnmd into %k1, then one zero-masked vpshufd.
define <4 x i32> @test_masked_z_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_4xi32_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
  ret <4 x i32> %res
}
; Unmasked permute (1,1,0,3) of a <4 x i32> held in a register.
; Expected lowering: a single vpermilps on xmm0.
define <4 x i32> @test_4xi32_perm_mask3(<4 x i32> %vec) {
; CHECK-LABEL: test_4xi32_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,3]
; CHECK-NEXT:    retq
  %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
  ret <4 x i32> %res
}
; Merge-masked permute (1,1,0,3) of a <4 x i32> register operand.
; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
; Expected lowering: vptestnmd into %k1, masked vpshufd into xmm1, then a copy to xmm0.
define <4 x i32> @test_masked_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_4xi32_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3]
; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
  ret <4 x i32> %res
}

; Zero-masked permute (1,1,0,3) of a <4 x i32> register operand.
; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
; Expected lowering: vptestnmd into %k1, then one zero-masked vpshufd.
define <4 x i32> @test_masked_z_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_4xi32_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
  ret <4 x i32> %res
}
; Unmasked permute (0,1,3,3) of a <4 x i32> loaded from %vp.
; Expected lowering: a single vpermilps with the load folded as its memory operand.
define <4 x i32> @test_4xi32_perm_mem_mask0(<4 x i32>* %vp) {
; CHECK-LABEL: test_4xi32_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,3,3]
; CHECK-NEXT:    retq
  %vec = load <4 x i32>, <4 x i32>* %vp
  %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
  ret <4 x i32> %res
}
; Merge-masked permute (0,1,3,3) of a <4 x i32> loaded from %vp.
; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
; Expected lowering: vptestnmd into %k1, then one merge-masked vpshufd with the load folded.
define <4 x i32> @test_masked_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_4xi32_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3]
; CHECK-NEXT:    retq
  %vec = load <4 x i32>, <4 x i32>* %vp
  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
  ret <4 x i32> %res
}

; Zero-masked permute (0,1,3,3) of a <4 x i32> loaded from %vp.
; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
; Expected lowering: vptestnmd into %k1, then one zero-masked vpshufd with the load folded.
define <4 x i32> @test_masked_z_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3]
; CHECK-NEXT:    retq
  %vec = load <4 x i32>, <4 x i32>* %vp
  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
  ret <4 x i32> %res
}

; Merge-masked permute (2,2,3,1) of a <4 x i32> loaded from %vp.
; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
; Expected lowering: vptestnmd into %k1, then one merge-masked vpshufd with the load folded.
define <4 x i32> @test_masked_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_4xi32_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1]
; CHECK-NEXT:    retq
  %vec = load <4 x i32>, <4 x i32>* %vp
  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 1>
  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
  ret <4 x i32> %res
}

; Zero-masked permute (2,2,3,1) of a <4 x i32> loaded from %vp.
; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
; Expected lowering: vptestnmd into %k1, then one zero-masked vpshufd with the load folded.
define <4 x i32> @test_masked_z_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1]
; CHECK-NEXT:    retq
  %vec = load <4 x i32>, <4 x i32>* %vp
  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 1>
  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
  ret <4 x i32> %res
}

; Merge-masked permute (0,3,0,1) of a <4 x i32> loaded from %vp.
; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
; Expected lowering: vptestnmd into %k1, then one merge-masked vpshufd with the load folded.
define <4 x i32> @test_masked_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_4xi32_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1]
; CHECK-NEXT:    retq
  %vec = load <4 x i32>, <4 x i32>* %vp
  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 0, i32 1>
  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
  ret <4 x i32> %res
}

; Zero-masked permute (0,3,0,1) of a <4 x i32> loaded from %vp.
; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
; Expected lowering: vptestnmd into %k1, then one zero-masked vpshufd with the load folded.
define <4 x i32> @test_masked_z_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1]
; CHECK-NEXT:    retq
  %vec = load <4 x i32>, <4 x i32>* %vp
  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 0, i32 1>
  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
  ret <4 x i32> %res
}

; Unmasked permute (1,0,1,0) of a <4 x i32> loaded from %vp.
; Expected lowering: a single vpermilps with the load folded as its memory operand.
define <4 x i32> @test_4xi32_perm_mem_mask3(<4 x i32>* %vp) {
; CHECK-LABEL: test_4xi32_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,0,1,0]
; CHECK-NEXT:    retq
  %vec = load <4 x i32>, <4 x i32>* %vp
  %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
  ret <4 x i32> %res
}
; Merge-masked permute (1,0,1,0) of a <4 x i32> loaded from %vp.
; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
; Expected lowering: vptestnmd into %k1, then one merge-masked vpshufd with the load folded.
define <4 x i32> @test_masked_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_4xi32_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0]
; CHECK-NEXT:    retq
  %vec = load <4 x i32>, <4 x i32>* %vp
  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
  ret <4 x i32> %res
}

; Zero-masked permute (1,0,1,0) of a <4 x i32> loaded from %vp.
; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
; Expected lowering: vptestnmd into %k1, then one zero-masked vpshufd with the load folded.
define <4 x i32> @test_masked_z_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0]
; CHECK-NEXT:    retq
  %vec = load <4 x i32>, <4 x i32>* %vp
  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
  ret <4 x i32> %res
}

; Unmasked in-lane permute (2,3,1,0 in each 128-bit half) of a <8 x i32> held in a register.
; Expected lowering: a single vpermilps on ymm0.
define <8 x i32> @test_8xi32_perm_mask0(<8 x i32> %vec) {
; CHECK-LABEL: test_8xi32_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4]
; CHECK-NEXT:    retq
  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
  ret <8 x i32> %res
}
; Merge-masked in-lane permute (2,3,1,0 in each 128-bit half) of a <8 x i32> register operand.
; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
; Expected lowering: vptestnmd into %k1, masked vpshufd into ymm1, then a copy to ymm0.
define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4]
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
  ret <8 x i32> %res
}

; Zero-masked in-lane permute (2,3,1,0 in each 128-bit half) of a <8 x i32> register operand.
; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
; Expected lowering: vptestnmd into %k1, then one zero-masked vpshufd.
define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
  ret <8 x i32> %res
}
; Merge-masked in-lane permute (0,3,3,3 in each 128-bit half) of a <8 x i32> register operand.
; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
; Expected lowering: vptestnmd into %k1, masked vpshufd into ymm1, then a copy to ymm0.
define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7]
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
  ret <8 x i32> %res
}

; Zero-masked in-lane permute (0,3,3,3 in each 128-bit half) of a <8 x i32> register operand.
; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
; Expected lowering: vptestnmd into %k1, then one zero-masked vpshufd.
define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
  ret <8 x i32> %res
}
; Merge-masked in-lane permute (1,2,0,3 in each 128-bit half) of a <8 x i32> register operand.
; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
; Expected lowering: vptestnmd into %k1, masked vpshufd into ymm1, then a copy to ymm0.
define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7]
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
  ret <8 x i32> %res
}

; Zero-masked in-lane permute (1,2,0,3 in each 128-bit half) of a <8 x i32> register operand.
; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
; Expected lowering: vptestnmd into %k1, then one zero-masked vpshufd.
define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
  ret <8 x i32> %res
}
; Unmasked in-lane permute (1,3,1,0 in each 128-bit half) of a <8 x i32> held in a register.
; Expected lowering: a single vpermilps on ymm0.
define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) {
; CHECK-LABEL: test_8xi32_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4]
; CHECK-NEXT:    retq
  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
  ret <8 x i32> %res
}
; Merge-masked in-lane permute (1,3,1,0 in each 128-bit half) of a <8 x i32> register operand.
; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
; Expected lowering: vptestnmd into %k1, masked vpshufd into ymm1, then a copy to ymm0.
define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4]
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
  ret <8 x i32> %res
}

; Zero-masked in-lane permute (1,3,1,0 in each 128-bit half) of a <8 x i32> register operand.
; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
; Expected lowering: vptestnmd into %k1, then one zero-masked vpshufd.
define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
  ret <8 x i32> %res
}
; Unmasked in-lane permute (1,0,2,0 in each 128-bit half) of a <8 x i32> loaded from %vp.
; Expected lowering: a single vpermilps with the load folded as its memory operand.
define <8 x i32> @test_8xi32_perm_mem_mask0(<8 x i32>* %vp) {
; CHECK-LABEL: test_8xi32_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = mem[1,0,2,0,5,4,6,4]
; CHECK-NEXT:    retq
  %vec = load <8 x i32>, <8 x i32>* %vp
  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
  ret <8 x i32> %res
}
; Merge-masked in-lane permute (1,0,2,0 in each 128-bit half) of a <8 x i32> loaded from %vp.
; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
; Expected lowering: vptestnmd into %k1, then one merge-masked vpshufd with the load folded.
define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4]
; CHECK-NEXT:    retq
  %vec = load <8 x i32>, <8 x i32>* %vp
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
  ret <8 x i32> %res
}

; Zero-masked in-lane permute (1,0,2,0 in each 128-bit half) of a <8 x i32> loaded from %vp.
; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
; Expected lowering: vptestnmd into %k1, then one zero-masked vpshufd with the load folded.
define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4]
; CHECK-NEXT:    retq
  %vec = load <8 x i32>, <8 x i32>* %vp
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
  ret <8 x i32> %res
}

; Merge-masked in-lane permute (0,3,2,0 in each 128-bit half) of a <8 x i32> loaded from %vp.
; Elements whose %mask entry equals zero take the shuffled value; all others keep %vec2.
; Expected lowering: vptestnmd into %k1, then one merge-masked vpshufd with the load folded.
define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4]
; CHECK-NEXT:    retq
  %vec = load <8 x i32>, <8 x i32>* %vp
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
  ret <8 x i32> %res
}

; Zero-masked in-lane permute (0,3,2,0 in each 128-bit half) of a <8 x i32> loaded from %vp.
; Elements whose %mask entry equals zero take the shuffled value; all others become zero.
; Expected lowering: vptestnmd into %k1, then one zero-masked vpshufd with the load folded.
define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4]
; CHECK-NEXT:    retq
  %vec = load <8 x i32>, <8 x i32>* %vp
  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
  ret <8 x i32> %res
}

; Merge-masked <8 x i32> shuffle whose source vector is loaded from %vp.
; Each group of four i32 lanes is permuted with the in-group pattern [3,2,3,1].
; Lanes whose %mask element is zero receive the shuffled value; every other
; lane keeps the corresponding element of %vec2.
; The FileCheck assertions expect vptestnmd to produce %k1, followed by one
; merge-masking vpshufd from memory into ymm0 (no separate load or move).
2498define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
2499; CHECK-LABEL: test_masked_8xi32_perm_mem_mask2:
2500; CHECK:       # %bb.0:
2501; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
2502; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5]
2503; CHECK-NEXT:    retq
2504  %vec = load <8 x i32>, <8 x i32>* %vp
2505  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 1, i32 7, i32 6, i32 7, i32 5>
; Select condition: true for lanes whose %mask element equals zero.
2506  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
2507  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
2508  ret <8 x i32> %res
2509}
2510
; Zero-masked <8 x i32> shuffle whose source vector is loaded from %vp.
; Each group of four i32 lanes is permuted with the in-group pattern [3,2,3,1].
; Lanes whose %mask element is zero receive the shuffled value; every other
; lane is zero.
; The FileCheck assertions expect vptestnmd to produce %k1, followed by one
; zero-masking vpshufd that reads its source directly from memory.
2511define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %mask) {
2512; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask2:
2513; CHECK:       # %bb.0:
2514; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
2515; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5]
2516; CHECK-NEXT:    retq
2517  %vec = load <8 x i32>, <8 x i32>* %vp
2518  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 1, i32 7, i32 6, i32 7, i32 5>
; Select condition: true for lanes whose %mask element equals zero.
2519  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
2520  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
2521  ret <8 x i32> %res
2522}
2523
; Unmasked <8 x i32> shuffle whose source vector is loaded from %vp.
; Each group of four i32 lanes is permuted with the in-group pattern [3,2,0,0].
; The FileCheck assertions expect a single vpermilps with a memory source
; operand (not vpshufd, which the masked variants of this pattern expect).
2524define <8 x i32> @test_8xi32_perm_mem_mask3(<8 x i32>* %vp) {
2525; CHECK-LABEL: test_8xi32_perm_mem_mask3:
2526; CHECK:       # %bb.0:
2527; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = mem[3,2,0,0,7,6,4,4]
2528; CHECK-NEXT:    retq
2529  %vec = load <8 x i32>, <8 x i32>* %vp
2530  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
2531  ret <8 x i32> %res
2532}
; Merge-masked <8 x i32> shuffle whose source vector is loaded from %vp.
; Each group of four i32 lanes is permuted with the in-group pattern [3,2,0,0].
; Lanes whose %mask element is zero receive the shuffled value; every other
; lane keeps the corresponding element of %vec2.
; The FileCheck assertions expect vptestnmd to produce %k1, followed by one
; merge-masking vpshufd from memory into ymm0 (no separate load or move).
2533define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
2534; CHECK-LABEL: test_masked_8xi32_perm_mem_mask3:
2535; CHECK:       # %bb.0:
2536; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
2537; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4]
2538; CHECK-NEXT:    retq
2539  %vec = load <8 x i32>, <8 x i32>* %vp
2540  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
; Select condition: true for lanes whose %mask element equals zero.
2541  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
2542  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
2543  ret <8 x i32> %res
2544}
2545
; Zero-masked <8 x i32> shuffle whose source vector is loaded from %vp.
; Each group of four i32 lanes is permuted with the in-group pattern [3,2,0,0].
; Lanes whose %mask element is zero receive the shuffled value; every other
; lane is zero.
; The FileCheck assertions expect vptestnmd to produce %k1, followed by one
; zero-masking vpshufd that reads its source directly from memory.
2546define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %mask) {
2547; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask3:
2548; CHECK:       # %bb.0:
2549; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
2550; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4]
2551; CHECK-NEXT:    retq
2552  %vec = load <8 x i32>, <8 x i32>* %vp
2553  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
; Select condition: true for lanes whose %mask element equals zero.
2554  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
2555  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
2556  ret <8 x i32> %res
2557}
2558
; Unmasked <16 x i32> shuffle of the register argument %vec.
; Each group of four i32 lanes is permuted with the in-group pattern [3,1,3,0].
; The FileCheck assertions expect a single in-place vpermilps on zmm0 (not
; vpshufd, which the masked variants of this pattern expect).
2559define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec) {
2560; CHECK-LABEL: test_16xi32_perm_mask0:
2561; CHECK:       # %bb.0:
2562; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12]
2563; CHECK-NEXT:    retq
2564  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
2565  ret <16 x i32> %res
2566}
; Merge-masked <16 x i32> shuffle of the register argument %vec.
; Each group of four i32 lanes is permuted with the in-group pattern [3,1,3,0].
; Lanes whose %mask element is zero receive the shuffled value; every other
; lane keeps the corresponding element of %vec2.
; The FileCheck assertions expect vptestnmd to produce %k1, a merge-masking
; vpshufd from zmm0 into zmm1, and then a vmovdqa64 copy of zmm1 into zmm0.
2567define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
2568; CHECK-LABEL: test_masked_16xi32_perm_mask0:
2569; CHECK:       # %bb.0:
2570; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
2571; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12]
2572; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
2573; CHECK-NEXT:    retq
2574  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
; Select condition: true for lanes whose %mask element equals zero.
2575  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
2576  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
2577  ret <16 x i32> %res
2578}
2579
; Zero-masked <16 x i32> shuffle of the register argument %vec.
; Each group of four i32 lanes is permuted with the in-group pattern [3,1,3,0].
; Lanes whose %mask element is zero receive the shuffled value; every other
; lane is zero.
; The FileCheck assertions expect vptestnmd to produce %k1, followed by one
; zero-masking vpshufd performed in place on zmm0.
2580define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
2581; CHECK-LABEL: test_masked_z_16xi32_perm_mask0:
2582; CHECK:       # %bb.0:
2583; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
2584; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12]
2585; CHECK-NEXT:    retq
2586  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
; Select condition: true for lanes whose %mask element equals zero.
2587  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
2588  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
2589  ret <16 x i32> %res
2590}
; Merge-masked <16 x i32> shuffle of the register argument %vec.
; Each group of four i32 lanes is permuted with the in-group pattern [2,0,3,0].
; Lanes whose %mask element is zero receive the shuffled value; every other
; lane keeps the corresponding element of %vec2.
; The FileCheck assertions expect vptestnmd to produce %k1, a merge-masking
; vpshufd from zmm0 into zmm1, and then a vmovdqa64 copy of zmm1 into zmm0.
2591define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
2592; CHECK-LABEL: test_masked_16xi32_perm_mask1:
2593; CHECK:       # %bb.0:
2594; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
2595; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12]
2596; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
2597; CHECK-NEXT:    retq
2598  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
; Select condition: true for lanes whose %mask element equals zero.
2599  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
2600  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
2601  ret <16 x i32> %res
2602}
2603
; Zero-masked <16 x i32> shuffle of the register argument %vec.
; Each group of four i32 lanes is permuted with the in-group pattern [2,0,3,0].
; Lanes whose %mask element is zero receive the shuffled value; every other
; lane is zero.
; The FileCheck assertions expect vptestnmd to produce %k1, followed by one
; zero-masking vpshufd performed in place on zmm0.
2604define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) {
2605; CHECK-LABEL: test_masked_z_16xi32_perm_mask1:
2606; CHECK:       # %bb.0:
2607; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
2608; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12]
2609; CHECK-NEXT:    retq
2610  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
; Select condition: true for lanes whose %mask element equals zero.
2611  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
2612  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
2613  ret <16 x i32> %res
2614}
; Merge-masked <16 x i32> shuffle of the register argument %vec.
; Each group of four i32 lanes is permuted with the in-group pattern [1,3,3,0].
; Lanes whose %mask element is zero receive the shuffled value; every other
; lane keeps the corresponding element of %vec2.
; The FileCheck assertions expect vptestnmd to produce %k1, a merge-masking
; vpshufd from zmm0 into zmm1, and then a vmovdqa64 copy of zmm1 into zmm0.
2615define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
2616; CHECK-LABEL: test_masked_16xi32_perm_mask2:
2617; CHECK:       # %bb.0:
2618; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
2619; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12]
2620; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
2621; CHECK-NEXT:    retq
2622  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
; Select condition: true for lanes whose %mask element equals zero.
2623  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
2624  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
2625  ret <16 x i32> %res
2626}
2627
; Zero-masked <16 x i32> shuffle of the register argument %vec.
; Each group of four i32 lanes is permuted with the in-group pattern [1,3,3,0].
; Lanes whose %mask element is zero receive the shuffled value; every other
; lane is zero.
; The FileCheck assertions expect vptestnmd to produce %k1, followed by one
; zero-masking vpshufd performed in place on zmm0.
2628define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) {
2629; CHECK-LABEL: test_masked_z_16xi32_perm_mask2:
2630; CHECK:       # %bb.0:
2631; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
2632; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12]
2633; CHECK-NEXT:    retq
2634  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
; Select condition: true for lanes whose %mask element equals zero.
2635  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
2636  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
2637  ret <16 x i32> %res
2638}
; Unmasked <16 x i32> shuffle of the register argument %vec.
; Each group of four i32 lanes is permuted with the in-group pattern [3,2,0,3].
; The FileCheck assertions expect a single in-place vpermilps on zmm0 (not
; vpshufd, which the masked variants of this pattern expect).
2639define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) {
2640; CHECK-LABEL: test_16xi32_perm_mask3:
2641; CHECK:       # %bb.0:
2642; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15]
2643; CHECK-NEXT:    retq
2644  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
2645  ret <16 x i32> %res
2646}
; Merge-masked <16 x i32> shuffle of the register argument %vec.
; Each group of four i32 lanes is permuted with the in-group pattern [3,2,0,3].
; Lanes whose %mask element is zero receive the shuffled value; every other
; lane keeps the corresponding element of %vec2.
; The FileCheck assertions expect vptestnmd to produce %k1, a merge-masking
; vpshufd from zmm0 into zmm1, and then a vmovdqa64 copy of zmm1 into zmm0.
2647define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
2648; CHECK-LABEL: test_masked_16xi32_perm_mask3:
2649; CHECK:       # %bb.0:
2650; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
2651; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15]
2652; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
2653; CHECK-NEXT:    retq
2654  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
; Select condition: true for lanes whose %mask element equals zero.
2655  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
2656  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
2657  ret <16 x i32> %res
2658}
2659
; Zero-masked <16 x i32> shuffle of the register argument %vec.
; Each group of four i32 lanes is permuted with the in-group pattern [3,2,0,3].
; Lanes whose %mask element is zero receive the shuffled value; every other
; lane is zero.
; The FileCheck assertions expect vptestnmd to produce %k1, followed by one
; zero-masking vpshufd performed in place on zmm0.
2660define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) {
2661; CHECK-LABEL: test_masked_z_16xi32_perm_mask3:
2662; CHECK:       # %bb.0:
2663; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
2664; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15]
2665; CHECK-NEXT:    retq
2666  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
; Select condition: true for lanes whose %mask element equals zero.
2667  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
2668  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
2669  ret <16 x i32> %res
2670}
; Unmasked <16 x i32> shuffle whose source vector is loaded from %vp.
; Each group of four i32 lanes is permuted with the in-group pattern [1,0,1,3].
; The FileCheck assertions expect a single vpermilps with a memory source
; operand (not vpshufd, which the masked variants of this pattern expect).
2671define <16 x i32> @test_16xi32_perm_mem_mask0(<16 x i32>* %vp) {
2672; CHECK-LABEL: test_16xi32_perm_mem_mask0:
2673; CHECK:       # %bb.0:
2674; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15]
2675; CHECK-NEXT:    retq
2676  %vec = load <16 x i32>, <16 x i32>* %vp
2677  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
2678  ret <16 x i32> %res
2679}
; Merge-masked <16 x i32> shuffle whose source vector is loaded from %vp.
; Each group of four i32 lanes is permuted with the in-group pattern [1,0,1,3].
; Lanes whose %mask element is zero receive the shuffled value; every other
; lane keeps the corresponding element of %vec2.
; The FileCheck assertions expect vptestnmd to produce %k1, followed by one
; merge-masking vpshufd from memory into zmm0 (no separate load or move).
2680define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
2681; CHECK-LABEL: test_masked_16xi32_perm_mem_mask0:
2682; CHECK:       # %bb.0:
2683; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
2684; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15]
2685; CHECK-NEXT:    retq
2686  %vec = load <16 x i32>, <16 x i32>* %vp
2687  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
; Select condition: true for lanes whose %mask element equals zero.
2688  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
2689  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
2690  ret <16 x i32> %res
2691}
2692
; Zero-masked <16 x i32> shuffle whose source vector is loaded from %vp.
; Each group of four i32 lanes is permuted with the in-group pattern [1,0,1,3].
; Lanes whose %mask element is zero receive the shuffled value; every other
; lane is zero.
; The FileCheck assertions expect vptestnmd to produce %k1, followed by one
; zero-masking vpshufd that reads its source directly from memory.
2693define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) {
2694; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask0:
2695; CHECK:       # %bb.0:
2696; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
2697; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15]
2698; CHECK-NEXT:    retq
2699  %vec = load <16 x i32>, <16 x i32>* %vp
2700  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
; Select condition: true for lanes whose %mask element equals zero.
2701  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
2702  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
2703  ret <16 x i32> %res
2704}
2705
; Merge-masked <16 x i32> shuffle whose source vector is loaded from %vp.
; Each group of four i32 lanes is permuted with the in-group pattern [1,0,0,2].
; Lanes whose %mask element is zero receive the shuffled value; every other
; lane keeps the corresponding element of %vec2.
; The FileCheck assertions expect vptestnmd to produce %k1, followed by one
; merge-masking vpshufd from memory into zmm0 (no separate load or move).
2706define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
2707; CHECK-LABEL: test_masked_16xi32_perm_mem_mask1:
2708; CHECK:       # %bb.0:
2709; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
2710; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14]
2711; CHECK-NEXT:    retq
2712  %vec = load <16 x i32>, <16 x i32>* %vp
2713  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 2, i32 5, i32 4, i32 4, i32 6, i32 9, i32 8, i32 8, i32 10, i32 13, i32 12, i32 12, i32 14>
; Select condition: true for lanes whose %mask element equals zero.
2714  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
2715  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
2716  ret <16 x i32> %res
2717}
2718
; Zero-masked <16 x i32> shuffle whose source vector is loaded from %vp.
; Each group of four i32 lanes is permuted with the in-group pattern [1,0,0,2].
; Lanes whose %mask element is zero receive the shuffled value; every other
; lane is zero.
; The FileCheck assertions expect vptestnmd to produce %k1, followed by one
; zero-masking vpshufd that reads its source directly from memory.
2719define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) {
2720; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask1:
2721; CHECK:       # %bb.0:
2722; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
2723; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14]
2724; CHECK-NEXT:    retq
2725  %vec = load <16 x i32>, <16 x i32>* %vp
2726  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 2, i32 5, i32 4, i32 4, i32 6, i32 9, i32 8, i32 8, i32 10, i32 13, i32 12, i32 12, i32 14>
; Select condition: true for lanes whose %mask element equals zero.
2727  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
2728  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
2729  ret <16 x i32> %res
2730}
2731
; Merge-masked <16 x i32> shuffle whose source vector is loaded from %vp.
; Each group of four i32 lanes is permuted with the in-group pattern [2,0,1,2].
; Lanes whose %mask element is zero receive the shuffled value; every other
; lane keeps the corresponding element of %vec2.
; The FileCheck assertions expect vptestnmd to produce %k1, followed by one
; merge-masking vpshufd from memory into zmm0 (no separate load or move).
2732define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
2733; CHECK-LABEL: test_masked_16xi32_perm_mem_mask2:
2734; CHECK:       # %bb.0:
2735; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
2736; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14]
2737; CHECK-NEXT:    retq
2738  %vec = load <16 x i32>, <16 x i32>* %vp
2739  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 1, i32 2, i32 6, i32 4, i32 5, i32 6, i32 10, i32 8, i32 9, i32 10, i32 14, i32 12, i32 13, i32 14>
; Select condition: true for lanes whose %mask element equals zero.
2740  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
2741  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
2742  ret <16 x i32> %res
2743}
2744
; Zero-masked <16 x i32> shuffle whose source vector is loaded from %vp.
; Each group of four i32 lanes is permuted with the in-group pattern [2,0,1,2].
; Lanes whose %mask element is zero receive the shuffled value; every other
; lane is zero.
; The FileCheck assertions expect vptestnmd to produce %k1, followed by one
; zero-masking vpshufd that reads its source directly from memory.
2745define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) {
2746; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask2:
2747; CHECK:       # %bb.0:
2748; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
2749; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14]
2750; CHECK-NEXT:    retq
2751  %vec = load <16 x i32>, <16 x i32>* %vp
2752  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 1, i32 2, i32 6, i32 4, i32 5, i32 6, i32 10, i32 8, i32 9, i32 10, i32 14, i32 12, i32 13, i32 14>
; Select condition: true for lanes whose %mask element equals zero.
2753  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
2754  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
2755  ret <16 x i32> %res
2756}
2757
; Unmasked <16 x i32> shuffle whose source vector is loaded from %vp.
; Each group of four i32 lanes is permuted with the in-group pattern [3,1,1,1].
; The FileCheck assertions expect a single vpermilps with a memory source
; operand (not vpshufd, which the masked variants of this pattern expect).
2758define <16 x i32> @test_16xi32_perm_mem_mask3(<16 x i32>* %vp) {
2759; CHECK-LABEL: test_16xi32_perm_mem_mask3:
2760; CHECK:       # %bb.0:
2761; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13]
2762; CHECK-NEXT:    retq
2763  %vec = load <16 x i32>, <16 x i32>* %vp
2764  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
2765  ret <16 x i32> %res
2766}
; Merge-masked <16 x i32> shuffle whose source vector is loaded from %vp.
; Each group of four i32 lanes is permuted with the in-group pattern [3,1,1,1].
; Lanes whose %mask element is zero receive the shuffled value; every other
; lane keeps the corresponding element of %vec2.
; The FileCheck assertions expect vptestnmd to produce %k1, followed by one
; merge-masking vpshufd from memory into zmm0 (no separate load or move).
2767define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
2768; CHECK-LABEL: test_masked_16xi32_perm_mem_mask3:
2769; CHECK:       # %bb.0:
2770; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
2771; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13]
2772; CHECK-NEXT:    retq
2773  %vec = load <16 x i32>, <16 x i32>* %vp
2774  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
; Select condition: true for lanes whose %mask element equals zero.
2775  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
2776  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
2777  ret <16 x i32> %res
2778}
2779
; Zero-masked <16 x i32> shuffle whose source vector is loaded from %vp.
; Each group of four i32 lanes is permuted with the in-group pattern [3,1,1,1].
; Lanes whose %mask element is zero receive the shuffled value; every other
; lane is zero.
; The FileCheck assertions expect vptestnmd to produce %k1, followed by one
; zero-masking vpshufd that reads its source directly from memory.
2780define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) {
2781; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask3:
2782; CHECK:       # %bb.0:
2783; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
2784; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13]
2785; CHECK-NEXT:    retq
2786  %vec = load <16 x i32>, <16 x i32>* %vp
2787  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
; Select condition: true for lanes whose %mask element equals zero.
2788  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
2789  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
2790  ret <16 x i32> %res
2791}
2792
2793