1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl %s -o - | FileCheck %s
3
4; FIXME: 128-bit shuffles of 256-bit vectors cases should be fixed by PR34359
5
; Unmasked <8 x float> shuffle: the upper 128-bit half of %vec1 (elements 4-7)
; followed by the lower half of %vec2 (elements 0-3). Expected to lower to a
; single vperm2f128.
define <8 x float> @test_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2) {
; CHECK-LABEL: test_8xfloat_shuff_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT:    retq
  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %res
}
; Merge-masked <8 x float> shuffle (upper half of %vec1, lower half of %vec2):
; lanes where %mask is ordered-equal to zero take the shuffle, the rest keep
; %vec3. Expected to fold into one masked vshuff32x4 plus a register copy.
define <8 x float> @test_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqps %ymm4, %ymm3, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
  ret <8 x float> %res
}

; Zero-masked <8 x float> shuffle (upper half of %vec1, lower half of %vec2):
; lanes where %mask is not ordered-equal to zero are zeroed. Expected to fold
; into one vshuff32x4 with {z}.
define <8 x float> @test_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
  ret <8 x float> %res
}
; Merge-masked <8 x float> shuffle taking the upper half of %vec1 and the lower
; half of %vec2; lanes where %mask is not ordered-equal to zero keep %vec3.
; Expected to fold into one masked vshuff32x4 plus a register copy.
define <8 x float> @test_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqps %ymm4, %ymm3, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
  ret <8 x float> %res
}

; Zero-masked <8 x float> shuffle taking the upper half of %vec1 and the lower
; half of %vec2; lanes where %mask is not ordered-equal to zero are zeroed.
; Expected to fold into one vshuff32x4 with {z}.
define <8 x float> @test_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
  ret <8 x float> %res
}
; Merge-masked <8 x float> shuffle taking the upper 128-bit halves of both
; %vec1 and %vec2; lanes where %mask is not ordered-equal to zero keep %vec3.
; Expected to fold into one masked vshuff32x4 plus a register copy.
define <8 x float> @test_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqps %ymm4, %ymm3, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
  ret <8 x float> %res
}

; Zero-masked <8 x float> shuffle taking the upper 128-bit halves of both
; %vec1 and %vec2; lanes where %mask is not ordered-equal to zero are zeroed.
; Expected to fold into one vshuff32x4 with {z}.
define <8 x float> @test_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
  ret <8 x float> %res
}
; Unmasked <8 x float> shuffle taking the upper 128-bit half of %vec1 and the
; lower half of %vec2. Expected to lower to a single vperm2f128.
define <8 x float> @test_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2) {
; CHECK-LABEL: test_8xfloat_shuff_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT:    retq
  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %res
}
; Merge-masked <8 x float> shuffle (upper half of %vec1, lower half of %vec2);
; the select on `fcmp oeq %mask, 0` picks the shuffle or %vec3 per lane.
; Expected to fold into one masked vshuff32x4 plus a register copy.
define <8 x float> @test_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqps %ymm4, %ymm3, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
  ret <8 x float> %res
}

; Zero-masked <8 x float> shuffle (upper half of %vec1, lower half of %vec2);
; the select on `fcmp oeq %mask, 0` picks the shuffle or zero per lane.
; Expected to fold into one vshuff32x4 with {z}.
define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
  ret <8 x float> %res
}
; Unmasked <8 x float> shuffle with %vec2 loaded from %vec2p: upper 128-bit
; halves of %vec1 and of the loaded vector. Expected to lower to a single
; vperm2f128 with the load folded as its memory operand.
define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
; CHECK-LABEL: test_8xfloat_shuff_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT:    retq
  %vec2 = load <8 x float>, <8 x float>* %vec2p
  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x float> %res
}
; Merge-masked <8 x float> shuffle with %vec2 loaded from %vec2p (upper halves
; of %vec1 and the loaded vector); lanes where %mask is not ordered-equal to
; zero keep %vec3. Expected: masked vshuff32x4 with a folded memory operand.
define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec2 = load <8 x float>, <8 x float>* %vec2p
  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
  ret <8 x float> %res
}

; Zero-masked <8 x float> shuffle with %vec2 loaded from %vec2p (upper halves
; of %vec1 and the loaded vector); lanes where %mask is not ordered-equal to
; zero are zeroed. Expected: vshuff32x4 {z} with a folded memory operand.
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
; CHECK-NEXT:    retq
  %vec2 = load <8 x float>, <8 x float>* %vec2p
  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
  ret <8 x float> %res
}

; Merge-masked <8 x float> shuffle of the upper halves of %vec1 and of the
; vector loaded from %vec2p; the select on `fcmp oeq %mask, 0` picks the
; shuffle or %vec3 per lane. Expected: masked vshuff32x4 with a folded load.
define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec2 = load <8 x float>, <8 x float>* %vec2p
  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
  ret <8 x float> %res
}

; Zero-masked <8 x float> shuffle of the upper halves of %vec1 and of the
; vector loaded from %vec2p; the select on `fcmp oeq %mask, 0` picks the
; shuffle or zero per lane. Expected: vshuff32x4 {z} with a folded load.
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
; CHECK-NEXT:    retq
  %vec2 = load <8 x float>, <8 x float>* %vec2p
  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
  ret <8 x float> %res
}

; Merge-masked <8 x float> shuffle with %vec2 loaded from %vec2p: upper half of
; %vec1, lower half of the loaded vector; lanes where %mask is not
; ordered-equal to zero keep %vec3. Expected: masked vshuff32x4, folded load.
define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec2 = load <8 x float>, <8 x float>* %vec2p
  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
  ret <8 x float> %res
}

; Zero-masked <8 x float> shuffle with %vec2 loaded from %vec2p: upper half of
; %vec1, lower half of the loaded vector; lanes where %mask is not
; ordered-equal to zero are zeroed. Expected: vshuff32x4 {z}, folded load.
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT:    retq
  %vec2 = load <8 x float>, <8 x float>* %vec2p
  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
  ret <8 x float> %res
}

; Unmasked <8 x float> shuffle with %vec2 loaded from %vec2p: upper half of
; %vec1, lower half of the loaded vector. Expected to lower to a single
; vperm2f128 with the load folded as its memory operand.
define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
; CHECK-LABEL: test_8xfloat_shuff_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT:    retq
  %vec2 = load <8 x float>, <8 x float>* %vec2p
  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %res
}
; Merge-masked <8 x float> shuffle of the upper half of %vec1 and the lower
; half of the vector loaded from %vec2p; the select on `fcmp oeq %mask, 0`
; picks the shuffle or %vec3 per lane. Expected: masked vshuff32x4, folded load.
define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec2 = load <8 x float>, <8 x float>* %vec2p
  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
  ret <8 x float> %res
}

; Zero-masked <8 x float> shuffle of the upper half of %vec1 and the lower
; half of the vector loaded from %vec2p; the select on `fcmp oeq %mask, 0`
; picks the shuffle or zero per lane. Expected: vshuff32x4 {z}, folded load.
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT:    retq
  %vec2 = load <8 x float>, <8 x float>* %vec2p
  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
  ret <8 x float> %res
}

; Unmasked <16 x float> shuffle by 128-bit lanes: lanes 3 and 0 of %vec1, then
; lanes 1 and 3 of %vec2. Expected to lower to a single vshuff64x2.
define <16 x float> @test_16xfloat_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2) {
; CHECK-LABEL: test_16xfloat_shuff_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],zmm1[2,3,6,7]
; CHECK-NEXT:    retq
  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
  ret <16 x float> %res
}
; Merge-masked <16 x float> lane shuffle (lanes 3,0 of %vec1; lanes 1,3 of
; %vec2): elements where %mask is not ordered-equal to zero keep %vec3.
; Expected to fold into one masked vshuff32x4 plus a register copy.
define <16 x float> @test_16xfloat_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
; CHECK-LABEL: test_16xfloat_masked_shuff_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqps %zmm4, %zmm3, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15]
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
  ret <16 x float> %res
}

; Zero-masked <16 x float> lane shuffle (lanes 3,0 of %vec1; lanes 1,3 of
; %vec2): elements where %mask is not ordered-equal to zero are zeroed.
; Expected to fold into one vshuff32x4 with {z}.
define <16 x float> @test_16xfloat_zero_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15]
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
  ret <16 x float> %res
}
; Merge-masked <16 x float> lane shuffle (lanes 0,2 of %vec1; lanes 0,3 of
; %vec2): elements where %mask is not ordered-equal to zero keep %vec3.
; Expected to fold into one masked vshuff32x4 plus a register copy.
define <16 x float> @test_16xfloat_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
; CHECK-LABEL: test_16xfloat_masked_shuff_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqps %zmm4, %zmm3, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15]
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
  ret <16 x float> %res
}

; Zero-masked <16 x float> lane shuffle (lanes 0,2 of %vec1; lanes 0,3 of
; %vec2): elements where %mask is not ordered-equal to zero are zeroed.
; Expected to fold into one vshuff32x4 with {z}.
define <16 x float> @test_16xfloat_zero_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15]
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
  ret <16 x float> %res
}
; Merge-masked <16 x float> lane shuffle (lanes 3,1 of %vec1; lanes 0,1 of
; %vec2): elements where %mask is not ordered-equal to zero keep %vec3.
; Expected to fold into one masked vshuff32x4 plus a register copy.
define <16 x float> @test_16xfloat_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
; CHECK-LABEL: test_16xfloat_masked_shuff_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqps %zmm4, %zmm3, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7]
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
  ret <16 x float> %res
}

; Zero-masked <16 x float> lane shuffle (lanes 3,1 of %vec1; lanes 0,1 of
; %vec2): elements where %mask is not ordered-equal to zero are zeroed.
; Expected to fold into one vshuff32x4 with {z}.
define <16 x float> @test_16xfloat_zero_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7]
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
  ret <16 x float> %res
}
; Unmasked <16 x float> shuffle by 128-bit lanes: lanes 2 and 3 of %vec1, then
; lanes 0 and 2 of %vec2. Expected to lower to a single vshuff64x2.
define <16 x float> @test_16xfloat_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2) {
; CHECK-LABEL: test_16xfloat_shuff_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7],zmm1[0,1,4,5]
; CHECK-NEXT:    retq
  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
  ret <16 x float> %res
}
; Merge-masked <16 x float> lane shuffle (lanes 2,3 of %vec1; lanes 0,2 of
; %vec2): elements where %mask is not ordered-equal to zero keep %vec3.
; Expected to fold into one masked vshuff32x4 plus a register copy.
define <16 x float> @test_16xfloat_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
; CHECK-LABEL: test_16xfloat_masked_shuff_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqps %zmm4, %zmm3, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11]
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
  ret <16 x float> %res
}

; Zero-masked <16 x float> lane shuffle (lanes 2,3 of %vec1; lanes 0,2 of
; %vec2): elements where %mask is not ordered-equal to zero are zeroed.
; Expected to fold into one vshuff32x4 with {z}.
define <16 x float> @test_16xfloat_zero_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11]
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
  ret <16 x float> %res
}
; Unmasked <16 x float> lane shuffle with %vec2 loaded from %vec2p: lanes 3,2
; of %vec1, then lanes 2,1 of the loaded vector. Expected to lower to a single
; vshuff64x2 with the load folded as its memory operand.
define <16 x float> @test_16xfloat_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
; CHECK-LABEL: test_16xfloat_shuff_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5],mem[4,5,2,3]
; CHECK-NEXT:    retq
  %vec2 = load <16 x float>, <16 x float>* %vec2p
  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
  ret <16 x float> %res
}
; Merge-masked <16 x float> lane shuffle with %vec2 loaded from %vec2p (lanes
; 3,2 of %vec1; lanes 2,1 of the loaded vector); elements where %mask is not
; ordered-equal to zero keep %vec3. Expected: masked vshuff32x4, folded load.
define <16 x float> @test_16xfloat_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7]
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %vec2 = load <16 x float>, <16 x float>* %vec2p
  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
  ret <16 x float> %res
}

; Zero-masked <16 x float> lane shuffle with %vec2 loaded from %vec2p (lanes
; 3,2 of %vec1; lanes 2,1 of the loaded vector); elements where %mask is not
; ordered-equal to zero are zeroed. Expected: vshuff32x4 {z}, folded load.
define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7]
; CHECK-NEXT:    retq
  %vec2 = load <16 x float>, <16 x float>* %vec2p
  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
  ret <16 x float> %res
}

; Merge-masked <16 x float> lane shuffle with %vec2 loaded from %vec2p (lanes
; 2,1 of %vec1; lanes 2,1 of the loaded vector); elements where %mask is not
; ordered-equal to zero keep %vec3. Expected: masked vshuff32x4, folded load.
define <16 x float> @test_16xfloat_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7]
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %vec2 = load <16 x float>, <16 x float>* %vec2p
  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
  ret <16 x float> %res
}

; Zero-masked <16 x float> lane shuffle with %vec2 loaded from %vec2p (lanes
; 2,1 of %vec1; lanes 2,1 of the loaded vector); elements where %mask is not
; ordered-equal to zero are zeroed. Expected: vshuff32x4 {z}, folded load.
define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7]
; CHECK-NEXT:    retq
  %vec2 = load <16 x float>, <16 x float>* %vec2p
  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
  ret <16 x float> %res
}

; Merge-masked <16 x float> lane shuffle with %vec2 loaded from %vec2p (lane 0
; of %vec1 twice; lane 2 of the loaded vector twice); elements where %mask is
; not ordered-equal to zero keep %vec3. Expected: masked vshuff32x4, folded load.
define <16 x float> @test_16xfloat_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11]
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %vec2 = load <16 x float>, <16 x float>* %vec2p
  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 24, i32 25, i32 26, i32 27>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
  ret <16 x float> %res
}

; Zero-masked <16 x float> lane shuffle with %vec2 loaded from %vec2p (lane 0
; of %vec1 twice; lane 2 of the loaded vector twice); elements where %mask is
; not ordered-equal to zero are zeroed. Expected: vshuff32x4 {z}, folded load.
define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11]
; CHECK-NEXT:    retq
  %vec2 = load <16 x float>, <16 x float>* %vec2p
  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 24, i32 25, i32 26, i32 27>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
  ret <16 x float> %res
}

; Unmasked <16 x float> lane shuffle with %vec2 loaded from %vec2p: lanes 1,0
; of %vec1, then lane 3 of the loaded vector twice. Expected to lower to a
; single vshuff64x2 with the load folded as its memory operand.
define <16 x float> @test_16xfloat_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
; CHECK-LABEL: test_16xfloat_shuff_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,6,7]
; CHECK-NEXT:    retq
  %vec2 = load <16 x float>, <16 x float>* %vec2p
  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
  ret <16 x float> %res
}
; Merge-masked <16 x float> lane shuffle with %vec2 loaded from %vec2p (lanes
; 1,0 of %vec1; lane 3 of the loaded vector twice); elements where %mask is
; not ordered-equal to zero keep %vec3. Expected: masked vshuff32x4, folded load.
define <16 x float> @test_16xfloat_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15]
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %vec2 = load <16 x float>, <16 x float>* %vec2p
  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
  ret <16 x float> %res
}

; Zero-masked <16 x float> lane shuffle with %vec2 loaded from %vec2p (lanes
; 1,0 of %vec1; lane 3 of the loaded vector twice); elements where %mask is
; not ordered-equal to zero are zeroed. Expected: vshuff32x4 {z}, folded load.
define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15]
; CHECK-NEXT:    retq
  %vec2 = load <16 x float>, <16 x float>* %vec2p
  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
  ret <16 x float> %res
}

; Unmasked <4 x double> shuffle: the upper 128-bit half of %vec1 (elements 2-3)
; followed by the lower half of %vec2 (elements 0-1). Expected to lower to a
; single vperm2f128.
define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2) {
; CHECK-LABEL: test_4xdouble_shuff_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT:    retq
  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x double> %res
}
; Merge-masked <4 x double> shuffle (upper half of %vec1, lower half of %vec2):
; lanes where %mask is ordered-equal to zero take the shuffle, the rest keep
; %vec3. Expected to fold into one masked vshuff64x2 plus a register copy.
define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm3, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT:    vmovapd %ymm2, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
  ret <4 x double> %res
}

; Zero-masked <4 x double> shuffle (upper half of %vec1, lower half of %vec2):
; lanes where %mask is not ordered-equal to zero are zeroed. Expected to fold
; into one vshuff64x2 with {z}.
define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}
; Merge-masked <4 x double> shuffle taking the upper half of %vec1 and the
; lower half of %vec2; lanes where %mask is not ordered-equal to zero keep
; %vec3. Expected to fold into one masked vshuff64x2 plus a register copy.
define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm3, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT:    vmovapd %ymm2, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
  ret <4 x double> %res
}

; Shuffle (%vec1[2,3], %vec2[0,1]) with zeroing: lanes where %mask compares
; oeq to 0.0 take the shuffled value, the others become zero.
define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT:    retq
  %shuffled = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %lane_sel = fcmp oeq <4 x double> %mask, zeroinitializer
  %blend = select <4 x i1> %lane_sel, <4 x double> %shuffled, <4 x double> zeroinitializer
  ret <4 x double> %blend
}
; Shuffle (%vec1[2,3], %vec2[2,3]) blended with %vec3: lanes where %mask
; compares oeq to 0.0 take the shuffled value, the others keep %vec3.
define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm3, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    vmovapd %ymm2, %ymm0
; CHECK-NEXT:    retq
  %shuffled = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %lane_sel = fcmp oeq <4 x double> %mask, zeroinitializer
  %blend = select <4 x i1> %lane_sel, <4 x double> %shuffled, <4 x double> %vec3
  ret <4 x double> %blend
}
587
; Shuffle (%vec1[2,3], %vec2[2,3]) with zeroing: lanes where %mask compares
; oeq to 0.0 take the shuffled value, the others become zero.
define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    retq
  %shuffled = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %lane_sel = fcmp oeq <4 x double> %mask, zeroinitializer
  %blend = select <4 x i1> %lane_sel, <4 x double> %shuffled, <4 x double> zeroinitializer
  ret <4 x double> %blend
}
; Unmasked shuffle of two <4 x double> registers: elements 2-3 of %vec1
; followed by elements 2-3 of %vec2.
define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2) {
; CHECK-LABEL: test_4xdouble_shuff_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    retq
  %result = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x double> %result
}
; Shuffle (%vec1[2,3], %vec2[2,3]) blended with %vec3: lanes where %mask
; compares oeq to 0.0 take the shuffled value, the others keep %vec3.
define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm3, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    vmovapd %ymm2, %ymm0
; CHECK-NEXT:    retq
  %shuffled = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %lane_sel = fcmp oeq <4 x double> %mask, zeroinitializer
  %blend = select <4 x i1> %lane_sel, <4 x double> %shuffled, <4 x double> %vec3
  ret <4 x double> %blend
}
621
; Shuffle (%vec1[2,3], %vec2[2,3]) with zeroing: lanes where %mask compares
; oeq to 0.0 take the shuffled value, the others become zero.
define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    retq
  %shuffled = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %lane_sel = fcmp oeq <4 x double> %mask, zeroinitializer
  %blend = select <4 x i1> %lane_sel, <4 x double> %shuffled, <4 x double> zeroinitializer
  ret <4 x double> %blend
}
; Unmasked shuffle whose second operand is loaded from %vec2p: elements 2-3
; of %vec1 followed by elements 2-3 of the loaded vector.
define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
; CHECK-LABEL: test_4xdouble_shuff_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT:    retq
  %rhs = load <4 x double>, <4 x double>* %vec2p
  %result = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x double> %result
}
; Shuffle (%vec1[2,3], loaded[2,3]) with the second operand loaded from
; %vec2p, blended with %vec3: lanes where %mask compares oeq to 0.0 take the
; shuffled value, the others keep %vec3.
define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
; CHECK-NEXT:    vmovapd %ymm1, %ymm0
; CHECK-NEXT:    retq
  %rhs = load <4 x double>, <4 x double>* %vec2p
  %shuffled = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %lane_sel = fcmp oeq <4 x double> %mask, zeroinitializer
  %blend = select <4 x i1> %lane_sel, <4 x double> %shuffled, <4 x double> %vec3
  ret <4 x double> %blend
}
657
; Shuffle (%vec1[2,3], loaded[2,3]) with the second operand loaded from
; %vec2p and zeroing: lanes where %mask compares oeq to 0.0 take the shuffled
; value, the others become zero.
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; CHECK-NEXT:    retq
  %rhs = load <4 x double>, <4 x double>* %vec2p
  %shuffled = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %lane_sel = fcmp oeq <4 x double> %mask, zeroinitializer
  %blend = select <4 x i1> %lane_sel, <4 x double> %shuffled, <4 x double> zeroinitializer
  ret <4 x double> %blend
}
671
; Shuffle (%vec1[2,3], loaded[0,1]) with the second operand loaded from
; %vec2p, blended with %vec3: lanes where %mask compares oeq to 0.0 take the
; shuffled value, the others keep %vec3.
define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
; CHECK-NEXT:    vmovapd %ymm1, %ymm0
; CHECK-NEXT:    retq
  %rhs = load <4 x double>, <4 x double>* %vec2p
  %shuffled = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %lane_sel = fcmp oeq <4 x double> %mask, zeroinitializer
  %blend = select <4 x i1> %lane_sel, <4 x double> %shuffled, <4 x double> %vec3
  ret <4 x double> %blend
}
686
; Shuffle (%vec1[2,3], loaded[0,1]) with the second operand loaded from
; %vec2p and zeroing: lanes where %mask compares oeq to 0.0 take the shuffled
; value, the others become zero.
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; CHECK-NEXT:    retq
  %rhs = load <4 x double>, <4 x double>* %vec2p
  %shuffled = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %lane_sel = fcmp oeq <4 x double> %mask, zeroinitializer
  %blend = select <4 x i1> %lane_sel, <4 x double> %shuffled, <4 x double> zeroinitializer
  ret <4 x double> %blend
}
700
; Shuffle (%vec1[2,3], loaded[0,1]) with the second operand loaded from
; %vec2p, blended with %vec3: lanes where %mask compares oeq to 0.0 take the
; shuffled value, the others keep %vec3.
define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
; CHECK-NEXT:    vmovapd %ymm1, %ymm0
; CHECK-NEXT:    retq
  %rhs = load <4 x double>, <4 x double>* %vec2p
  %shuffled = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %lane_sel = fcmp oeq <4 x double> %mask, zeroinitializer
  %blend = select <4 x i1> %lane_sel, <4 x double> %shuffled, <4 x double> %vec3
  ret <4 x double> %blend
}
715
; Shuffle (%vec1[2,3], loaded[0,1]) with the second operand loaded from
; %vec2p and zeroing: lanes where %mask compares oeq to 0.0 take the shuffled
; value, the others become zero.
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; CHECK-NEXT:    retq
  %rhs = load <4 x double>, <4 x double>* %vec2p
  %shuffled = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %lane_sel = fcmp oeq <4 x double> %mask, zeroinitializer
  %blend = select <4 x i1> %lane_sel, <4 x double> %shuffled, <4 x double> zeroinitializer
  ret <4 x double> %blend
}
729
; Unmasked shuffle whose second operand is loaded from %vec2p: elements 2-3
; of %vec1 followed by elements 2-3 of the loaded vector.
define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
; CHECK-LABEL: test_4xdouble_shuff_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT:    retq
  %rhs = load <4 x double>, <4 x double>* %vec2p
  %result = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x double> %result
}
; Shuffle (%vec1[2,3], loaded[2,3]) with the second operand loaded from
; %vec2p, blended with %vec3: lanes where %mask compares oeq to 0.0 take the
; shuffled value, the others keep %vec3.
define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
; CHECK-NEXT:    vmovapd %ymm1, %ymm0
; CHECK-NEXT:    retq
  %rhs = load <4 x double>, <4 x double>* %vec2p
  %shuffled = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %lane_sel = fcmp oeq <4 x double> %mask, zeroinitializer
  %blend = select <4 x i1> %lane_sel, <4 x double> %shuffled, <4 x double> %vec3
  ret <4 x double> %blend
}
753
; Shuffle (%vec1[2,3], loaded[2,3]) with the second operand loaded from
; %vec2p and zeroing: lanes where %mask compares oeq to 0.0 take the shuffled
; value, the others become zero.
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; CHECK-NEXT:    retq
  %rhs = load <4 x double>, <4 x double>* %vec2p
  %shuffled = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %lane_sel = fcmp oeq <4 x double> %mask, zeroinitializer
  %blend = select <4 x i1> %lane_sel, <4 x double> %shuffled, <4 x double> zeroinitializer
  ret <4 x double> %blend
}
767
; Unmasked shuffle of two <8 x double> registers: %vec1 elements [6,7,2,3]
; followed by %vec2 elements [6,7,0,1].
define <8 x double> @test_8xdouble_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2) {
; CHECK-LABEL: test_8xdouble_shuff_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,2,3],zmm1[6,7,0,1]
; CHECK-NEXT:    retq
  %result = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
  ret <8 x double> %result
}
; Shuffle (%vec1[6,7,2,3], %vec2[6,7,0,1]) blended with %vec3: lanes where
; %mask compares oeq to 0.0 take the shuffled value, the others keep %vec3.
define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
; CHECK-LABEL: test_8xdouble_masked_shuff_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm3, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1]
; CHECK-NEXT:    vmovapd %zmm2, %zmm0
; CHECK-NEXT:    retq
  %shuffled = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
  %lane_sel = fcmp oeq <8 x double> %mask, zeroinitializer
  %blend = select <8 x i1> %lane_sel, <8 x double> %shuffled, <8 x double> %vec3
  ret <8 x double> %blend
}
789
; Shuffle (%vec1[6,7,2,3], %vec2[6,7,0,1]) with zeroing: lanes where %mask
; compares oeq to 0.0 take the shuffled value, the others become zero.
define <8 x double> @test_8xdouble_zero_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1]
; CHECK-NEXT:    retq
  %shuffled = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
  %lane_sel = fcmp oeq <8 x double> %mask, zeroinitializer
  %blend = select <8 x i1> %lane_sel, <8 x double> %shuffled, <8 x double> zeroinitializer
  ret <8 x double> %blend
}
; Shuffle (%vec1[0,1,4,5], %vec2[0,1,4,5]) blended with %vec3: lanes where
; %mask compares oeq to 0.0 take the shuffled value, the others keep %vec3.
define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
; CHECK-LABEL: test_8xdouble_masked_shuff_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm3, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5]
; CHECK-NEXT:    vmovapd %zmm2, %zmm0
; CHECK-NEXT:    retq
  %shuffled = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
  %lane_sel = fcmp oeq <8 x double> %mask, zeroinitializer
  %blend = select <8 x i1> %lane_sel, <8 x double> %shuffled, <8 x double> %vec3
  ret <8 x double> %blend
}
815
; Shuffle (%vec1[0,1,4,5], %vec2[0,1,4,5]) with zeroing: lanes where %mask
; compares oeq to 0.0 take the shuffled value, the others become zero.
define <8 x double> @test_8xdouble_zero_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5]
; CHECK-NEXT:    retq
  %shuffled = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
  %lane_sel = fcmp oeq <8 x double> %mask, zeroinitializer
  %blend = select <8 x i1> %lane_sel, <8 x double> %shuffled, <8 x double> zeroinitializer
  ret <8 x double> %blend
}
; Shuffle (%vec1[6,7,4,5], %vec2[4,5,0,1]) blended with %vec3: lanes where
; %mask compares oeq to 0.0 take the shuffled value, the others keep %vec3.
define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
; CHECK-LABEL: test_8xdouble_masked_shuff_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm3, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1]
; CHECK-NEXT:    vmovapd %zmm2, %zmm0
; CHECK-NEXT:    retq
  %shuffled = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9>
  %lane_sel = fcmp oeq <8 x double> %mask, zeroinitializer
  %blend = select <8 x i1> %lane_sel, <8 x double> %shuffled, <8 x double> %vec3
  ret <8 x double> %blend
}
841
; Shuffle (%vec1[6,7,4,5], %vec2[4,5,0,1]) with zeroing: lanes where %mask
; compares oeq to 0.0 take the shuffled value, the others become zero.
define <8 x double> @test_8xdouble_zero_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1]
; CHECK-NEXT:    retq
  %shuffled = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9>
  %lane_sel = fcmp oeq <8 x double> %mask, zeroinitializer
  %blend = select <8 x i1> %lane_sel, <8 x double> %shuffled, <8 x double> zeroinitializer
  ret <8 x double> %blend
}
; Unmasked shuffle of two <8 x double> registers: %vec1 elements [4,5,4,5]
; followed by %vec2 elements [4,5,2,3].
define <8 x double> @test_8xdouble_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2) {
; CHECK-LABEL: test_8xdouble_shuff_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,2,3]
; CHECK-NEXT:    retq
  %result = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
  ret <8 x double> %result
}
; Shuffle (%vec1[4,5,4,5], %vec2[4,5,2,3]) blended with %vec3: lanes where
; %mask compares oeq to 0.0 take the shuffled value, the others keep %vec3.
define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
; CHECK-LABEL: test_8xdouble_masked_shuff_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqpd %zmm4, %zmm3, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3]
; CHECK-NEXT:    vmovapd %zmm2, %zmm0
; CHECK-NEXT:    retq
  %shuffled = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
  %lane_sel = fcmp oeq <8 x double> %mask, zeroinitializer
  %blend = select <8 x i1> %lane_sel, <8 x double> %shuffled, <8 x double> %vec3
  ret <8 x double> %blend
}
875
; Shuffle (%vec1[4,5,4,5], %vec2[4,5,2,3]) with zeroing: lanes where %mask
; compares oeq to 0.0 take the shuffled value, the others become zero.
define <8 x double> @test_8xdouble_zero_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3]
; CHECK-NEXT:    retq
  %shuffled = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
  %lane_sel = fcmp oeq <8 x double> %mask, zeroinitializer
  %blend = select <8 x i1> %lane_sel, <8 x double> %shuffled, <8 x double> zeroinitializer
  ret <8 x double> %blend
}
; Unmasked shuffle whose second operand is loaded from %vec2p: %vec1
; elements [6,7,0,1] followed by loaded elements [0,1,0,1].
define <8 x double> @test_8xdouble_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
; CHECK-LABEL: test_8xdouble_shuff_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],mem[0,1,0,1]
; CHECK-NEXT:    retq
  %rhs = load <8 x double>, <8 x double>* %vec2p
  %result = shufflevector <8 x double> %vec1, <8 x double> %rhs, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
  ret <8 x double> %result
}
; Shuffle (%vec1[6,7,0,1], loaded[0,1,0,1]) with the second operand loaded
; from %vec2p, blended with %vec3: lanes where %mask compares oeq to 0.0 take
; the shuffled value, the others keep %vec3.
define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1]
; CHECK-NEXT:    vmovapd %zmm1, %zmm0
; CHECK-NEXT:    retq
  %rhs = load <8 x double>, <8 x double>* %vec2p
  %shuffled = shufflevector <8 x double> %vec1, <8 x double> %rhs, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
  %lane_sel = fcmp oeq <8 x double> %mask, zeroinitializer
  %blend = select <8 x i1> %lane_sel, <8 x double> %shuffled, <8 x double> %vec3
  ret <8 x double> %blend
}
911
; Shuffle (%vec1[6,7,0,1], loaded[0,1,0,1]) with the second operand loaded
; from %vec2p and zeroing: lanes where %mask compares oeq to 0.0 take the
; shuffled value, the others become zero.
define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1]
; CHECK-NEXT:    retq
  %rhs = load <8 x double>, <8 x double>* %vec2p
  %shuffled = shufflevector <8 x double> %vec1, <8 x double> %rhs, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
  %lane_sel = fcmp oeq <8 x double> %mask, zeroinitializer
  %blend = select <8 x i1> %lane_sel, <8 x double> %shuffled, <8 x double> zeroinitializer
  ret <8 x double> %blend
}
925
; Shuffle (%vec1[6,7,6,7], loaded[0,1,2,3]) with the second operand loaded
; from %vec2p, blended with %vec3: lanes where %mask compares oeq to 0.0 take
; the shuffled value, the others keep %vec3.
define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3]
; CHECK-NEXT:    vmovapd %zmm1, %zmm0
; CHECK-NEXT:    retq
  %rhs = load <8 x double>, <8 x double>* %vec2p
  %shuffled = shufflevector <8 x double> %vec1, <8 x double> %rhs, <8 x i32> <i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %lane_sel = fcmp oeq <8 x double> %mask, zeroinitializer
  %blend = select <8 x i1> %lane_sel, <8 x double> %shuffled, <8 x double> %vec3
  ret <8 x double> %blend
}
940
; Shuffle (%vec1[6,7,6,7], loaded[0,1,2,3]) with the second operand loaded
; from %vec2p and zeroing: lanes where %mask compares oeq to 0.0 take the
; shuffled value, the others become zero.
define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3]
; CHECK-NEXT:    retq
  %rhs = load <8 x double>, <8 x double>* %vec2p
  %shuffled = shufflevector <8 x double> %vec1, <8 x double> %rhs, <8 x i32> <i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %lane_sel = fcmp oeq <8 x double> %mask, zeroinitializer
  %blend = select <8 x i1> %lane_sel, <8 x double> %shuffled, <8 x double> zeroinitializer
  ret <8 x double> %blend
}
954
; Shuffle (%vec1[0,1,2,3], loaded[0,1,4,5]) with the second operand loaded
; from %vec2p, blended with %vec3: lanes where %mask compares oeq to 0.0 take
; the shuffled value, the others keep %vec3.
define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5]
; CHECK-NEXT:    vmovapd %zmm1, %zmm0
; CHECK-NEXT:    retq
  %rhs = load <8 x double>, <8 x double>* %vec2p
  %shuffled = shufflevector <8 x double> %vec1, <8 x double> %rhs, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 12, i32 13>
  %lane_sel = fcmp oeq <8 x double> %mask, zeroinitializer
  %blend = select <8 x i1> %lane_sel, <8 x double> %shuffled, <8 x double> %vec3
  ret <8 x double> %blend
}
969
; Shuffle (%vec1[0,1,2,3], loaded[0,1,4,5]) with the second operand loaded
; from %vec2p and zeroing: lanes where %mask compares oeq to 0.0 take the
; shuffled value, the others become zero.
define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5]
; CHECK-NEXT:    retq
  %rhs = load <8 x double>, <8 x double>* %vec2p
  %shuffled = shufflevector <8 x double> %vec1, <8 x double> %rhs, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 12, i32 13>
  %lane_sel = fcmp oeq <8 x double> %mask, zeroinitializer
  %blend = select <8 x i1> %lane_sel, <8 x double> %shuffled, <8 x double> zeroinitializer
  ret <8 x double> %blend
}
983
; Unmasked shuffle whose second operand is loaded from %vec2p: %vec1
; elements [2,3,0,1] followed by loaded elements [4,5,0,1].
define <8 x double> @test_8xdouble_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
; CHECK-LABEL: test_8xdouble_shuff_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[4,5,0,1]
; CHECK-NEXT:    retq
  %rhs = load <8 x double>, <8 x double>* %vec2p
  %result = shufflevector <8 x double> %vec1, <8 x double> %rhs, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
  ret <8 x double> %result
}
; Shuffle (%vec1[2,3,0,1], loaded[4,5,0,1]) with the second operand loaded
; from %vec2p, blended with %vec3: lanes where %mask compares oeq to 0.0 take
; the shuffled value, the others keep %vec3.
define <8 x double> @test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1]
; CHECK-NEXT:    vmovapd %zmm1, %zmm0
; CHECK-NEXT:    retq
  %rhs = load <8 x double>, <8 x double>* %vec2p
  %shuffled = shufflevector <8 x double> %vec1, <8 x double> %rhs, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
  %lane_sel = fcmp oeq <8 x double> %mask, zeroinitializer
  %blend = select <8 x i1> %lane_sel, <8 x double> %shuffled, <8 x double> %vec3
  ret <8 x double> %blend
}
1007
; Shuffle (%vec1[2,3,0,1], loaded[4,5,0,1]) with the second operand loaded
; from %vec2p and zeroing: lanes where %mask compares oeq to 0.0 take the
; shuffled value, the others become zero.
define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1]
; CHECK-NEXT:    retq
  %rhs = load <8 x double>, <8 x double>* %vec2p
  %shuffled = shufflevector <8 x double> %vec1, <8 x double> %rhs, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
  %lane_sel = fcmp oeq <8 x double> %mask, zeroinitializer
  %blend = select <8 x i1> %lane_sel, <8 x double> %shuffled, <8 x double> zeroinitializer
  ret <8 x double> %blend
}
1021
; Unmasked shuffle of two <8 x i32> registers: elements 4-7 of %vec1
; followed by elements 4-7 of %vec2.
define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
; CHECK-LABEL: test_8xi32_shuff_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    retq
  %result = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i32> %result
}
; Shuffle (%vec1[4-7], %vec2[4-7]) blended with %vec3: lanes where %mask
; equals 0 take the shuffled value, the others keep %vec3.
define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm3, %ymm3, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
; CHECK-NEXT:    retq
  %shuffled = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %lane_sel = icmp eq <8 x i32> %mask, zeroinitializer
  %blend = select <8 x i1> %lane_sel, <8 x i32> %shuffled, <8 x i32> %vec3
  ret <8 x i32> %blend
}
1042
; Shuffle (%vec1[4-7], %vec2[4-7]) with zeroing: lanes where %mask equals 0
; take the shuffled value, the others become zero.
define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; CHECK-NEXT:    retq
  %shuffled = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %lane_sel = icmp eq <8 x i32> %mask, zeroinitializer
  %blend = select <8 x i1> %lane_sel, <8 x i32> %shuffled, <8 x i32> zeroinitializer
  ret <8 x i32> %blend
}
; Shuffle (%vec1[4-7], %vec2[0-3]) blended with %vec3: lanes where %mask
; equals 0 take the shuffled value, the others keep %vec3.
define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm3, %ymm3, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
; CHECK-NEXT:    retq
  %shuffled = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %lane_sel = icmp eq <8 x i32> %mask, zeroinitializer
  %blend = select <8 x i1> %lane_sel, <8 x i32> %shuffled, <8 x i32> %vec3
  ret <8 x i32> %blend
}
1066
; Shuffle (%vec1[4-7], %vec2[0-3]) with zeroing: lanes where %mask equals 0
; take the shuffled value, the others become zero.
define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT:    retq
  %shuffled = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %lane_sel = icmp eq <8 x i32> %mask, zeroinitializer
  %blend = select <8 x i1> %lane_sel, <8 x i32> %shuffled, <8 x i32> zeroinitializer
  ret <8 x i32> %blend
}
; Shuffle (%vec1[4-7], %vec2[4-7]) blended with %vec3: lanes where %mask
; equals 0 take the shuffled value, the others keep %vec3.
define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm3, %ymm3, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
; CHECK-NEXT:    retq
  %shuffled = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %lane_sel = icmp eq <8 x i32> %mask, zeroinitializer
  %blend = select <8 x i1> %lane_sel, <8 x i32> %shuffled, <8 x i32> %vec3
  ret <8 x i32> %blend
}
1090
; Shuffle (%vec1[4-7], %vec2[4-7]) with zeroing: lanes where %mask equals 0
; take the shuffled value, the others become zero.
define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; CHECK-NEXT:    retq
  %shuffled = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %lane_sel = icmp eq <8 x i32> %mask, zeroinitializer
  %blend = select <8 x i1> %lane_sel, <8 x i32> %shuffled, <8 x i32> zeroinitializer
  ret <8 x i32> %blend
}
; Unmasked shuffle of two <8 x i32> registers: elements 4-7 of %vec1
; followed by elements 0-3 of %vec2.
define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
; CHECK-LABEL: test_8xi32_shuff_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT:    retq
  %result = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  ret <8 x i32> %result
}
; Shuffle (%vec1[4-7], %vec2[0-3]) blended with %vec3: lanes where %mask
; equals 0 take the shuffled value, the others keep %vec3.
define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm3, %ymm3, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
; CHECK-NEXT:    retq
  %shuffled = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %lane_sel = icmp eq <8 x i32> %mask, zeroinitializer
  %blend = select <8 x i1> %lane_sel, <8 x i32> %shuffled, <8 x i32> %vec3
  ret <8 x i32> %blend
}
1122
; Shuffle (%vec1[4-7], %vec2[0-3]) with zeroing: lanes where %mask equals 0
; take the shuffled value, the others become zero.
define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT:    retq
  %shuffled = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %lane_sel = icmp eq <8 x i32> %mask, zeroinitializer
  %blend = select <8 x i1> %lane_sel, <8 x i32> %shuffled, <8 x i32> zeroinitializer
  ret <8 x i32> %blend
}
; Unmasked shuffle whose second operand is loaded from %vec2p: elements 4-7
; of %vec1 followed by elements 4-7 of the loaded vector.
define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) {
; CHECK-LABEL: test_8xi32_shuff_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT:    retq
  %rhs = load <8 x i32>, <8 x i32>* %vec2p
  %result = shufflevector <8 x i32> %vec1, <8 x i32> %rhs, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i32> %result
}
; Merge-masked form with %vec2 loaded from %vec2p: elements where %mask == 0
; take the shuffle (%vec1[4..7], %vec2[4..7]); all others come from %vec3.
define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
  ret <8 x i32> %res
}
1156
; Zero-masked form with %vec2 loaded from %vec2p: elements where %mask == 0
; take the shuffle (%vec1[4..7], %vec2[4..7]); all others are zero.
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
; CHECK-NEXT:    retq
  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
  ret <8 x i32> %res
}
1169
; Merge-masked form with %vec2 loaded from %vec2p: elements where %mask == 0
; take the shuffle (%vec1[4..7], %vec2[0..3]); all others come from %vec3.
define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
  ret <8 x i32> %res
}
1183
; Zero-masked form with %vec2 loaded from %vec2p: elements where %mask == 0
; take the shuffle (%vec1[4..7], %vec2[0..3]); all others are zero.
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT:    retq
  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
  ret <8 x i32> %res
}
1196
; Merge-masked form with %vec2 loaded from %vec2p: elements where %mask == 0
; take the shuffle (%vec1[4..7], %vec2[0..3]); all others come from %vec3.
; NOTE(review): uses the same shuffle mask as test_8xi32_masked_shuff_mem_mask1.
define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
  ret <8 x i32> %res
}
1210
; Zero-masked form with %vec2 loaded from %vec2p: elements where %mask == 0
; take the shuffle (%vec1[4..7], %vec2[0..3]); all others are zero.
; NOTE(review): uses the same shuffle mask as test_8xi32_zero_masked_shuff_mem_mask1.
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT:    retq
  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
  ret <8 x i32> %res
}
1223
; Unmasked shuffle with the second operand loaded from %vec2p:
; result = %vec1[4..7], %vec2[0..3].
define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) {
; CHECK-LABEL: test_8xi32_shuff_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT:    retq
  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
  %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  ret <8 x i32> %res
}
; Merge-masked form with %vec2 loaded from %vec2p: elements where %mask == 0
; take the shuffle (%vec1[4..7], %vec2[0..3]); all others come from %vec3.
; NOTE(review): uses the same shuffle mask as test_8xi32_masked_shuff_mem_mask1.
define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
  ret <8 x i32> %res
}
1246
; Zero-masked form with %vec2 loaded from %vec2p: elements where %mask == 0
; take the shuffle (%vec1[4..7], %vec2[0..3]); all others are zero.
; NOTE(review): uses the same shuffle mask as test_8xi32_zero_masked_shuff_mem_mask1.
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT:    retq
  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
  ret <8 x i32> %res
}
1259
; Unmasked shuffle of 128-bit chunks:
; result = %vec1[4..7], %vec1[4..7], %vec2[4..7], %vec2[12..15].
define <16 x i32> @test_16xi32_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2) {
; CHECK-LABEL: test_16xi32_shuff_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,6,7]
; CHECK-NEXT:    retq
  %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i32> %res
}
; Merge-masked form: elements where %mask == 0 take the shuffle
; (%vec1[4..7], %vec1[4..7], %vec2[4..7], %vec2[12..15]); all others come from %vec3.
define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_masked_shuff_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %zmm3, %zmm3, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15]
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
  ret <16 x i32> %res
}
1280
; Zero-masked form: elements where %mask == 0 take the shuffle
; (%vec1[4..7], %vec1[4..7], %vec2[4..7], %vec2[12..15]); all others are zero.
define <16 x i32> @test_16xi32_zero_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15]
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
  ret <16 x i32> %res
}
; Merge-masked form: elements where %mask == 0 take the shuffle
; (%vec1[8..11], %vec1[8..11], %vec2[8..11], %vec2[4..7]); all others come from %vec3.
define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_masked_shuff_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %zmm3, %zmm3, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7]
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
  ret <16 x i32> %res
}
1304
; Zero-masked form: elements where %mask == 0 take the shuffle
; (%vec1[8..11], %vec1[8..11], %vec2[8..11], %vec2[4..7]); all others are zero.
define <16 x i32> @test_16xi32_zero_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7]
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
  ret <16 x i32> %res
}
; Merge-masked form: elements where %mask == 0 take the shuffle
; (%vec1[4..7], %vec1[8..11], %vec2[0..3], %vec2[0..3]); all others come from %vec3.
define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_masked_shuff_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %zmm3, %zmm3, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3]
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
  ret <16 x i32> %res
}
1328
; Zero-masked form: elements where %mask == 0 take the shuffle
; (%vec1[4..7], %vec1[8..11], %vec2[0..3], %vec2[0..3]); all others are zero.
define <16 x i32> @test_16xi32_zero_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3]
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
  ret <16 x i32> %res
}
; Unmasked shuffle of 128-bit chunks:
; result = %vec1[4..7], %vec1[0..3], %vec2[8..11], %vec2[4..7].
define <16 x i32> @test_16xi32_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2) {
; CHECK-LABEL: test_16xi32_shuff_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],zmm1[4,5,2,3]
; CHECK-NEXT:    retq
  %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
  ret <16 x i32> %res
}
; Merge-masked form: elements where %mask == 0 take the shuffle
; (%vec1[4..7], %vec1[0..3], %vec2[8..11], %vec2[4..7]); all others come from %vec3.
define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_masked_shuff_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %zmm3, %zmm3, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7]
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
  ret <16 x i32> %res
}
1360
; Zero-masked form: elements where %mask == 0 take the shuffle
; (%vec1[4..7], %vec1[0..3], %vec2[8..11], %vec2[4..7]); all others are zero.
define <16 x i32> @test_16xi32_zero_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7]
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
  ret <16 x i32> %res
}
; Unmasked shuffle with the second operand loaded from %vec2p:
; result = %vec1[8..11], %vec1[4..7], %vec2[8..11], %vec2[0..3].
define <16 x i32> @test_16xi32_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p) {
; CHECK-LABEL: test_16xi32_shuff_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],mem[4,5,0,1]
; CHECK-NEXT:    retq
  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
  %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
  ret <16 x i32> %res
}
; Merge-masked form with %vec2 loaded from %vec2p: elements where %mask == 0
; take the shuffle (%vec1[8..11], %vec1[4..7], %vec2[8..11], %vec2[0..3]);
; all others come from %vec3.
define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3]
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
  ret <16 x i32> %res
}
1394
; Zero-masked form with %vec2 loaded from %vec2p: elements where %mask == 0
; take the shuffle (%vec1[8..11], %vec1[4..7], %vec2[8..11], %vec2[0..3]);
; all others are zero.
define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3]
; CHECK-NEXT:    retq
  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
  ret <16 x i32> %res
}
1407
; Merge-masked form with %vec2 loaded from %vec2p: elements where %mask == 0
; take the shuffle (%vec1[4..7], %vec1[4..7], %vec2[0..3], %vec2[8..11]);
; all others come from %vec3.
define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11]
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
  ret <16 x i32> %res
}
1421
; Zero-masked form with %vec2 loaded from %vec2p: elements where %mask == 0
; take the shuffle (%vec1[4..7], %vec1[4..7], %vec2[0..3], %vec2[8..11]);
; all others are zero.
define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11]
; CHECK-NEXT:    retq
  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
  ret <16 x i32> %res
}
1434
; Merge-masked form with %vec2 loaded from %vec2p: elements where %mask == 0
; take the shuffle (%vec1[4..7], %vec1[8..11], %vec2[12..15], %vec2[12..15]);
; all others come from %vec3.
define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15]
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
  ret <16 x i32> %res
}
1448
; Zero-masked form with %vec2 loaded from %vec2p: elements where %mask == 0
; take the shuffle (%vec1[4..7], %vec1[8..11], %vec2[12..15], %vec2[12..15]);
; all others are zero.
define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15]
; CHECK-NEXT:    retq
  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
  ret <16 x i32> %res
}
1461
; Unmasked shuffle with the second operand loaded from %vec2p:
; result = %vec1[4..7], %vec1[4..7], %vec2[4..7], %vec2[12..15].
define <16 x i32> @test_16xi32_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p) {
; CHECK-LABEL: test_16xi32_shuff_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[2,3,6,7]
; CHECK-NEXT:    retq
  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
  %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i32> %res
}
; Merge-masked form with %vec2 loaded from %vec2p: elements where %mask == 0
; take the shuffle (%vec1[4..7], %vec1[4..7], %vec2[4..7], %vec2[12..15]);
; all others come from %vec3.
define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %zmm2, %zmm2, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15]
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    retq
  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
  ret <16 x i32> %res
}
1484
; Zero-masked form with %vec2 loaded from %vec2p: elements where %mask == 0
; take the shuffle (%vec1[4..7], %vec1[4..7], %vec2[4..7], %vec2[12..15]);
; all others are zero.
define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15]
; CHECK-NEXT:    retq
  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
  ret <16 x i32> %res
}
1497
; Unmasked shuffle of 128-bit chunks: result = %vec1[2..3], %vec2[0..1].
define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
; CHECK-LABEL: test_4xi64_shuff_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT:    retq
  %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x i64> %res
}
; Merge-masked form: elements where %mask == 0 take the shuffle
; (%vec1[2..3], %vec2[0..1]); all other elements come from %vec3.
define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm3, %ymm3, %k1
; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
  ret <4 x i64> %res
}
1518
; Zero-masked form: elements where %mask == 0 take the shuffle
; (%vec1[2..3], %vec2[0..1]); all other elements are zero.
define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
  ret <4 x i64> %res
}
; Merge-masked form: elements where %mask == 0 take the shuffle
; (%vec1[2..3], %vec2[2..3]); all other elements come from %vec3.
define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm3, %ymm3, %k1
; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
  ret <4 x i64> %res
}
1542
; Zero-masked form: elements where %mask == 0 take the shuffle
; (%vec1[2..3], %vec2[2..3]); all other elements are zero.
define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
  ret <4 x i64> %res
}
; Merge-masked form: elements where %mask == 0 take the shuffle
; (%vec1[2..3], %vec2[0..1]); all other elements come from %vec3.
; NOTE(review): uses the same shuffle mask as test_4xi64_masked_shuff_mask0.
define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm3, %ymm3, %k1
; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
  ret <4 x i64> %res
}
1566
; Zero-masked form: elements where %mask == 0 take the shuffle
; (%vec1[2..3], %vec2[0..1]); all other elements are zero.
; NOTE(review): uses the same shuffle mask as test_4xi64_zero_masked_shuff_mask0.
define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
  ret <4 x i64> %res
}
; Unmasked shuffle of 128-bit chunks: result = %vec1[2..3], %vec2[2..3].
define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
; CHECK-LABEL: test_4xi64_shuff_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    retq
  %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i64> %res
}
; Merge-masked form: elements where %mask == 0 take the shuffle
; (%vec1[2..3], %vec2[2..3]); all other elements come from %vec3.
; NOTE(review): uses the same shuffle mask as test_4xi64_masked_shuff_mask1.
define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm3, %ymm3, %k1
; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
  ret <4 x i64> %res
}
1598
; Zero-masked form: elements where %mask == 0 take the shuffle
; (%vec1[2..3], %vec2[2..3]); all other elements are zero.
; NOTE(review): uses the same shuffle mask as test_4xi64_zero_masked_shuff_mask1.
define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
  ret <4 x i64> %res
}
; Unmasked shuffle with the second operand loaded from %vec2p:
; result = %vec1[2..3], %vec2[2..3].
define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) {
; CHECK-LABEL: test_4xi64_shuff_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT:    retq
  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
  %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i64> %res
}
; Merge-masked form with %vec2 loaded from %vec2p: elements where %mask == 0
; take the shuffle (%vec1[2..3], %vec2[2..3]); all others come from %vec3.
define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
  ret <4 x i64> %res
}
1632
; Zero-masked form with %vec2 loaded from %vec2p: elements where %mask == 0
; take the shuffle (%vec1[2..3], %vec2[2..3]); all others are zero.
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; CHECK-NEXT:    retq
  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
  ret <4 x i64> %res
}
1645
; Merge-masked form with %vec2 loaded from %vec2p: elements where %mask == 0
; take the shuffle (%vec1[2..3], %vec2[0..1]); all others come from %vec3.
define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
  ret <4 x i64> %res
}
1659
; Zero-masked form with %vec2 loaded from %vec2p: elements where %mask == 0
; take the shuffle (%vec1[2..3], %vec2[0..1]); all others are zero.
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; CHECK-NEXT:    retq
  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
  ret <4 x i64> %res
}
1672
; Merge-masked form with %vec2 loaded from %vec2p: elements where %mask == 0
; take the shuffle (%vec1[2..3], %vec2[0..1]); all others come from %vec3.
; NOTE(review): uses the same shuffle mask as test_4xi64_masked_shuff_mem_mask1.
define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
  ret <4 x i64> %res
}
1686
; Zero-masked form with %vec2 loaded from %vec2p: elements where %mask == 0
; take the shuffle (%vec1[2..3], %vec2[0..1]); all others are zero.
; NOTE(review): uses the same shuffle mask as test_4xi64_zero_masked_shuff_mem_mask1.
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; CHECK-NEXT:    retq
  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
  ret <4 x i64> %res
}
1699
; Unmasked shuffle with the second operand loaded from %vec2p:
; result = %vec1[2..3], %vec2[2..3].
; NOTE(review): uses the same shuffle mask as test_4xi64_shuff_mem_mask0.
define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) {
; CHECK-LABEL: test_4xi64_shuff_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT:    retq
  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
  %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i64> %res
}
; Merge-masked form of the [2,3],mem[2,3] shuffle: the shuffle yields %vec1[2,3]
; followed by elements [2,3] of the vector loaded from %vec2p.
; Lanes where %mask == 0 take the shuffle result; all other lanes take %vec3.
; Expected assembly: vptestnmq builds %k1, one vshufi64x2 under {%k1} with the
; load folded as the memory operand, then a vmovdqa into ymm0.
1709define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
1710; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask3:
1711; CHECK:       # %bb.0:
1712; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
1713; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
1714; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
1715; CHECK-NEXT:    retq
1716  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
1717  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1718  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1719  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
1720  ret <4 x i64> %res
1721}
1722
; Zero-masked form of the [2,3],mem[2,3] shuffle: the shuffle yields %vec1[2,3]
; followed by elements [2,3] of the vector loaded from %vec2p.
; Lanes where %mask == 0 take the shuffle result; all other lanes are zero.
; Expected assembly: vptestnmq builds %k1, then a single vshufi64x2 with
; {%k1} {z} and the load folded as the memory operand.
1723define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
1724; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
1725; CHECK:       # %bb.0:
1726; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
1727; CHECK-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
1728; CHECK-NEXT:    retq
1729  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
1730  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1731  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1732  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
1733  ret <4 x i64> %res
1734}
1735
; Unmasked 512-bit shuffle of two register operands: the result is
; %vec1[4,5,4,5] followed by %vec2[4,5,4,5] (shuffle indices 12,13,12,13).
; Expected assembly: a single vshufi64x2 on zmm registers.
1736define <8 x i64> @test_8xi64_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2) {
1737; CHECK-LABEL: test_8xi64_shuff_mask0:
1738; CHECK:       # %bb.0:
1739; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,4,5]
1740; CHECK-NEXT:    retq
1741  %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
1742  ret <8 x i64> %res
1743}
; Merge-masked 512-bit shuffle of two register operands: the shuffle yields
; %vec1[4,5,4,5] followed by %vec2[4,5,4,5].
; Lanes where %mask == 0 take the shuffle result; all other lanes take %vec3.
; Expected assembly: vptestnmq builds %k1, one vshufi64x2 under {%k1}, then a
; vmovdqa64 into zmm0.
1744define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
1745; CHECK-LABEL: test_8xi64_masked_shuff_mask0:
1746; CHECK:       # %bb.0:
1747; CHECK-NEXT:    vptestnmq %zmm3, %zmm3, %k1
1748; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5]
1749; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
1750; CHECK-NEXT:    retq
1751  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
1752  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1753  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
1754  ret <8 x i64> %res
1755}
1756
; Zero-masked 512-bit shuffle of two register operands: the shuffle yields
; %vec1[4,5,4,5] followed by %vec2[4,5,4,5].
; Lanes where %mask == 0 take the shuffle result; all other lanes are zero.
; Expected assembly: vptestnmq builds %k1, then a single vshufi64x2 with
; {%k1} {z}.
1757define <8 x i64> @test_8xi64_zero_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
1758; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask0:
1759; CHECK:       # %bb.0:
1760; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
1761; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5]
1762; CHECK-NEXT:    retq
1763  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
1764  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1765  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
1766  ret <8 x i64> %res
1767}
; Merge-masked 512-bit shuffle of two register operands: the shuffle yields
; %vec1[6,7,4,5] followed by %vec2[2,3,4,5] (shuffle indices 10,11,12,13).
; Lanes where %mask == 0 take the shuffle result; all other lanes take %vec3.
; Expected assembly: vptestnmq builds %k1, one vshufi64x2 under {%k1}, then a
; vmovdqa64 into zmm0.
1768define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
1769; CHECK-LABEL: test_8xi64_masked_shuff_mask1:
1770; CHECK:       # %bb.0:
1771; CHECK-NEXT:    vptestnmq %zmm3, %zmm3, %k1
1772; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5]
1773; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
1774; CHECK-NEXT:    retq
1775  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
1776  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1777  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
1778  ret <8 x i64> %res
1779}
1780
; Zero-masked 512-bit shuffle of two register operands: the shuffle yields
; %vec1[6,7,4,5] followed by %vec2[2,3,4,5] (shuffle indices 10,11,12,13).
; Lanes where %mask == 0 take the shuffle result; all other lanes are zero.
; Expected assembly: vptestnmq builds %k1, then a single vshufi64x2 with
; {%k1} {z}.
1781define <8 x i64> @test_8xi64_zero_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
1782; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask1:
1783; CHECK:       # %bb.0:
1784; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
1785; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5]
1786; CHECK-NEXT:    retq
1787  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
1788  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1789  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
1790  ret <8 x i64> %res
1791}
; Merge-masked 512-bit shuffle of two register operands: the shuffle yields
; %vec1[0,1,4,5] followed by %vec2[0,1,0,1] (shuffle indices 8,9,8,9).
; Lanes where %mask == 0 take the shuffle result; all other lanes take %vec3.
; Expected assembly: vptestnmq builds %k1, one vshufi64x2 under {%k1}, then a
; vmovdqa64 into zmm0.
1792define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
1793; CHECK-LABEL: test_8xi64_masked_shuff_mask2:
1794; CHECK:       # %bb.0:
1795; CHECK-NEXT:    vptestnmq %zmm3, %zmm3, %k1
1796; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1]
1797; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
1798; CHECK-NEXT:    retq
1799  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
1800  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1801  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
1802  ret <8 x i64> %res
1803}
1804
; Zero-masked 512-bit shuffle of two register operands: the shuffle yields
; %vec1[0,1,4,5] followed by %vec2[0,1,0,1] (shuffle indices 8,9,8,9).
; Lanes where %mask == 0 take the shuffle result; all other lanes are zero.
; Expected assembly: vptestnmq builds %k1, then a single vshufi64x2 with
; {%k1} {z}.
1805define <8 x i64> @test_8xi64_zero_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
1806; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask2:
1807; CHECK:       # %bb.0:
1808; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
1809; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1]
1810; CHECK-NEXT:    retq
1811  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
1812  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1813  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
1814  ret <8 x i64> %res
1815}
; Unmasked 512-bit shuffle of two register operands: the result is
; %vec1[2,3,6,7] followed by %vec2[4,5,2,3] (shuffle indices 12,13,10,11).
; Expected assembly: a single vshufi64x2 on zmm registers.
1816define <8 x i64> @test_8xi64_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2) {
1817; CHECK-LABEL: test_8xi64_shuff_mask3:
1818; CHECK:       # %bb.0:
1819; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,6,7],zmm1[4,5,2,3]
1820; CHECK-NEXT:    retq
1821  %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
1822  ret <8 x i64> %res
1823}
; Merge-masked 512-bit shuffle of two register operands: the shuffle yields
; %vec1[2,3,6,7] followed by %vec2[4,5,2,3] (shuffle indices 12,13,10,11).
; Lanes where %mask == 0 take the shuffle result; all other lanes take %vec3.
; Expected assembly: vptestnmq builds %k1, one vshufi64x2 under {%k1}, then a
; vmovdqa64 into zmm0.
1824define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
1825; CHECK-LABEL: test_8xi64_masked_shuff_mask3:
1826; CHECK:       # %bb.0:
1827; CHECK-NEXT:    vptestnmq %zmm3, %zmm3, %k1
1828; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3]
1829; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
1830; CHECK-NEXT:    retq
1831  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
1832  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1833  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
1834  ret <8 x i64> %res
1835}
1836
; Zero-masked 512-bit shuffle of two register operands: the shuffle yields
; %vec1[2,3,6,7] followed by %vec2[4,5,2,3] (shuffle indices 12,13,10,11).
; Lanes where %mask == 0 take the shuffle result; all other lanes are zero.
; Expected assembly: vptestnmq builds %k1, then a single vshufi64x2 with
; {%k1} {z}.
1837define <8 x i64> @test_8xi64_zero_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
1838; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask3:
1839; CHECK:       # %bb.0:
1840; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
1841; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3]
1842; CHECK-NEXT:    retq
1843  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
1844  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1845  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
1846  ret <8 x i64> %res
1847}
; Unmasked 512-bit shuffle with a memory source: the result is %vec1[2,3,2,3]
; followed by elements [4,5,2,3] of the vector loaded from %vec2p (shuffle
; indices 12,13,10,11). Expected assembly: a single vshufi64x2 with the load
; folded as the memory operand.
1848define <8 x i64> @test_8xi64_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p) {
1849; CHECK-LABEL: test_8xi64_shuff_mem_mask0:
1850; CHECK:       # %bb.0:
1851; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[4,5,2,3]
1852; CHECK-NEXT:    retq
1853  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
1854  %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
1855  ret <8 x i64> %res
1856}
; Merge-masked 512-bit shuffle with a memory source: the shuffle yields
; %vec1[2,3,2,3] followed by elements [4,5,2,3] of the vector loaded from
; %vec2p. Lanes where %mask == 0 take the shuffle result; all other lanes take
; %vec3. Expected assembly: vptestnmq builds %k1, one vshufi64x2 under {%k1}
; with the load folded as the memory operand, then a vmovdqa64 into zmm0.
1857define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
1858; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask0:
1859; CHECK:       # %bb.0:
1860; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
1861; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3]
1862; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
1863; CHECK-NEXT:    retq
1864  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
1865  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
1866  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1867  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
1868  ret <8 x i64> %res
1869}
1870
; Zero-masked 512-bit shuffle with a memory source: the shuffle yields
; %vec1[2,3,2,3] followed by elements [4,5,2,3] of the vector loaded from
; %vec2p. Lanes where %mask == 0 take the shuffle result; all other lanes are
; zero. Expected assembly: vptestnmq builds %k1, then a single vshufi64x2 with
; {%k1} {z} and the load folded as the memory operand.
1871define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
1872; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
1873; CHECK:       # %bb.0:
1874; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
1875; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3]
1876; CHECK-NEXT:    retq
1877  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
1878  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
1879  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1880  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
1881  ret <8 x i64> %res
1882}
1883
; Merge-masked 512-bit shuffle with a memory source: the shuffle yields
; %vec1[2,3,0,1] followed by elements [0,1,0,1] of the vector loaded from
; %vec2p (shuffle indices 8,9,8,9). Lanes where %mask == 0 take the shuffle
; result; all other lanes take %vec3. Expected assembly: vptestnmq builds %k1,
; one vshufi64x2 under {%k1} with the load folded, then a vmovdqa64 into zmm0.
1884define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
1885; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask1:
1886; CHECK:       # %bb.0:
1887; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
1888; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1]
1889; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
1890; CHECK-NEXT:    retq
1891  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
1892  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
1893  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1894  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
1895  ret <8 x i64> %res
1896}
1897
; Zero-masked 512-bit shuffle with a memory source: the shuffle yields
; %vec1[2,3,0,1] followed by elements [0,1,0,1] of the vector loaded from
; %vec2p (shuffle indices 8,9,8,9). Lanes where %mask == 0 take the shuffle
; result; all other lanes are zero. Expected assembly: vptestnmq builds %k1,
; then a single vshufi64x2 with {%k1} {z} and the load folded.
1898define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
1899; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
1900; CHECK:       # %bb.0:
1901; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
1902; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1]
1903; CHECK-NEXT:    retq
1904  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
1905  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
1906  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1907  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
1908  ret <8 x i64> %res
1909}
1910
; Merge-masked 512-bit shuffle with a memory source: the shuffle yields
; %vec1[4,5,0,1] followed by elements [2,3,2,3] of the vector loaded from
; %vec2p (shuffle indices 10,11,10,11). Lanes where %mask == 0 take the shuffle
; result; all other lanes take %vec3. Expected assembly: vptestnmq builds %k1,
; one vshufi64x2 under {%k1} with the load folded, then a vmovdqa64 into zmm0.
1911define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
1912; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask2:
1913; CHECK:       # %bb.0:
1914; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
1915; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3]
1916; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
1917; CHECK-NEXT:    retq
1918  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
1919  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 10, i32 11, i32 10, i32 11>
1920  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1921  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
1922  ret <8 x i64> %res
1923}
1924
; Zero-masked 512-bit shuffle with a memory source: the shuffle yields
; %vec1[4,5,0,1] followed by elements [2,3,2,3] of the vector loaded from
; %vec2p (shuffle indices 10,11,10,11). Lanes where %mask == 0 take the shuffle
; result; all other lanes are zero. Expected assembly: vptestnmq builds %k1,
; then a single vshufi64x2 with {%k1} {z} and the load folded.
1925define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
1926; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
1927; CHECK:       # %bb.0:
1928; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
1929; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3]
1930; CHECK-NEXT:    retq
1931  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
1932  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 10, i32 11, i32 10, i32 11>
1933  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1934  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
1935  ret <8 x i64> %res
1936}
1937
; Unmasked 512-bit shuffle with a memory source: the result is %vec1[2,3,0,1]
; followed by elements [6,7,2,3] of the vector loaded from %vec2p (shuffle
; indices 14,15,10,11). Expected assembly: a single vshufi64x2 with the load
; folded as the memory operand.
1938define <8 x i64> @test_8xi64_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p) {
1939; CHECK-LABEL: test_8xi64_shuff_mem_mask3:
1940; CHECK:       # %bb.0:
1941; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,2,3]
1942; CHECK-NEXT:    retq
1943  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
1944  %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
1945  ret <8 x i64> %res
1946}
; Merge-masked 512-bit shuffle with a memory source: the shuffle yields
; %vec1[2,3,0,1] followed by elements [6,7,2,3] of the vector loaded from
; %vec2p (shuffle indices 14,15,10,11). Lanes where %mask == 0 take the shuffle
; result; all other lanes take %vec3. Expected assembly: vptestnmq builds %k1,
; one vshufi64x2 under {%k1} with the load folded, then a vmovdqa64 into zmm0.
1947define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
1948; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask3:
1949; CHECK:       # %bb.0:
1950; CHECK-NEXT:    vptestnmq %zmm2, %zmm2, %k1
1951; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3]
1952; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
1953; CHECK-NEXT:    retq
1954  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
1955  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
1956  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1957  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
1958  ret <8 x i64> %res
1959}
1960
; Zero-masked 512-bit shuffle with a memory source: the shuffle yields
; %vec1[2,3,0,1] followed by elements [6,7,2,3] of the vector loaded from
; %vec2p (shuffle indices 14,15,10,11). Lanes where %mask == 0 take the shuffle
; result; all other lanes are zero. Expected assembly: vptestnmq builds %k1,
; then a single vshufi64x2 with {%k1} {z} and the load folded.
1961define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
1962; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
1963; CHECK:       # %bb.0:
1964; CHECK-NEXT:    vptestnmq %zmm1, %zmm1, %k1
1965; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3]
1966; CHECK-NEXT:    retq
1967  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
1968  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
1969  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1970  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
1971  ret <8 x i64> %res
1972}
1973
1974