1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl %s -o - | FileCheck %s
3
; Unmasked register form: shuffle %vec with mask <1,1,3,3>, so each
; odd-indexed float is duplicated into the even lane below it.
; Expected codegen is a single vmovshdup on xmm0.
4define <4 x float> @test_4xfloat_dup_high(<4 x float> %vec) {
5; CHECK-LABEL: test_4xfloat_dup_high:
6; CHECK:       # %bb.0:
7; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
8; CHECK-NEXT:    retq
9  %res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
10  ret <4 x float> %res
11}
; Merge-masked register form: lanes where %mask compares ordered-equal to
; zero take the <1,1,3,3> shuffle of %vec; the remaining lanes keep %vec2.
; Expected codegen builds %k1 with vxorps + vcmpeqps, applies vmovshdup
; under {%k1} into xmm1, then copies xmm1 to xmm0.
12define <4 x float> @test_masked_4xfloat_dup_high_mask0(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
13; CHECK-LABEL: test_masked_4xfloat_dup_high_mask0:
14; CHECK:       # %bb.0:
15; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
16; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
17; CHECK-NEXT:    vmovshdup {{.*#+}} xmm1 {%k1} = xmm0[1,1,3,3]
18; CHECK-NEXT:    vmovaps %xmm1, %xmm0
19; CHECK-NEXT:    retq
20  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
21  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
22  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
23  ret <4 x float> %res
24}
25
; Zero-masked register form: lanes where %mask compares ordered-equal to
; zero take the <1,1,3,3> shuffle of %vec; the remaining lanes are zero.
; Expected codegen builds %k1 with vxorps + vcmpeqps and emits vmovshdup
; with {%k1} {z} on xmm0.
26define <4 x float> @test_masked_z_4xfloat_dup_high_mask0(<4 x float> %vec, <4 x float> %mask) {
27; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mask0:
28; CHECK:       # %bb.0:
29; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
30; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
31; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
32; CHECK-NEXT:    retq
33  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
34  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
35  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
36  ret <4 x float> %res
37}
; Merge-masked register form: per-lane select on (%mask oeq 0.0) between the
; <1,1,3,3> shuffle of %vec and the passthrough %vec2.
; The IR body is identical to the _mask0 variant; only the symbol name differs.
; Expected codegen: vxorps + vcmpeqps into %k1, vmovshdup under {%k1} into
; xmm1, then a copy of xmm1 to xmm0.
38define <4 x float> @test_masked_4xfloat_dup_high_mask1(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
39; CHECK-LABEL: test_masked_4xfloat_dup_high_mask1:
40; CHECK:       # %bb.0:
41; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
42; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
43; CHECK-NEXT:    vmovshdup {{.*#+}} xmm1 {%k1} = xmm0[1,1,3,3]
44; CHECK-NEXT:    vmovaps %xmm1, %xmm0
45; CHECK-NEXT:    retq
46  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
47  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
48  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
49  ret <4 x float> %res
50}
51
; Zero-masked register form: per-lane select on (%mask oeq 0.0) between the
; <1,1,3,3> shuffle of %vec and zero.
; The IR body is identical to the _mask0 variant; only the symbol name differs.
; Expected codegen: vxorps + vcmpeqps into %k1, then vmovshdup with
; {%k1} {z} on xmm0.
52define <4 x float> @test_masked_z_4xfloat_dup_high_mask1(<4 x float> %vec, <4 x float> %mask) {
53; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mask1:
54; CHECK:       # %bb.0:
55; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
56; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
57; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
58; CHECK-NEXT:    retq
59  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
60  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
61  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
62  ret <4 x float> %res
63}
; Merge-masked register form: per-lane select on (%mask oeq 0.0) between the
; <1,1,3,3> shuffle of %vec and the passthrough %vec2.
; The IR body is identical to the _mask0 variant; only the symbol name differs.
; Expected codegen: vxorps + vcmpeqps into %k1, vmovshdup under {%k1} into
; xmm1, then a copy of xmm1 to xmm0.
64define <4 x float> @test_masked_4xfloat_dup_high_mask2(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
65; CHECK-LABEL: test_masked_4xfloat_dup_high_mask2:
66; CHECK:       # %bb.0:
67; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
68; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
69; CHECK-NEXT:    vmovshdup {{.*#+}} xmm1 {%k1} = xmm0[1,1,3,3]
70; CHECK-NEXT:    vmovaps %xmm1, %xmm0
71; CHECK-NEXT:    retq
72  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
73  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
74  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
75  ret <4 x float> %res
76}
77
; Zero-masked register form: per-lane select on (%mask oeq 0.0) between the
; <1,1,3,3> shuffle of %vec and zero.
; The IR body is identical to the _mask0 variant; only the symbol name differs.
; Expected codegen: vxorps + vcmpeqps into %k1, then vmovshdup with
; {%k1} {z} on xmm0.
78define <4 x float> @test_masked_z_4xfloat_dup_high_mask2(<4 x float> %vec, <4 x float> %mask) {
79; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mask2:
80; CHECK:       # %bb.0:
81; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
82; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
83; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
84; CHECK-NEXT:    retq
85  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
86  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
87  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
88  ret <4 x float> %res
89}
; Merge-masked register form: per-lane select on (%mask oeq 0.0) between the
; <1,1,3,3> shuffle of %vec and the passthrough %vec2.
; The IR body is identical to the _mask0 variant; only the symbol name differs.
; Expected codegen: vxorps + vcmpeqps into %k1, vmovshdup under {%k1} into
; xmm1, then a copy of xmm1 to xmm0.
90define <4 x float> @test_masked_4xfloat_dup_high_mask3(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
91; CHECK-LABEL: test_masked_4xfloat_dup_high_mask3:
92; CHECK:       # %bb.0:
93; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
94; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
95; CHECK-NEXT:    vmovshdup {{.*#+}} xmm1 {%k1} = xmm0[1,1,3,3]
96; CHECK-NEXT:    vmovaps %xmm1, %xmm0
97; CHECK-NEXT:    retq
98  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
99  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
100  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
101  ret <4 x float> %res
102}
103
; Zero-masked register form: per-lane select on (%mask oeq 0.0) between the
; <1,1,3,3> shuffle of %vec and zero.
; The IR body is identical to the _mask0 variant; only the symbol name differs.
; Expected codegen: vxorps + vcmpeqps into %k1, then vmovshdup with
; {%k1} {z} on xmm0.
104define <4 x float> @test_masked_z_4xfloat_dup_high_mask3(<4 x float> %vec, <4 x float> %mask) {
105; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mask3:
106; CHECK:       # %bb.0:
107; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
108; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
109; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
110; CHECK-NEXT:    retq
111  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
112  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
113  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
114  ret <4 x float> %res
115}
; Merge-masked register form: per-lane select on (%mask oeq 0.0) between the
; <1,1,3,3> shuffle of %vec and the passthrough %vec2.
; The IR body is identical to the _mask0 variant; only the symbol name differs.
; Expected codegen: vxorps + vcmpeqps into %k1, vmovshdup under {%k1} into
; xmm1, then a copy of xmm1 to xmm0.
116define <4 x float> @test_masked_4xfloat_dup_high_mask4(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
117; CHECK-LABEL: test_masked_4xfloat_dup_high_mask4:
118; CHECK:       # %bb.0:
119; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
120; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
121; CHECK-NEXT:    vmovshdup {{.*#+}} xmm1 {%k1} = xmm0[1,1,3,3]
122; CHECK-NEXT:    vmovaps %xmm1, %xmm0
123; CHECK-NEXT:    retq
124  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
125  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
126  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
127  ret <4 x float> %res
128}
129
; Zero-masked register form: per-lane select on (%mask oeq 0.0) between the
; <1,1,3,3> shuffle of %vec and zero.
; The IR body is identical to the _mask0 variant; only the symbol name differs.
; Expected codegen: vxorps + vcmpeqps into %k1, then vmovshdup with
; {%k1} {z} on xmm0.
130define <4 x float> @test_masked_z_4xfloat_dup_high_mask4(<4 x float> %vec, <4 x float> %mask) {
131; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mask4:
132; CHECK:       # %bb.0:
133; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
134; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
135; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
136; CHECK-NEXT:    retq
137  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
138  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
139  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
140  ret <4 x float> %res
141}
; Unmasked memory form: load <4 x float> from %vp and shuffle it with mask
; <1,1,3,3>. Expected codegen is a single vmovshdup with a memory source
; operand and no separate load instruction.
142define <4 x float> @test_4xfloat_dup_high_mem(<4 x float>* %vp) {
143; CHECK-LABEL: test_4xfloat_dup_high_mem:
144; CHECK:       # %bb.0:
145; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 = mem[1,1,3,3]
146; CHECK-NEXT:    retq
147  %vec = load <4 x float>, <4 x float>* %vp
148  %res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
149  ret <4 x float> %res
150}
; Merge-masked memory form: load <4 x float> from %vp, shuffle it with
; <1,1,3,3>, and select per lane on (%mask oeq 0.0) between the shuffle and
; the passthrough %vec2.
; Expected codegen builds %k1 with vxorps + vcmpeqps and emits vmovshdup
; with a memory source under {%k1} directly into xmm0 (no trailing copy).
151define <4 x float> @test_masked_4xfloat_dup_high_mem_mask0(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
152; CHECK-LABEL: test_masked_4xfloat_dup_high_mem_mask0:
153; CHECK:       # %bb.0:
154; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
155; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
156; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = mem[1,1,3,3]
157; CHECK-NEXT:    retq
158  %vec = load <4 x float>, <4 x float>* %vp
159  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
160  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
161  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
162  ret <4 x float> %res
163}
164
; Zero-masked memory form: load <4 x float> from %vp, shuffle it with
; <1,1,3,3>, and select per lane on (%mask oeq 0.0) between the shuffle and
; zero.
; Expected codegen builds %k1 with vxorps + vcmpeqps and emits vmovshdup
; with a memory source and {%k1} {z} on xmm0.
165define <4 x float> @test_masked_z_4xfloat_dup_high_mem_mask0(<4 x float>* %vp, <4 x float> %mask) {
166; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mem_mask0:
167; CHECK:       # %bb.0:
168; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
169; CHECK-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
170; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = mem[1,1,3,3]
171; CHECK-NEXT:    retq
172  %vec = load <4 x float>, <4 x float>* %vp
173  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
174  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
175  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
176  ret <4 x float> %res
177}
; Merge-masked memory form: the <1,1,3,3> shuffle of the vector loaded from
; %vp is selected per lane on (%mask oeq 0.0) against the passthrough %vec2.
; The IR body is identical to the _mem_mask0 variant; only the symbol name
; differs.
; Expected codegen: vxorps + vcmpeqps into %k1, then vmovshdup with a memory
; source under {%k1} into xmm0.
178define <4 x float> @test_masked_4xfloat_dup_high_mem_mask1(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
179; CHECK-LABEL: test_masked_4xfloat_dup_high_mem_mask1:
180; CHECK:       # %bb.0:
181; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
182; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
183; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = mem[1,1,3,3]
184; CHECK-NEXT:    retq
185  %vec = load <4 x float>, <4 x float>* %vp
186  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
187  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
188  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
189  ret <4 x float> %res
190}
191
; Zero-masked memory form: the <1,1,3,3> shuffle of the vector loaded from
; %vp is selected per lane on (%mask oeq 0.0) against zero.
; The IR body is identical to the _mem_mask0 variant; only the symbol name
; differs.
; Expected codegen: vxorps + vcmpeqps into %k1, then vmovshdup with a memory
; source and {%k1} {z} on xmm0.
192define <4 x float> @test_masked_z_4xfloat_dup_high_mem_mask1(<4 x float>* %vp, <4 x float> %mask) {
193; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mem_mask1:
194; CHECK:       # %bb.0:
195; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
196; CHECK-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
197; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = mem[1,1,3,3]
198; CHECK-NEXT:    retq
199  %vec = load <4 x float>, <4 x float>* %vp
200  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
201  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
202  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
203  ret <4 x float> %res
204}
; Merge-masked memory form: the <1,1,3,3> shuffle of the vector loaded from
; %vp is selected per lane on (%mask oeq 0.0) against the passthrough %vec2.
; The IR body is identical to the _mem_mask0 variant; only the symbol name
; differs.
; Expected codegen: vxorps + vcmpeqps into %k1, then vmovshdup with a memory
; source under {%k1} into xmm0.
205define <4 x float> @test_masked_4xfloat_dup_high_mem_mask2(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
206; CHECK-LABEL: test_masked_4xfloat_dup_high_mem_mask2:
207; CHECK:       # %bb.0:
208; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
209; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
210; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = mem[1,1,3,3]
211; CHECK-NEXT:    retq
212  %vec = load <4 x float>, <4 x float>* %vp
213  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
214  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
215  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
216  ret <4 x float> %res
217}
218
; Zero-masked memory form: the <1,1,3,3> shuffle of the vector loaded from
; %vp is selected per lane on (%mask oeq 0.0) against zero.
; The IR body is identical to the _mem_mask0 variant; only the symbol name
; differs.
; Expected codegen: vxorps + vcmpeqps into %k1, then vmovshdup with a memory
; source and {%k1} {z} on xmm0.
219define <4 x float> @test_masked_z_4xfloat_dup_high_mem_mask2(<4 x float>* %vp, <4 x float> %mask) {
220; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mem_mask2:
221; CHECK:       # %bb.0:
222; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
223; CHECK-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
224; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = mem[1,1,3,3]
225; CHECK-NEXT:    retq
226  %vec = load <4 x float>, <4 x float>* %vp
227  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
228  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
229  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
230  ret <4 x float> %res
231}
; Merge-masked memory form: the <1,1,3,3> shuffle of the vector loaded from
; %vp is selected per lane on (%mask oeq 0.0) against the passthrough %vec2.
; The IR body is identical to the _mem_mask0 variant; only the symbol name
; differs.
; Expected codegen: vxorps + vcmpeqps into %k1, then vmovshdup with a memory
; source under {%k1} into xmm0.
232define <4 x float> @test_masked_4xfloat_dup_high_mem_mask3(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
233; CHECK-LABEL: test_masked_4xfloat_dup_high_mem_mask3:
234; CHECK:       # %bb.0:
235; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
236; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
237; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = mem[1,1,3,3]
238; CHECK-NEXT:    retq
239  %vec = load <4 x float>, <4 x float>* %vp
240  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
241  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
242  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
243  ret <4 x float> %res
244}
245
; Zero-masked memory form: the <1,1,3,3> shuffle of the vector loaded from
; %vp is selected per lane on (%mask oeq 0.0) against zero.
; The IR body is identical to the _mem_mask0 variant; only the symbol name
; differs.
; Expected codegen: vxorps + vcmpeqps into %k1, then vmovshdup with a memory
; source and {%k1} {z} on xmm0.
246define <4 x float> @test_masked_z_4xfloat_dup_high_mem_mask3(<4 x float>* %vp, <4 x float> %mask) {
247; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mem_mask3:
248; CHECK:       # %bb.0:
249; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
250; CHECK-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
251; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = mem[1,1,3,3]
252; CHECK-NEXT:    retq
253  %vec = load <4 x float>, <4 x float>* %vp
254  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
255  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
256  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
257  ret <4 x float> %res
258}
; Merge-masked memory form: the <1,1,3,3> shuffle of the vector loaded from
; %vp is selected per lane on (%mask oeq 0.0) against the passthrough %vec2.
; The IR body is identical to the _mem_mask0 variant; only the symbol name
; differs.
; Expected codegen: vxorps + vcmpeqps into %k1, then vmovshdup with a memory
; source under {%k1} into xmm0.
259define <4 x float> @test_masked_4xfloat_dup_high_mem_mask4(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
260; CHECK-LABEL: test_masked_4xfloat_dup_high_mem_mask4:
261; CHECK:       # %bb.0:
262; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
263; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
264; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = mem[1,1,3,3]
265; CHECK-NEXT:    retq
266  %vec = load <4 x float>, <4 x float>* %vp
267  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
268  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
269  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
270  ret <4 x float> %res
271}
272
; Zero-masked memory form: the <1,1,3,3> shuffle of the vector loaded from
; %vp is selected per lane on (%mask oeq 0.0) against zero.
; The IR body is identical to the _mem_mask0 variant; only the symbol name
; differs.
; Expected codegen: vxorps + vcmpeqps into %k1, then vmovshdup with a memory
; source and {%k1} {z} on xmm0.
273define <4 x float> @test_masked_z_4xfloat_dup_high_mem_mask4(<4 x float>* %vp, <4 x float> %mask) {
274; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mem_mask4:
275; CHECK:       # %bb.0:
276; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
277; CHECK-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
278; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = mem[1,1,3,3]
279; CHECK-NEXT:    retq
280  %vec = load <4 x float>, <4 x float>* %vp
281  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
282  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
283  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
284  ret <4 x float> %res
285}
; Unmasked register form, 8 lanes: shuffle %vec with mask
; <1,1,3,3,5,5,7,7>, so each odd-indexed float is duplicated into the even
; lane below it. Expected codegen is a single vmovshdup on ymm0.
286define <8 x float> @test_8xfloat_dup_high(<8 x float> %vec) {
287; CHECK-LABEL: test_8xfloat_dup_high:
288; CHECK:       # %bb.0:
289; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
290; CHECK-NEXT:    retq
291  %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
292  ret <8 x float> %res
293}
; Merge-masked register form, 8 lanes: lanes where %mask compares
; ordered-equal to zero take the <1,1,3,3,5,5,7,7> shuffle of %vec; the
; remaining lanes keep %vec2.
; Expected codegen builds %k1 with vxorps + vcmpeqps, applies vmovshdup
; under {%k1} into ymm1, then copies ymm1 to ymm0.
294define <8 x float> @test_masked_8xfloat_dup_high_mask0(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
295; CHECK-LABEL: test_masked_8xfloat_dup_high_mask0:
296; CHECK:       # %bb.0:
297; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
298; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
299; CHECK-NEXT:    vmovshdup {{.*#+}} ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
300; CHECK-NEXT:    vmovaps %ymm1, %ymm0
301; CHECK-NEXT:    retq
302  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
303  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
304  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
305  ret <8 x float> %res
306}
307
; Zero-masked register form, 8 lanes: lanes where %mask compares
; ordered-equal to zero take the <1,1,3,3,5,5,7,7> shuffle of %vec; the
; remaining lanes are zero.
; Expected codegen builds %k1 with vxorps + vcmpeqps and emits vmovshdup
; with {%k1} {z} on ymm0.
308define <8 x float> @test_masked_z_8xfloat_dup_high_mask0(<8 x float> %vec, <8 x float> %mask) {
309; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mask0:
310; CHECK:       # %bb.0:
311; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
312; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
313; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
314; CHECK-NEXT:    retq
315  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
316  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
317  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
318  ret <8 x float> %res
319}
; Merge-masked register form, 8 lanes: per-lane select on (%mask oeq 0.0)
; between the <1,1,3,3,5,5,7,7> shuffle of %vec and the passthrough %vec2.
; The IR body is identical to the _mask0 variant; only the symbol name differs.
; Expected codegen: vxorps + vcmpeqps into %k1, vmovshdup under {%k1} into
; ymm1, then a copy of ymm1 to ymm0.
320define <8 x float> @test_masked_8xfloat_dup_high_mask1(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
321; CHECK-LABEL: test_masked_8xfloat_dup_high_mask1:
322; CHECK:       # %bb.0:
323; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
324; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
325; CHECK-NEXT:    vmovshdup {{.*#+}} ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
326; CHECK-NEXT:    vmovaps %ymm1, %ymm0
327; CHECK-NEXT:    retq
328  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
329  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
330  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
331  ret <8 x float> %res
332}
333
; Zero-masked register form, 8 lanes: per-lane select on (%mask oeq 0.0)
; between the <1,1,3,3,5,5,7,7> shuffle of %vec and zero.
; The IR body is identical to the _mask0 variant; only the symbol name differs.
; Expected codegen: vxorps + vcmpeqps into %k1, then vmovshdup with
; {%k1} {z} on ymm0.
334define <8 x float> @test_masked_z_8xfloat_dup_high_mask1(<8 x float> %vec, <8 x float> %mask) {
335; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mask1:
336; CHECK:       # %bb.0:
337; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
338; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
339; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
340; CHECK-NEXT:    retq
341  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
342  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
343  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
344  ret <8 x float> %res
345}
; Merge-masked register form, 8 lanes: per-lane select on (%mask oeq 0.0)
; between the <1,1,3,3,5,5,7,7> shuffle of %vec and the passthrough %vec2.
; The IR body is identical to the _mask0 variant; only the symbol name differs.
; Expected codegen: vxorps + vcmpeqps into %k1, vmovshdup under {%k1} into
; ymm1, then a copy of ymm1 to ymm0.
346define <8 x float> @test_masked_8xfloat_dup_high_mask2(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
347; CHECK-LABEL: test_masked_8xfloat_dup_high_mask2:
348; CHECK:       # %bb.0:
349; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
350; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
351; CHECK-NEXT:    vmovshdup {{.*#+}} ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
352; CHECK-NEXT:    vmovaps %ymm1, %ymm0
353; CHECK-NEXT:    retq
354  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
355  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
356  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
357  ret <8 x float> %res
358}
359
; Zero-masked register form, 8 lanes: per-lane select on (%mask oeq 0.0)
; between the <1,1,3,3,5,5,7,7> shuffle of %vec and zero.
; The IR body is identical to the _mask0 variant; only the symbol name differs.
; Expected codegen: vxorps + vcmpeqps into %k1, then vmovshdup with
; {%k1} {z} on ymm0.
360define <8 x float> @test_masked_z_8xfloat_dup_high_mask2(<8 x float> %vec, <8 x float> %mask) {
361; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mask2:
362; CHECK:       # %bb.0:
363; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
364; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
365; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
366; CHECK-NEXT:    retq
367  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
368  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
369  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
370  ret <8 x float> %res
371}
; Merge-masked register form, 8 lanes: per-lane select on (%mask oeq 0.0)
; between the <1,1,3,3,5,5,7,7> shuffle of %vec and the passthrough %vec2.
; The IR body is identical to the _mask0 variant; only the symbol name differs.
; Expected codegen: vxorps + vcmpeqps into %k1, vmovshdup under {%k1} into
; ymm1, then a copy of ymm1 to ymm0.
372define <8 x float> @test_masked_8xfloat_dup_high_mask3(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
373; CHECK-LABEL: test_masked_8xfloat_dup_high_mask3:
374; CHECK:       # %bb.0:
375; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
376; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
377; CHECK-NEXT:    vmovshdup {{.*#+}} ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
378; CHECK-NEXT:    vmovaps %ymm1, %ymm0
379; CHECK-NEXT:    retq
380  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
381  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
382  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
383  ret <8 x float> %res
384}
385
; Zero-masked register form, 8 lanes: per-lane select on (%mask oeq 0.0)
; between the <1,1,3,3,5,5,7,7> shuffle of %vec and zero.
; The IR body is identical to the _mask0 variant; only the symbol name differs.
; Expected codegen: vxorps + vcmpeqps into %k1, then vmovshdup with
; {%k1} {z} on ymm0.
386define <8 x float> @test_masked_z_8xfloat_dup_high_mask3(<8 x float> %vec, <8 x float> %mask) {
387; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mask3:
388; CHECK:       # %bb.0:
389; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
390; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
391; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
392; CHECK-NEXT:    retq
393  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
394  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
395  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
396  ret <8 x float> %res
397}
; Merge-masked register form, 8 lanes: per-lane select on (%mask oeq 0.0)
; between the <1,1,3,3,5,5,7,7> shuffle of %vec and the passthrough %vec2.
; The IR body is identical to the _mask0 variant; only the symbol name differs.
; Expected codegen: vxorps + vcmpeqps into %k1, vmovshdup under {%k1} into
; ymm1, then a copy of ymm1 to ymm0.
398define <8 x float> @test_masked_8xfloat_dup_high_mask4(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
399; CHECK-LABEL: test_masked_8xfloat_dup_high_mask4:
400; CHECK:       # %bb.0:
401; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
402; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
403; CHECK-NEXT:    vmovshdup {{.*#+}} ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
404; CHECK-NEXT:    vmovaps %ymm1, %ymm0
405; CHECK-NEXT:    retq
406  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
407  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
408  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
409  ret <8 x float> %res
410}
411
; Zero-masked register form, 8 lanes: per-lane select on (%mask oeq 0.0)
; between the <1,1,3,3,5,5,7,7> shuffle of %vec and zero.
; The IR body is identical to the _mask0 variant; only the symbol name differs.
; Expected codegen: vxorps + vcmpeqps into %k1, then vmovshdup with
; {%k1} {z} on ymm0.
412define <8 x float> @test_masked_z_8xfloat_dup_high_mask4(<8 x float> %vec, <8 x float> %mask) {
413; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mask4:
414; CHECK:       # %bb.0:
415; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
416; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
417; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
418; CHECK-NEXT:    retq
419  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
420  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
421  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
422  ret <8 x float> %res
423}
; Unmasked memory form, 8 lanes: load <8 x float> from %vp and shuffle it
; with mask <1,1,3,3,5,5,7,7>. Expected codegen is a single vmovshdup with a
; memory source operand and no separate load instruction.
424define <8 x float> @test_8xfloat_dup_high_mem(<8 x float>* %vp) {
425; CHECK-LABEL: test_8xfloat_dup_high_mem:
426; CHECK:       # %bb.0:
427; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 = mem[1,1,3,3,5,5,7,7]
428; CHECK-NEXT:    retq
429  %vec = load <8 x float>, <8 x float>* %vp
430  %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
431  ret <8 x float> %res
432}
; Merge-masked memory form, 8 lanes: load <8 x float> from %vp, shuffle it
; with <1,1,3,3,5,5,7,7>, and select per lane on (%mask oeq 0.0) between the
; shuffle and the passthrough %vec2.
; Expected codegen builds %k1 with vxorps + vcmpeqps and emits vmovshdup
; with a memory source under {%k1} directly into ymm0 (no trailing copy).
433define <8 x float> @test_masked_8xfloat_dup_high_mem_mask0(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
434; CHECK-LABEL: test_masked_8xfloat_dup_high_mem_mask0:
435; CHECK:       # %bb.0:
436; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
437; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
438; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = mem[1,1,3,3,5,5,7,7]
439; CHECK-NEXT:    retq
440  %vec = load <8 x float>, <8 x float>* %vp
441  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
442  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
443  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
444  ret <8 x float> %res
445}
446
; Zero-masked memory form, 8 lanes: load <8 x float> from %vp, shuffle it
; with <1,1,3,3,5,5,7,7>, and select per lane on (%mask oeq 0.0) between the
; shuffle and zero.
; Expected codegen builds %k1 with vxorps + vcmpeqps and emits vmovshdup
; with a memory source and {%k1} {z} on ymm0.
447define <8 x float> @test_masked_z_8xfloat_dup_high_mem_mask0(<8 x float>* %vp, <8 x float> %mask) {
448; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mem_mask0:
449; CHECK:       # %bb.0:
450; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
451; CHECK-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
452; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7]
453; CHECK-NEXT:    retq
454  %vec = load <8 x float>, <8 x float>* %vp
455  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
456  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
457  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
458  ret <8 x float> %res
459}
; Merge-masked memory form, 8 lanes: the <1,1,3,3,5,5,7,7> shuffle of the
; vector loaded from %vp is selected per lane on (%mask oeq 0.0) against the
; passthrough %vec2.
; The IR body is identical to the _mem_mask0 variant; only the symbol name
; differs.
; Expected codegen: vxorps + vcmpeqps into %k1, then vmovshdup with a memory
; source under {%k1} into ymm0.
460define <8 x float> @test_masked_8xfloat_dup_high_mem_mask1(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
461; CHECK-LABEL: test_masked_8xfloat_dup_high_mem_mask1:
462; CHECK:       # %bb.0:
463; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
464; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
465; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = mem[1,1,3,3,5,5,7,7]
466; CHECK-NEXT:    retq
467  %vec = load <8 x float>, <8 x float>* %vp
468  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
469  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
470  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
471  ret <8 x float> %res
472}
473
; Zero-masked memory form, 8 lanes: the <1,1,3,3,5,5,7,7> shuffle of the
; vector loaded from %vp is selected per lane on (%mask oeq 0.0) against zero.
; The IR body is identical to the _mem_mask0 variant; only the symbol name
; differs.
; Expected codegen: vxorps + vcmpeqps into %k1, then vmovshdup with a memory
; source and {%k1} {z} on ymm0.
474define <8 x float> @test_masked_z_8xfloat_dup_high_mem_mask1(<8 x float>* %vp, <8 x float> %mask) {
475; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mem_mask1:
476; CHECK:       # %bb.0:
477; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
478; CHECK-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
479; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7]
480; CHECK-NEXT:    retq
481  %vec = load <8 x float>, <8 x float>* %vp
482  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
483  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
484  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
485  ret <8 x float> %res
486}
; Merge-masked memory form, 8 lanes: the <1,1,3,3,5,5,7,7> shuffle of the
; vector loaded from %vp is selected per lane on (%mask oeq 0.0) against the
; passthrough %vec2.
; The IR body is identical to the _mem_mask0 variant; only the symbol name
; differs.
; Expected codegen: vxorps + vcmpeqps into %k1, then vmovshdup with a memory
; source under {%k1} into ymm0.
487define <8 x float> @test_masked_8xfloat_dup_high_mem_mask2(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
488; CHECK-LABEL: test_masked_8xfloat_dup_high_mem_mask2:
489; CHECK:       # %bb.0:
490; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
491; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
492; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = mem[1,1,3,3,5,5,7,7]
493; CHECK-NEXT:    retq
494  %vec = load <8 x float>, <8 x float>* %vp
495  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
496  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
497  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
498  ret <8 x float> %res
499}
500
; Zero-masked memory form, 8 lanes: the <1,1,3,3,5,5,7,7> shuffle of the
; vector loaded from %vp is selected per lane on (%mask oeq 0.0) against zero.
; The IR body is identical to the _mem_mask0 variant; only the symbol name
; differs.
; Expected codegen: vxorps + vcmpeqps into %k1, then vmovshdup with a memory
; source and {%k1} {z} on ymm0.
501define <8 x float> @test_masked_z_8xfloat_dup_high_mem_mask2(<8 x float>* %vp, <8 x float> %mask) {
502; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mem_mask2:
503; CHECK:       # %bb.0:
504; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
505; CHECK-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
506; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7]
507; CHECK-NEXT:    retq
508  %vec = load <8 x float>, <8 x float>* %vp
509  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
510  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
511  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
512  ret <8 x float> %res
513}
; Merge-masked memory form, 8 lanes: the <1,1,3,3,5,5,7,7> shuffle of the
; vector loaded from %vp is selected per lane on (%mask oeq 0.0) against the
; passthrough %vec2.
; The IR body is identical to the _mem_mask0 variant; only the symbol name
; differs.
; Expected codegen: vxorps + vcmpeqps into %k1, then vmovshdup with a memory
; source under {%k1} into ymm0.
514define <8 x float> @test_masked_8xfloat_dup_high_mem_mask3(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
515; CHECK-LABEL: test_masked_8xfloat_dup_high_mem_mask3:
516; CHECK:       # %bb.0:
517; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
518; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
519; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = mem[1,1,3,3,5,5,7,7]
520; CHECK-NEXT:    retq
521  %vec = load <8 x float>, <8 x float>* %vp
522  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
523  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
524  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
525  ret <8 x float> %res
526}
527
; Zero-masked memory form, 8 lanes: the <1,1,3,3,5,5,7,7> shuffle of the
; vector loaded from %vp is selected per lane on (%mask oeq 0.0) against zero.
; The IR body is identical to the _mem_mask0 variant; only the symbol name
; differs.
; Expected codegen: vxorps + vcmpeqps into %k1, then vmovshdup with a memory
; source and {%k1} {z} on ymm0.
528define <8 x float> @test_masked_z_8xfloat_dup_high_mem_mask3(<8 x float>* %vp, <8 x float> %mask) {
529; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mem_mask3:
530; CHECK:       # %bb.0:
531; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
532; CHECK-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
533; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7]
534; CHECK-NEXT:    retq
535  %vec = load <8 x float>, <8 x float>* %vp
536  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
537  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
538  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
539  ret <8 x float> %res
540}
; Merge-masked memory form, 8 lanes: the <1,1,3,3,5,5,7,7> shuffle of the
; vector loaded from %vp is selected per lane on (%mask oeq 0.0) against the
; passthrough %vec2.
; The IR body is identical to the _mem_mask0 variant; only the symbol name
; differs.
; Expected codegen: vxorps + vcmpeqps into %k1, then vmovshdup with a memory
; source under {%k1} into ymm0.
541define <8 x float> @test_masked_8xfloat_dup_high_mem_mask4(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
542; CHECK-LABEL: test_masked_8xfloat_dup_high_mem_mask4:
543; CHECK:       # %bb.0:
544; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
545; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
546; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = mem[1,1,3,3,5,5,7,7]
547; CHECK-NEXT:    retq
548  %vec = load <8 x float>, <8 x float>* %vp
549  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
550  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
551  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
552  ret <8 x float> %res
553}
554
; Zero-masked memory form, 8 lanes: the <1,1,3,3,5,5,7,7> shuffle of the
; vector loaded from %vp is selected per lane on (%mask oeq 0.0) against zero.
; The IR body is identical to the _mem_mask0 variant; only the symbol name
; differs.
; Expected codegen: vxorps + vcmpeqps into %k1, then vmovshdup with a memory
; source and {%k1} {z} on ymm0.
555define <8 x float> @test_masked_z_8xfloat_dup_high_mem_mask4(<8 x float>* %vp, <8 x float> %mask) {
556; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mem_mask4:
557; CHECK:       # %bb.0:
558; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
559; CHECK-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
560; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7]
561; CHECK-NEXT:    retq
562  %vec = load <8 x float>, <8 x float>* %vp
563  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
564  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
565  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
566  ret <8 x float> %res
567}
; Unmasked register form, 16 lanes: shuffle %vec with mask
; <1,1,3,3,...,15,15>, so each odd-indexed float is duplicated into the even
; lane below it. Expected codegen is a single vmovshdup on zmm0.
568define <16 x float> @test_16xfloat_dup_high(<16 x float> %vec) {
569; CHECK-LABEL: test_16xfloat_dup_high:
570; CHECK:       # %bb.0:
571; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
572; CHECK-NEXT:    retq
573  %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
574  ret <16 x float> %res
575}
; Merge-masked register form, 16 lanes: lanes where %mask compares
; ordered-equal to zero take the <1,1,3,3,...,15,15> shuffle of %vec; the
; remaining lanes keep %vec2.
; Expected codegen builds %k1 with vxorps + vcmpeqps, applies vmovshdup
; under {%k1} into zmm1, then copies zmm1 to zmm0.
576define <16 x float> @test_masked_16xfloat_dup_high_mask0(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
577; CHECK-LABEL: test_masked_16xfloat_dup_high_mask0:
578; CHECK:       # %bb.0:
579; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
580; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
581; CHECK-NEXT:    vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
582; CHECK-NEXT:    vmovaps %zmm1, %zmm0
583; CHECK-NEXT:    retq
584  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
585  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
586  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
587  ret <16 x float> %res
588}
589
; Zero-masked register form, 16 lanes: lanes where %mask compares
; ordered-equal to zero take the <1,1,3,3,...,15,15> shuffle of %vec; the
; remaining lanes are zero.
; Expected codegen builds %k1 with vxorps + vcmpeqps and emits vmovshdup
; with {%k1} {z} on zmm0.
590define <16 x float> @test_masked_z_16xfloat_dup_high_mask0(<16 x float> %vec, <16 x float> %mask) {
591; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mask0:
592; CHECK:       # %bb.0:
593; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
594; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
595; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
596; CHECK-NEXT:    retq
597  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
598  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
599  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
600  ret <16 x float> %res
601}
; Merge-masked register form, 16 lanes: per-lane select on (%mask oeq 0.0)
; between the <1,1,3,3,...,15,15> shuffle of %vec and the passthrough %vec2.
; The IR body is identical to the _mask0 variant; only the symbol name differs.
; Expected codegen: vxorps + vcmpeqps into %k1, vmovshdup under {%k1} into
; zmm1, then a copy of zmm1 to zmm0.
602define <16 x float> @test_masked_16xfloat_dup_high_mask1(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
603; CHECK-LABEL: test_masked_16xfloat_dup_high_mask1:
604; CHECK:       # %bb.0:
605; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
606; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
607; CHECK-NEXT:    vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
608; CHECK-NEXT:    vmovaps %zmm1, %zmm0
609; CHECK-NEXT:    retq
610  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
611  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
612  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
613  ret <16 x float> %res
614}
615
; Zero-masked 16 x float "duplicate high" shuffle with a register source:
; odd-indexed elements of %vec are duplicated pairwise, and lanes where %mask
; is not ordered-equal to zero become zero.
; NOTE(review): the IR body and expected assembly are the same as in
; test_masked_z_16xfloat_dup_high_mask0; only the function name differs.
616define <16 x float> @test_masked_z_16xfloat_dup_high_mask1(<16 x float> %vec, <16 x float> %mask) {
617; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mask1:
618; CHECK:       # %bb.0:
619; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
620; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
621; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
622; CHECK-NEXT:    retq
623  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
624  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
625  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
626  ret <16 x float> %res
627}
; Merge-masked 16 x float "duplicate high" shuffle with a register source:
; odd-indexed elements of %vec are duplicated pairwise, and lanes where %mask
; is not ordered-equal to zero fall back to %vec2.
; NOTE(review): the IR body and expected assembly are the same as in
; test_masked_16xfloat_dup_high_mask0; only the function name differs.
628define <16 x float> @test_masked_16xfloat_dup_high_mask2(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
629; CHECK-LABEL: test_masked_16xfloat_dup_high_mask2:
630; CHECK:       # %bb.0:
631; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
632; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
633; CHECK-NEXT:    vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
634; CHECK-NEXT:    vmovaps %zmm1, %zmm0
635; CHECK-NEXT:    retq
636  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
637  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
638  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
639  ret <16 x float> %res
640}
641
; Zero-masked 16 x float "duplicate high" shuffle with a register source:
; odd-indexed elements of %vec are duplicated pairwise, and lanes where %mask
; is not ordered-equal to zero become zero.
; NOTE(review): the IR body and expected assembly are the same as in
; test_masked_z_16xfloat_dup_high_mask0; only the function name differs.
642define <16 x float> @test_masked_z_16xfloat_dup_high_mask2(<16 x float> %vec, <16 x float> %mask) {
643; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mask2:
644; CHECK:       # %bb.0:
645; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
646; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
647; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
648; CHECK-NEXT:    retq
649  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
650  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
651  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
652  ret <16 x float> %res
653}
; Merge-masked 16 x float "duplicate high" shuffle with a register source:
; odd-indexed elements of %vec are duplicated pairwise, and lanes where %mask
; is not ordered-equal to zero fall back to %vec2.
; NOTE(review): the IR body and expected assembly are the same as in
; test_masked_16xfloat_dup_high_mask0; only the function name differs.
654define <16 x float> @test_masked_16xfloat_dup_high_mask3(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
655; CHECK-LABEL: test_masked_16xfloat_dup_high_mask3:
656; CHECK:       # %bb.0:
657; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
658; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
659; CHECK-NEXT:    vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
660; CHECK-NEXT:    vmovaps %zmm1, %zmm0
661; CHECK-NEXT:    retq
662  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
663  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
664  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
665  ret <16 x float> %res
666}
667
; Zero-masked 16 x float "duplicate high" shuffle with a register source:
; odd-indexed elements of %vec are duplicated pairwise, and lanes where %mask
; is not ordered-equal to zero become zero.
; NOTE(review): the IR body and expected assembly are the same as in
; test_masked_z_16xfloat_dup_high_mask0; only the function name differs.
668define <16 x float> @test_masked_z_16xfloat_dup_high_mask3(<16 x float> %vec, <16 x float> %mask) {
669; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mask3:
670; CHECK:       # %bb.0:
671; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
672; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
673; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
674; CHECK-NEXT:    retq
675  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
676  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
677  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
678  ret <16 x float> %res
679}
; Merge-masked 16 x float "duplicate high" shuffle with a register source:
; odd-indexed elements of %vec are duplicated pairwise, and lanes where %mask
; is not ordered-equal to zero fall back to %vec2.
; NOTE(review): the IR body and expected assembly are the same as in
; test_masked_16xfloat_dup_high_mask0; only the function name differs.
680define <16 x float> @test_masked_16xfloat_dup_high_mask4(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
681; CHECK-LABEL: test_masked_16xfloat_dup_high_mask4:
682; CHECK:       # %bb.0:
683; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
684; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
685; CHECK-NEXT:    vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
686; CHECK-NEXT:    vmovaps %zmm1, %zmm0
687; CHECK-NEXT:    retq
688  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
689  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
690  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
691  ret <16 x float> %res
692}
693
; Zero-masked 16 x float "duplicate high" shuffle with a register source:
; odd-indexed elements of %vec are duplicated pairwise, and lanes where %mask
; is not ordered-equal to zero become zero.
; NOTE(review): the IR body and expected assembly are the same as in
; test_masked_z_16xfloat_dup_high_mask0; only the function name differs.
694define <16 x float> @test_masked_z_16xfloat_dup_high_mask4(<16 x float> %vec, <16 x float> %mask) {
695; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mask4:
696; CHECK:       # %bb.0:
697; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
698; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
699; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
700; CHECK-NEXT:    retq
701  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
702  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
703  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
704  ret <16 x float> %res
705}
; Unmasked 16 x float "duplicate high" shuffle whose source is loaded from
; memory through %vp. The shuffle copies each odd-indexed element of the loaded
; vector into both lanes of its pair (1,1,3,3,...,15,15).
; Expected assembly: one vmovshdup with a memory source operand and no separate
; load instruction.
706define <16 x float> @test_16xfloat_dup_high_mem(<16 x float>* %vp) {
707; CHECK-LABEL: test_16xfloat_dup_high_mem:
708; CHECK:       # %bb.0:
709; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
710; CHECK-NEXT:    retq
711  %vec = load <16 x float>, <16 x float>* %vp
712  %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
713  ret <16 x float> %res
714}
; Merge-masked 16 x float "duplicate high" shuffle with a memory source.
; The vector is loaded through %vp, its odd-indexed elements are duplicated
; pairwise (1,1,3,3,...,15,15), and the select keeps the shuffled lane where
; %mask compares ordered-equal to zero, taking the lane from %vec2 otherwise.
; Expected assembly: a k1-masked vmovshdup with a memory source writing zmm0
; directly, with no separate load or register move.
715define <16 x float> @test_masked_16xfloat_dup_high_mem_mask0(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
716; CHECK-LABEL: test_masked_16xfloat_dup_high_mem_mask0:
717; CHECK:       # %bb.0:
718; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
719; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
720; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
721; CHECK-NEXT:    retq
722  %vec = load <16 x float>, <16 x float>* %vp
723  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
724  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
725  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
726  ret <16 x float> %res
727}
728
; Zero-masked 16 x float "duplicate high" shuffle with a memory source.
; The vector is loaded through %vp, its odd-indexed elements are duplicated
; pairwise (1,1,3,3,...,15,15), and the select keeps the shuffled lane where
; %mask compares ordered-equal to zero, producing zero otherwise.
; Expected assembly: a k1-masked vmovshdup with the {z} modifier and a memory
; source, with no separate load instruction.
729define <16 x float> @test_masked_z_16xfloat_dup_high_mem_mask0(<16 x float>* %vp, <16 x float> %mask) {
730; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mem_mask0:
731; CHECK:       # %bb.0:
732; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
733; CHECK-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
734; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
735; CHECK-NEXT:    retq
736  %vec = load <16 x float>, <16 x float>* %vp
737  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
738  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
739  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
740  ret <16 x float> %res
741}
; Merge-masked 16 x float "duplicate high" shuffle with a memory source:
; the vector loaded through %vp has its odd-indexed elements duplicated
; pairwise, and lanes where %mask is not ordered-equal to zero fall back to
; %vec2.
; NOTE(review): the IR body and expected assembly are the same as in
; test_masked_16xfloat_dup_high_mem_mask0; only the function name differs.
742define <16 x float> @test_masked_16xfloat_dup_high_mem_mask1(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
743; CHECK-LABEL: test_masked_16xfloat_dup_high_mem_mask1:
744; CHECK:       # %bb.0:
745; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
746; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
747; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
748; CHECK-NEXT:    retq
749  %vec = load <16 x float>, <16 x float>* %vp
750  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
751  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
752  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
753  ret <16 x float> %res
754}
755
; Zero-masked 16 x float "duplicate high" shuffle with a memory source:
; the vector loaded through %vp has its odd-indexed elements duplicated
; pairwise, and lanes where %mask is not ordered-equal to zero become zero.
; NOTE(review): the IR body and expected assembly are the same as in
; test_masked_z_16xfloat_dup_high_mem_mask0; only the function name differs.
756define <16 x float> @test_masked_z_16xfloat_dup_high_mem_mask1(<16 x float>* %vp, <16 x float> %mask) {
757; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mem_mask1:
758; CHECK:       # %bb.0:
759; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
760; CHECK-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
761; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
762; CHECK-NEXT:    retq
763  %vec = load <16 x float>, <16 x float>* %vp
764  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
765  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
766  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
767  ret <16 x float> %res
768}
; Merge-masked 16 x float "duplicate high" shuffle with a memory source:
; the vector loaded through %vp has its odd-indexed elements duplicated
; pairwise, and lanes where %mask is not ordered-equal to zero fall back to
; %vec2.
; NOTE(review): the IR body and expected assembly are the same as in
; test_masked_16xfloat_dup_high_mem_mask0; only the function name differs.
769define <16 x float> @test_masked_16xfloat_dup_high_mem_mask2(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
770; CHECK-LABEL: test_masked_16xfloat_dup_high_mem_mask2:
771; CHECK:       # %bb.0:
772; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
773; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
774; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
775; CHECK-NEXT:    retq
776  %vec = load <16 x float>, <16 x float>* %vp
777  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
778  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
779  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
780  ret <16 x float> %res
781}
782
; Zero-masked 16 x float "duplicate high" shuffle with a memory source:
; the vector loaded through %vp has its odd-indexed elements duplicated
; pairwise, and lanes where %mask is not ordered-equal to zero become zero.
; NOTE(review): the IR body and expected assembly are the same as in
; test_masked_z_16xfloat_dup_high_mem_mask0; only the function name differs.
783define <16 x float> @test_masked_z_16xfloat_dup_high_mem_mask2(<16 x float>* %vp, <16 x float> %mask) {
784; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mem_mask2:
785; CHECK:       # %bb.0:
786; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
787; CHECK-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
788; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
789; CHECK-NEXT:    retq
790  %vec = load <16 x float>, <16 x float>* %vp
791  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
792  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
793  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
794  ret <16 x float> %res
795}
; Merge-masked 16 x float "duplicate high" shuffle with a memory source:
; the vector loaded through %vp has its odd-indexed elements duplicated
; pairwise, and lanes where %mask is not ordered-equal to zero fall back to
; %vec2.
; NOTE(review): the IR body and expected assembly are the same as in
; test_masked_16xfloat_dup_high_mem_mask0; only the function name differs.
796define <16 x float> @test_masked_16xfloat_dup_high_mem_mask3(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
797; CHECK-LABEL: test_masked_16xfloat_dup_high_mem_mask3:
798; CHECK:       # %bb.0:
799; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
800; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
801; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
802; CHECK-NEXT:    retq
803  %vec = load <16 x float>, <16 x float>* %vp
804  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
805  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
806  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
807  ret <16 x float> %res
808}
809
; Zero-masked 16 x float "duplicate high" shuffle with a memory source:
; the vector loaded through %vp has its odd-indexed elements duplicated
; pairwise, and lanes where %mask is not ordered-equal to zero become zero.
; NOTE(review): the IR body and expected assembly are the same as in
; test_masked_z_16xfloat_dup_high_mem_mask0; only the function name differs.
810define <16 x float> @test_masked_z_16xfloat_dup_high_mem_mask3(<16 x float>* %vp, <16 x float> %mask) {
811; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mem_mask3:
812; CHECK:       # %bb.0:
813; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
814; CHECK-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
815; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
816; CHECK-NEXT:    retq
817  %vec = load <16 x float>, <16 x float>* %vp
818  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
819  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
820  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
821  ret <16 x float> %res
822}
; Merge-masked 16 x float "duplicate high" shuffle with a memory source:
; the vector loaded through %vp has its odd-indexed elements duplicated
; pairwise, and lanes where %mask is not ordered-equal to zero fall back to
; %vec2.
; NOTE(review): the IR body and expected assembly are the same as in
; test_masked_16xfloat_dup_high_mem_mask0; only the function name differs.
823define <16 x float> @test_masked_16xfloat_dup_high_mem_mask4(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
824; CHECK-LABEL: test_masked_16xfloat_dup_high_mem_mask4:
825; CHECK:       # %bb.0:
826; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
827; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
828; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
829; CHECK-NEXT:    retq
830  %vec = load <16 x float>, <16 x float>* %vp
831  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
832  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
833  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
834  ret <16 x float> %res
835}
836
; Zero-masked 16 x float "duplicate high" shuffle with a memory source:
; the vector loaded through %vp has its odd-indexed elements duplicated
; pairwise, and lanes where %mask is not ordered-equal to zero become zero.
; NOTE(review): the IR body and expected assembly are the same as in
; test_masked_z_16xfloat_dup_high_mem_mask0; only the function name differs.
837define <16 x float> @test_masked_z_16xfloat_dup_high_mem_mask4(<16 x float>* %vp, <16 x float> %mask) {
838; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mem_mask4:
839; CHECK:       # %bb.0:
840; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
841; CHECK-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
842; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
843; CHECK-NEXT:    retq
844  %vec = load <16 x float>, <16 x float>* %vp
845  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
846  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
847  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
848  ret <16 x float> %res
849}
850