; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s

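; Check that vector fcmp/icmp feeding a select is lowered to AVX-512
; compare-into-mask instructions (vcmp*/vpcmp*) followed by masked
; moves or blends when targeting KNL.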
define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind {
; CHECK-LABEL: test1:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcmpleps %zmm1, %zmm0, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %mask = fcmp ole <16 x float> %x, %y
  %max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
  ret <16 x float> %max
}

define <8 x double> @test2(<8 x double> %x, <8 x double> %y) nounwind {
; CHECK-LABEL: test2:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcmplepd %zmm1, %zmm0, %k1
; CHECK-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %mask = fcmp ole <8 x double> %x, %y
  %max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y
  ret <8 x double> %max
}

define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwind {
; CHECK-LABEL: test3:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpeqd (%rdi), %zmm0, %k1
; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %y = load <16 x i32>, <16 x i32>* %yp, align 4
  %mask = icmp eq <16 x i32> %x, %y
  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
  ret <16 x i32> %max
}

define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1) nounwind {
; CHECK-LABEL: test4_unsigned:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpnltud %zmm1, %zmm0, %k1
; CHECK-NEXT:    vmovdqa32 %zmm2, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %mask = icmp uge <16 x i32> %x, %y
  %max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
  ret <16 x i32> %max
}

define <8 x i64> @test5(<8 x i64> %x, <8 x i64> %y) nounwind {
; CHECK-LABEL: test5:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %mask = icmp eq <8 x i64> %x, %y
  %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
  ret <8 x i64> %max
}

define <8 x i64> @test6_unsigned(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1) nounwind {
; CHECK-LABEL: test6_unsigned:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k1
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %mask = icmp ugt <8 x i64> %x, %y
  %max = select <8 x i1> %mask, <8 x i64> %x1, <8 x i64> %y
  ret <8 x i64> %max
}

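; KNL has no AVX512VL, so the 128-bit compares below stay on XMM registers
; and use the legacy vblendv form instead of a mask register.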
define <4 x float> @test7(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: test7:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpltps %xmm2, %xmm0, %xmm2
; CHECK-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %mask = fcmp olt <4 x float> %a, zeroinitializer
  %c = select <4 x i1>%mask, <4 x float>%a, <4 x float>%b
  ret <4 x float>%c
}

define <2 x double> @test8(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: test8:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpltpd %xmm2, %xmm0, %xmm2
; CHECK-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %mask = fcmp olt <2 x double> %a, zeroinitializer
  %c = select <2 x i1>%mask, <2 x double>%a, <2 x double>%b
  ret <2 x double>%c
}

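; 256-bit operands are widened to ZMM registers (see the kill comments) so the
; compare can still produce a mask and feed a masked blend.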
define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind {
; CHECK-LABEL: test9:
; CHECK:       ## BB#0:
; CHECK-NEXT:      ## kill: YMM1<def> YMM1<kill> ZMM1<def>
; CHECK-NEXT:      ## kill: YMM0<def> YMM0<kill> ZMM0<def>
; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
; CHECK-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT:      ## kill: YMM0<def> YMM0<kill> ZMM0<kill>
; CHECK-NEXT:    retq
  %mask = icmp eq <8 x i32> %x, %y
  %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
  ret <8 x i32> %max
}

define <8 x float> @test10(<8 x float> %x, <8 x float> %y) nounwind {
; CHECK-LABEL: test10:
; CHECK:       ## BB#0:
; CHECK-NEXT:      ## kill: YMM1<def> YMM1<kill> ZMM1<def>
; CHECK-NEXT:      ## kill: YMM0<def> YMM0<kill> ZMM0<def>
; CHECK-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; CHECK-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT:      ## kill: YMM0<def> YMM0<kill> ZMM0<kill>
; CHECK-NEXT:    retq
  %mask = fcmp oeq <8 x float> %x, %y
  %max = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
  ret <8 x float> %max
}

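; icmp ugt + select of the same operands is recognized as an unsigned max.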
define <8 x i32> @test11_unsigned(<8 x i32> %x, <8 x i32> %y) nounwind {
; CHECK-LABEL: test11_unsigned:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %mask = icmp ugt <8 x i32> %x, %y
  %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
  ret <8 x i32> %max
}

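; A <16 x i64> compare is split into two 512-bit halves; the two 8-bit masks
; are concatenated with kunpckbw before the bitcast to i16.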
define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind {
; CHECK-LABEL: test12:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
; CHECK-NEXT:    kunpckbw %k0, %k1, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:      ## kill: AX<def> AX<kill> EAX<kill>
; CHECK-NEXT:    retq
  %res = icmp eq <16 x i64> %a, %b
  %res1 = bitcast <16 x i1> %res to i16
  ret i16 %res1
}

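; Zero-extending a compare mask becomes a zero-masked broadcast of the
; splat constant (1) under the compare's mask register.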
define <16 x i32> @test13(<16 x float>%a, <16 x float>%b)
; CHECK-LABEL: test13:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
{
  %cmpvector_i = fcmp oeq <16 x float> %a, %b
  %conv = zext <16 x i1> %cmpvector_i to <16 x i32>
  ret <16 x i32> %conv
}

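; Note: in the next two tests the mask is inverted twice (back-to-back knotw),
; which is redundant.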
define <16 x i32> @test14(<16 x i32>%a, <16 x i32>%b) {
; CHECK-LABEL: test14:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm1
; CHECK-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
; CHECK-NEXT:    knotw %k0, %k0
; CHECK-NEXT:    knotw %k0, %k1
; CHECK-NEXT:    vmovdqu32 %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %sub_r = sub <16 x i32> %a, %b
  %cmp.i2.i = icmp sgt <16 x i32> %sub_r, %a
  %sext.i3.i = sext <16 x i1> %cmp.i2.i to <16 x i32>
  %mask = icmp eq <16 x i32> %sext.i3.i, zeroinitializer
  %res = select <16 x i1> %mask, <16 x i32> zeroinitializer, <16 x i32> %sub_r
  ret <16 x i32>%res
}

define <8 x i64> @test15(<8 x i64>%a, <8 x i64>%b) {
; CHECK-LABEL: test15:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm1
; CHECK-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
; CHECK-NEXT:    knotw %k0, %k0
; CHECK-NEXT:    knotw %k0, %k1
; CHECK-NEXT:    vmovdqu64 %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %sub_r = sub <8 x i64> %a, %b
  %cmp.i2.i = icmp sgt <8 x i64> %sub_r, %a
  %sext.i3.i = sext <8 x i1> %cmp.i2.i to <8 x i64>
  %mask = icmp eq <8 x i64> %sext.i3.i, zeroinitializer
  %res = select <8 x i1> %mask, <8 x i64> zeroinitializer, <8 x i64> %sub_r
  ret <8 x i64>%res
}

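; icmp sge is commuted into a vpcmpled with the register operands swapped.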
define <16 x i32> @test16(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1) nounwind {
; CHECK-LABEL: test16:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpled %zmm0, %zmm1, %k1
; CHECK-NEXT:    vmovdqa32 %zmm2, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %mask = icmp sge <16 x i32> %x, %y
  %max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
  ret <16 x i32> %max
}

define <16 x i32> @test17(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind {
; CHECK-LABEL: test17:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpgtd (%rdi), %zmm0, %k1
; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
  %mask = icmp sgt <16 x i32> %x, %y
  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
  ret <16 x i32> %max
}

define <16 x i32> @test18(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind {
; CHECK-LABEL: test18:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpled (%rdi), %zmm0, %k1
; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
  %mask = icmp sle <16 x i32> %x, %y
  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
  ret <16 x i32> %max
}

define <16 x i32> @test19(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind {
; CHECK-LABEL: test19:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpleud (%rdi), %zmm0, %k1
; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
  %mask = icmp ule <16 x i32> %x, %y
  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
  ret <16 x i32> %max
}

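; When two compare results are ANDed together, the first mask is reused as a
; write-mask ({%k1}) on the second compare, so no separate kandw is needed.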
define <16 x i32> @test20(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i32> %y1) nounwind {
; CHECK-LABEL: test20:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 {%k1}
; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %mask1 = icmp eq <16 x i32> %x1, %y1
  %mask0 = icmp eq <16 x i32> %x, %y
  %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y
  ret <16 x i32> %max
}

define <8 x i64> @test21(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y1) nounwind {
; CHECK-LABEL: test21:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpleq %zmm1, %zmm0, %k1
; CHECK-NEXT:    vpcmpleq %zmm2, %zmm3, %k1 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %mask1 = icmp sge <8 x i64> %x1, %y1
  %mask0 = icmp sle <8 x i64> %x, %y
  %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
  %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1
  ret <8 x i64> %max
}

define <8 x i64> @test22(<8 x i64> %x, <8 x i64>* %y.ptr, <8 x i64> %x1, <8 x i64> %y1) nounwind {
; CHECK-LABEL: test22:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpgtq %zmm2, %zmm1, %k1
; CHECK-NEXT:    vpcmpgtq (%rdi), %zmm0, %k1 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %mask1 = icmp sgt <8 x i64> %x1, %y1
  %y = load <8 x i64>, <8 x i64>* %y.ptr, align 4
  %mask0 = icmp sgt <8 x i64> %x, %y
  %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
  %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1
  ret <8 x i64> %max
}

define <16 x i32> @test23(<16 x i32> %x, <16 x i32>* %y.ptr, <16 x i32> %x1, <16 x i32> %y1) nounwind {
; CHECK-LABEL: test23:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpled %zmm1, %zmm2, %k1
; CHECK-NEXT:    vpcmpleud (%rdi), %zmm0, %k1 {%k1}
; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %mask1 = icmp sge <16 x i32> %x1, %y1
  %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
  %mask0 = icmp ule <16 x i32> %x, %y
  %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
  ret <16 x i32> %max
}

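; A scalar loaded from memory and splatted is folded into the compare as a
; {1to8}/{1to16} broadcast memory operand.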
define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind {
; CHECK-LABEL: test24:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpeqq (%rdi){1to8}, %zmm0, %k1
; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %yb = load i64, i64* %yb.ptr, align 4
  %y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
  %y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer
  %mask = icmp eq <8 x i64> %x, %y
  %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1
  ret <8 x i64> %max
}

define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind {
; CHECK-LABEL: test25:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpled (%rdi){1to16}, %zmm0, %k1
; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %yb = load i32, i32* %yb.ptr, align 4
  %y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
  %y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer
  %mask = icmp sle <16 x i32> %x, %y
  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
  ret <16 x i32> %max
}

define <16 x i32> @test26(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32> %y1) nounwind {
; CHECK-LABEL: test26:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpled %zmm1, %zmm2, %k1
; CHECK-NEXT:    vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1}
; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %mask1 = icmp sge <16 x i32> %x1, %y1
  %yb = load i32, i32* %yb.ptr, align 4
  %y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
  %y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer
  %mask0 = icmp sgt <16 x i32> %x, %y
  %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
  ret <16 x i32> %max
}

define <8 x i64> @test27(<8 x i64> %x, i64* %yb.ptr, <8 x i64> %x1, <8 x i64> %y1) nounwind {
; CHECK-LABEL: test27:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpleq %zmm1, %zmm2, %k1
; CHECK-NEXT:    vpcmpleq (%rdi){1to8}, %zmm0, %k1 {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %mask1 = icmp sge <8 x i64> %x1, %y1
  %yb = load i64, i64* %yb.ptr, align 4
  %y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
  %y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer
  %mask0 = icmp sle <8 x i64> %x, %y
  %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
  %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1
  ret <8 x i64> %max
}
