; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding | FileCheck %s

; 256-bit

define i8 @test_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b) {
; CHECK-LABEL: test_pcmpeq_d_256
; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_d_256
; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32>, <8 x i32>, i8)

define i8 @test_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b) {
; CHECK-LABEL: test_pcmpeq_q_256
; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_q_256
; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64>, <4 x i64>, i8)

define i8 @test_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b) {
; CHECK-LABEL: test_pcmpgt_d_256
; CHECK: vpcmpgtd %ymm1, %ymm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_d_256
; CHECK: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32>, <8 x i32>, i8)

define i8 @test_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b) {
; CHECK-LABEL: test_pcmpgt_q_256
; CHECK: vpcmpgtq %ymm1, %ymm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_q_256
; CHECK: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64>, <4 x i64>, i8)

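; The cmp/ucmp intrinsics below take an immediate predicate (0=eq, 1=lt, 2=le,
; 3=unord, 4=neq, 5=nlt, 6=nle, 7=ord) followed by an i8 write-mask; -1 selects all lanes.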
define <8 x i8> @test_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: test_cmp_d_256
; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltd %ymm1, %ymm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpled %ymm1, %ymm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordd %ymm1, %ymm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqd %ymm1, %ymm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltd %ymm1, %ymm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnled %ymm1, %ymm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordd %ymm1, %ymm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_cmp_d_256
; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltd %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpled %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordd %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqd %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltd %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnled %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordd %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32>, <8 x i32>, i8, i8) nounwind readnone

define <8 x i8> @test_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: test_ucmp_d_256
; CHECK: vpcmpequd %ymm1, %ymm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltud %ymm1, %ymm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleud %ymm1, %ymm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordud %ymm1, %ymm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequd %ymm1, %ymm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltud %ymm1, %ymm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleud %ymm1, %ymm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordud %ymm1, %ymm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_ucmp_d_256
; CHECK: vpcmpequd %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltud %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleud %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordud %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequd %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltud %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleud %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordud %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i8 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32>, <8 x i32>, i8, i8) nounwind readnone

define <8 x i8> @test_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_cmp_q_256
; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltq %ymm1, %ymm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleq %ymm1, %ymm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordq %ymm1, %ymm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqq %ymm1, %ymm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltq %ymm1, %ymm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleq %ymm1, %ymm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordq %ymm1, %ymm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_cmp_q_256
; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltq %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleq %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordq %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqq %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltq %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleq %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordq %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64>, <4 x i64>, i8, i8) nounwind readnone

define <8 x i8> @test_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_ucmp_q_256
; CHECK: vpcmpequq %ymm1, %ymm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltuq %ymm1, %ymm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleuq %ymm1, %ymm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunorduq %ymm1, %ymm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequq %ymm1, %ymm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltuq %ymm1, %ymm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleuq %ymm1, %ymm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmporduq %ymm1, %ymm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_ucmp_q_256
; CHECK: vpcmpequq %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleuq %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunorduq %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequq %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltuq %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleuq %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmporduq %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i8 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64>, <4 x i64>, i8, i8) nounwind readnone

; 128-bit

define i8 @test_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_pcmpeq_d_128
; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_d_128
; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32>, <4 x i32>, i8)

define i8 @test_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_pcmpeq_q_128
; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_q_128
; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64>, <2 x i64>, i8)

define i8 @test_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_pcmpgt_d_128
; CHECK: vpcmpgtd %xmm1, %xmm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_d_128
; CHECK: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32>, <4 x i32>, i8)

define i8 @test_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_pcmpgt_q_128
; CHECK: vpcmpgtq %xmm1, %xmm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_q_128
; CHECK: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64>, <2 x i64>, i8)

define <8 x i8> @test_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_cmp_d_128
; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltd %xmm1, %xmm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpled %xmm1, %xmm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordd %xmm1, %xmm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqd %xmm1, %xmm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltd %xmm1, %xmm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnled %xmm1, %xmm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordd %xmm1, %xmm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_cmp_d_128
; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltd %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpled %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordd %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqd %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltd %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnled %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordd %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32>, <4 x i32>, i8, i8) nounwind readnone

define <8 x i8> @test_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_ucmp_d_128
; CHECK: vpcmpequd %xmm1, %xmm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltud %xmm1, %xmm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleud %xmm1, %xmm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordud %xmm1, %xmm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequd %xmm1, %xmm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltud %xmm1, %xmm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleud %xmm1, %xmm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordud %xmm1, %xmm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_ucmp_d_128
; CHECK: vpcmpequd %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltud %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleud %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordud %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequd %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltud %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleud %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordud %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i8 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32>, <4 x i32>, i8, i8) nounwind readnone

define <8 x i8> @test_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_cmp_q_128
; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltq %xmm1, %xmm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleq %xmm1, %xmm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordq %xmm1, %xmm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqq %xmm1, %xmm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltq %xmm1, %xmm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleq %xmm1, %xmm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordq %xmm1, %xmm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_cmp_q_128
; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltq %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleq %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordq %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqq %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltq %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleq %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordq %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64>, <2 x i64>, i8, i8) nounwind readnone

define <8 x i8> @test_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_ucmp_q_128
; CHECK: vpcmpequq %xmm1, %xmm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltuq %xmm1, %xmm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleuq %xmm1, %xmm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunorduq %xmm1, %xmm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequq %xmm1, %xmm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltuq %xmm1, %xmm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleuq %xmm1, %xmm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmporduq %xmm1, %xmm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_ucmp_q_128
; CHECK: vpcmpequq %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleuq %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunorduq %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequq %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltuq %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleuq %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmporduq %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i8 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64>, <2 x i64>, i8, i8) nounwind readnone

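; Compress
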
; CHECK-LABEL: compr1
; CHECK: vcompresspd %zmm0
define void @compr1(i8* %addr, <8 x double> %data, i8 %mask) {
  call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
  ret void
}

declare void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 %mask)

; CHECK-LABEL: compr2
; CHECK: vcompresspd %ymm0
define void @compr2(i8* %addr, <4 x double> %data, i8 %mask) {
  call void @llvm.x86.avx512.mask.compress.store.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
  ret void
}

declare void @llvm.x86.avx512.mask.compress.store.pd.256(i8* %addr, <4 x double> %data, i8 %mask)

; CHECK-LABEL: compr3
; CHECK: vcompressps %xmm0
define void @compr3(i8* %addr, <4 x float> %data, i8 %mask) {
  call void @llvm.x86.avx512.mask.compress.store.ps.128(i8* %addr, <4 x float> %data, i8 %mask)
  ret void
}

declare void @llvm.x86.avx512.mask.compress.store.ps.128(i8* %addr, <4 x float> %data, i8 %mask)

; CHECK-LABEL: compr4
; CHECK: vcompresspd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x8a,0xc0]
define <8 x double> @compr4(i8* %addr, <8 x double> %data, i8 %mask) {
  %res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask)
  ret <8 x double> %res
}

declare <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask)

; CHECK-LABEL: compr5
; CHECK: vcompresspd %ymm0, %ymm1 {%k1}  ## encoding: [0x62,0xf2,0xfd,0x29,0x8a,0xc1]
define <4 x double> @compr5(<4 x double> %data, <4 x double> %src0, i8 %mask) {
  %res = call <4 x double> @llvm.x86.avx512.mask.compress.pd.256( <4 x double> %data, <4 x double> %src0, i8 %mask)
  ret <4 x double> %res
}

declare <4 x double> @llvm.x86.avx512.mask.compress.pd.256(<4 x double> %data, <4 x double> %src0, i8 %mask)

; CHECK-LABEL: compr6
; CHECK: vcompressps %xmm0
define <4 x float> @compr6(<4 x float> %data, i8 %mask) {
  %res = call <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 %mask)
  ret <4 x float> %res
}

declare <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float> %src0, i8 %mask)

; CHECK-LABEL: compr7
; CHECK-NOT: vcompress
; CHECK: vmovapd
define void @compr7(i8* %addr, <8 x double> %data) {
  call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 -1)
  ret void
}

; CHECK-LABEL: compr8
; CHECK-NOT: vcompressps %xmm0
define <4 x float> @compr8(<4 x float> %data) {
  %res = call <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 -1)
  ret <4 x float> %res
}

; CHECK-LABEL: compr9
; CHECK: vpcompressq %zmm0, (%rdi) {%k1}  ## encoding: [0x62,0xf2,0xfd,0x49,0x8b,0x07]
define void @compr9(i8* %addr, <8 x i64> %data, i8 %mask) {
  call void @llvm.x86.avx512.mask.compress.store.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
  ret void
}

declare void @llvm.x86.avx512.mask.compress.store.q.512(i8* %addr, <8 x i64> %data, i8 %mask)

; CHECK-LABEL: compr10
; CHECK: vpcompressd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x8b,0xc0]
define <4 x i32> @compr10(<4 x i32> %data, i8 %mask) {
  %res = call <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32>zeroinitializer, i8 %mask)
  ret <4 x i32> %res
}

declare <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32> %src0, i8 %mask)

; Expand

; CHECK-LABEL: expand1
; CHECK: vexpandpd (%rdi), %zmm0 {%k1}  ## encoding: [0x62,0xf2,0xfd,0x49,0x88,0x07]
define <8 x double> @expand1(i8* %addr, <8 x double> %data, i8 %mask) {
  %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
  ret <8 x double> %res
}

declare <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 %mask)

; CHECK-LABEL: expand2
; CHECK: vexpandpd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0x07]
define <4 x double> @expand2(i8* %addr, <4 x double> %data, i8 %mask) {
  %res = call <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
  ret <4 x double> %res
}

declare <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(i8* %addr, <4 x double> %data, i8 %mask)

; CHECK-LABEL: expand3
; CHECK: vexpandps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x88,0x07]
define <4 x float> @expand3(i8* %addr, <4 x float> %data, i8 %mask) {
  %res = call <4 x float> @llvm.x86.avx512.mask.expand.load.ps.128(i8* %addr, <4 x float> %data, i8 %mask)
  ret <4 x float> %res
}

declare <4 x float> @llvm.x86.avx512.mask.expand.load.ps.128(i8* %addr, <4 x float> %data, i8 %mask)

; CHECK-LABEL: expand4
; CHECK: vexpandpd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x88,0xc0]
define <8 x double> @expand4(i8* %addr, <8 x double> %data, i8 %mask) {
  %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask)
  ret <8 x double> %res
}

declare <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask)

; CHECK-LABEL: expand5
; CHECK: vexpandpd %ymm0, %ymm1 {%k1}  ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0xc8]
define <4 x double> @expand5(<4 x double> %data, <4 x double> %src0, i8 %mask) {
  %res = call <4 x double> @llvm.x86.avx512.mask.expand.pd.256( <4 x double> %data, <4 x double> %src0, i8 %mask)
  ret <4 x double> %res
}

declare <4 x double> @llvm.x86.avx512.mask.expand.pd.256(<4 x double> %data, <4 x double> %src0, i8 %mask)

; CHECK-LABEL: expand6
; CHECK: vexpandps %xmm0
define <4 x float> @expand6(<4 x float> %data, i8 %mask) {
  %res = call <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 %mask)
  ret <4 x float> %res
}

declare <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float> %src0, i8 %mask)

; CHECK-LABEL: expand7
; CHECK-NOT: vexpand
; CHECK: vmovapd
define <8 x double> @expand7(i8* %addr, <8 x double> %data) {
  %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 -1)
  ret <8 x double> %res
}

; CHECK-LABEL: expand8
; CHECK-NOT: vexpandps %xmm0
define <4 x float> @expand8(<4 x float> %data) {
  %res = call <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 -1)
  ret <4 x float> %res
}

; CHECK-LABEL: expand9
; CHECK: vpexpandq (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x89,0x07]
define <8 x i64> @expand9(i8* %addr, <8 x i64> %data, i8 %mask) {
  %res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> %data, i8 %mask)

; CHECK-LABEL: expand10
; CHECK: vpexpandd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x89,0xc0]
define <4 x i32> @expand10(<4 x i32> %data, i8 %mask) {
  %res = call <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32>zeroinitializer, i8 %mask)
  ret <4 x i32> %res
}

declare <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32> %src0, i8 %mask)

define <8 x float> @test_x86_mask_blend_ps_256(i8 %a0, <8 x float> %a1, <8 x float> %a2) {
  ; CHECK: vblendmps %ymm1, %ymm0
  %res = call <8 x float> @llvm.x86.avx512.mask.blend.ps.256(<8 x float> %a1, <8 x float> %a2, i8 %a0) ; <<8 x float>> [#uses=1]
  ret <8 x float> %res
}

declare <8 x float> @llvm.x86.avx512.mask.blend.ps.256(<8 x float>, <8 x float>, i8) nounwind readonly

define <4 x double> @test_x86_mask_blend_pd_256(i8 %a0, <4 x double> %a1, <4 x double> %a2) {
  ; CHECK: vblendmpd %ymm1, %ymm0
  %res = call <4 x double> @llvm.x86.avx512.mask.blend.pd.256(<4 x double> %a1, <4 x double> %a2, i8 %a0) ; <<4 x double>> [#uses=1]
  ret <4 x double> %res
}

define <4 x double> @test_x86_mask_blend_pd_256_memop(<4 x double> %a, <4 x double>* %ptr, i8 %mask) {
  ; CHECK-LABEL: test_x86_mask_blend_pd_256_memop
  ; CHECK: vblendmpd (%
  %b = load <4 x double>, <4 x double>* %ptr
  %res = call <4 x double> @llvm.x86.avx512.mask.blend.pd.256(<4 x double> %a, <4 x double> %b, i8 %mask) ; <<4 x double>> [#uses=1]
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx512.mask.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readonly

; CHECK-LABEL: test_x86_mask_blend_d_256
; CHECK: vpblendmd
define <8 x i32> @test_x86_mask_blend_d_256(i8 %a0, <8 x i32> %a1, <8 x i32> %a2) {
  %res = call <8 x i32> @llvm.x86.avx512.mask.blend.d.256(<8 x i32> %a1, <8 x i32> %a2, i8 %a0) ; <<8 x i32>> [#uses=1]
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx512.mask.blend.d.256(<8 x i32>, <8 x i32>, i8) nounwind readonly

define <4 x i64> @test_x86_mask_blend_q_256(i8 %a0, <4 x i64> %a1, <4 x i64> %a2) {
  ; CHECK: vpblendmq
  %res = call <4 x i64> @llvm.x86.avx512.mask.blend.q.256(<4 x i64> %a1, <4 x i64> %a2, i8 %a0) ; <<4 x i64>> [#uses=1]
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx512.mask.blend.q.256(<4 x i64>, <4 x i64>, i8) nounwind readonly

define <4 x float> @test_x86_mask_blend_ps_128(i8 %a0, <4 x float> %a1, <4 x float> %a2) {
  ; CHECK: vblendmps %xmm1, %xmm0
  %res = call <4 x float> @llvm.x86.avx512.mask.blend.ps.128(<4 x float> %a1, <4 x float> %a2, i8 %a0) ; <<4 x float>> [#uses=1]
  ret <4 x float> %res
}

declare <4 x float> @llvm.x86.avx512.mask.blend.ps.128(<4 x float>, <4 x float>, i8) nounwind readonly

define <2 x double> @test_x86_mask_blend_pd_128(i8 %a0, <2 x double> %a1, <2 x double> %a2) {
  ; CHECK: vblendmpd %xmm1, %xmm0
  %res = call <2 x double> @llvm.x86.avx512.mask.blend.pd.128(<2 x double> %a1, <2 x double> %a2, i8 %a0) ; <<2 x double>> [#uses=1]
  ret <2 x double> %res
}

define <2 x double> @test_x86_mask_blend_pd_128_memop(<2 x double> %a, <2 x double>* %ptr, i8 %mask) {
  ; CHECK-LABEL: test_x86_mask_blend_pd_128_memop
  ; CHECK: vblendmpd (%
  %b = load <2 x double>, <2 x double>* %ptr
  %res = call <2 x double> @llvm.x86.avx512.mask.blend.pd.128(<2 x double> %a, <2 x double> %b, i8 %mask) ; <<2 x double>> [#uses=1]
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx512.mask.blend.pd.128(<2 x double>, <2 x double>, i8) nounwind readonly

define <4 x i32> @test_x86_mask_blend_d_128(i8 %a0, <4 x i32> %a1, <4 x i32> %a2) {
  ; CHECK: vpblendmd
  %res = call <4 x i32> @llvm.x86.avx512.mask.blend.d.128(<4 x i32> %a1, <4 x i32> %a2, i8 %a0) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx512.mask.blend.d.128(<4 x i32>, <4 x i32>, i8) nounwind readonly

define <2 x i64> @test_x86_mask_blend_q_128(i8 %a0, <2 x i64> %a1, <2 x i64> %a2) {
  ; CHECK: vpblendmq
  %res = call <2 x i64> @llvm.x86.avx512.mask.blend.q.128(<2 x i64> %a1, <2 x i64> %a2, i8 %a0) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx512.mask.blend.q.128(<2 x i64>, <2 x i64>, i8) nounwind readonly
