; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

;
; DemandedBits - MOVMSK zeros the upper bits of the result.
;

define i32 @test_upper_x86_mmx_pmovmskb(x86_mmx %a0) {
; CHECK-LABEL: @test_upper_x86_mmx_pmovmskb(
; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx [[A0:%.*]])
; CHECK-NEXT:    ret i32 [[TMP1]]
;
  %1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %a0)
  %2 = and i32 %1, 255
  ret i32 %2
}

define i32 @test_upper_x86_sse_movmsk_ps(<4 x float> %a0) {
; CHECK-LABEL: @test_upper_x86_sse_movmsk_ps(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[A0:%.*]] to <4 x i32>
; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt <4 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4
; CHECK-NEXT:    [[TMP4:%.*]] = zext i4 [[TMP3]] to i32
; CHECK-NEXT:    ret i32 [[TMP4]]
;
  %1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
  %2 = and i32 %1, 15
  ret i32 %2
}

define i32 @test_upper_x86_sse2_movmsk_pd(<2 x double> %a0) {
; CHECK-LABEL: @test_upper_x86_sse2_movmsk_pd(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[A0:%.*]] to <2 x i64>
; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt <2 x i64> [[TMP1]], zeroinitializer
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i1> [[TMP2]] to i2
; CHECK-NEXT:    [[TMP4:%.*]] = zext i2 [[TMP3]] to i32
; CHECK-NEXT:    ret i32 [[TMP4]]
;
  %1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0)
  %2 = and i32 %1, 3
  ret i32 %2
}

define i32 @test_upper_x86_sse2_pmovmskb_128(<16 x i8> %a0) {
; CHECK-LABEL: @test_upper_x86_sse2_pmovmskb_128(
; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt <16 x i8> [[A0:%.*]], zeroinitializer
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i1> [[TMP1]] to i16
; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
; CHECK-NEXT:    ret i32 [[TMP3]]
;
  %1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0)
  %2 = and i32 %1, 65535
  ret i32 %2
}

define i32 @test_upper_x86_avx_movmsk_ps_256(<8 x float> %a0) {
; CHECK-LABEL: @test_upper_x86_avx_movmsk_ps_256(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x float> [[A0:%.*]] to <8 x i32>
; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt <8 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i1> [[TMP2]] to i8
; CHECK-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i32
; CHECK-NEXT:    ret i32 [[TMP4]]
;
  %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
  %2 = and i32 %1, 255
  ret i32 %2
}

define i32 @test_upper_x86_avx_movmsk_pd_256(<4 x double> %a0) {
; CHECK-LABEL: @test_upper_x86_avx_movmsk_pd_256(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x double> [[A0:%.*]] to <4 x i64>
; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt <4 x i64> [[TMP1]], zeroinitializer
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4
; CHECK-NEXT:    [[TMP4:%.*]] = zext i4 [[TMP3]] to i32
; CHECK-NEXT:    ret i32 [[TMP4]]
;
  %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
  %2 = and i32 %1, 15
  ret i32 %2
}

; llvm.x86.avx2.pmovmskb uses the whole of the 32-bit register.

;
; DemandedBits - If we don't use the lower bits then we just return zero.
;

define i32 @test_lower_x86_mmx_pmovmskb(x86_mmx %a0) {
; CHECK-LABEL: @test_lower_x86_mmx_pmovmskb(
; CHECK-NEXT:    ret i32 0
;
  %1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %a0)
  %2 = and i32 %1, -256
  ret i32 %2
}

define i32 @test_lower_x86_sse_movmsk_ps(<4 x float> %a0) {
; CHECK-LABEL: @test_lower_x86_sse_movmsk_ps(
; CHECK-NEXT:    ret i32 0
;
  %1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
  %2 = and i32 %1, -16
  ret i32 %2
}

define i32 @test_lower_x86_sse2_movmsk_pd(<2 x double> %a0) {
; CHECK-LABEL: @test_lower_x86_sse2_movmsk_pd(
; CHECK-NEXT:    ret i32 0
;
  %1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0)
  %2 = and i32 %1, -4
  ret i32 %2
}

define i32 @test_lower_x86_sse2_pmovmskb_128(<16 x i8> %a0) {
; CHECK-LABEL: @test_lower_x86_sse2_pmovmskb_128(
; CHECK-NEXT:    ret i32 0
;
  %1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0)
  %2 = and i32 %1, -65536
  ret i32 %2
}

define i32 @test_lower_x86_avx_movmsk_ps_256(<8 x float> %a0) {
; CHECK-LABEL: @test_lower_x86_avx_movmsk_ps_256(
; CHECK-NEXT:    ret i32 0
;
  %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
  %2 = and i32 %1, -256
  ret i32 %2
}

define i32 @test_lower_x86_avx_movmsk_pd_256(<4 x double> %a0) {
; CHECK-LABEL: @test_lower_x86_avx_movmsk_pd_256(
; CHECK-NEXT:    ret i32 0
;
  %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
  %2 = and i32 %1, -16
  ret i32 %2
}

; llvm.x86.avx2.pmovmskb uses the whole of the 32-bit register.

;
; Constant Folding (UNDEF -> ZERO)
;

define i32 @undef_x86_mmx_pmovmskb() {
; CHECK-LABEL: @undef_x86_mmx_pmovmskb(
; CHECK-NEXT:    ret i32 0
;
  %1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx undef)
  ret i32 %1
}

define i32 @undef_x86_sse_movmsk_ps() {
; CHECK-LABEL: @undef_x86_sse_movmsk_ps(
; CHECK-NEXT:    ret i32 0
;
  %1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> undef)
  ret i32 %1
}

define i32 @undef_x86_sse2_movmsk_pd() {
; CHECK-LABEL: @undef_x86_sse2_movmsk_pd(
; CHECK-NEXT:    ret i32 0
;
  %1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> undef)
  ret i32 %1
}

define i32 @undef_x86_sse2_pmovmskb_128() {
; CHECK-LABEL: @undef_x86_sse2_pmovmskb_128(
; CHECK-NEXT:    ret i32 0
;
  %1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> undef)
  ret i32 %1
}

define i32 @undef_x86_avx_movmsk_ps_256() {
; CHECK-LABEL: @undef_x86_avx_movmsk_ps_256(
; CHECK-NEXT:    ret i32 0
;
  %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> undef)
  ret i32 %1
}

define i32 @undef_x86_avx_movmsk_pd_256() {
; CHECK-LABEL: @undef_x86_avx_movmsk_pd_256(
; CHECK-NEXT:    ret i32 0
;
  %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> undef)
  ret i32 %1
}

define i32 @undef_x86_avx2_pmovmskb() {
; CHECK-LABEL: @undef_x86_avx2_pmovmskb(
; CHECK-NEXT:    ret i32 0
;
  %1 = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> undef)
  ret i32 %1
}

;
; Constant Folding (ZERO -> ZERO)
;

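; The MMX variant is not constant folded - the call to the intrinsic remains.
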
define i32 @zero_x86_mmx_pmovmskb() {
; CHECK-LABEL: @zero_x86_mmx_pmovmskb(
; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx bitcast (<1 x i64> zeroinitializer to x86_mmx))
; CHECK-NEXT:    ret i32 [[TMP1]]
;
  %1 = bitcast <1 x i64> zeroinitializer to x86_mmx
  %2 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %1)
  ret i32 %2
}

define i32 @zero_x86_sse_movmsk_ps() {
; CHECK-LABEL: @zero_x86_sse_movmsk_ps(
; CHECK-NEXT:    ret i32 0
;
  %1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> zeroinitializer)
  ret i32 %1
}

define i32 @zero_x86_sse2_movmsk_pd() {
; CHECK-LABEL: @zero_x86_sse2_movmsk_pd(
; CHECK-NEXT:    ret i32 0
;
  %1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> zeroinitializer)
  ret i32 %1
}

define i32 @zero_x86_sse2_pmovmskb_128() {
; CHECK-LABEL: @zero_x86_sse2_pmovmskb_128(
; CHECK-NEXT:    ret i32 0
;
  %1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> zeroinitializer)
  ret i32 %1
}

define i32 @zero_x86_avx_movmsk_ps_256() {
; CHECK-LABEL: @zero_x86_avx_movmsk_ps_256(
; CHECK-NEXT:    ret i32 0
;
  %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> zeroinitializer)
  ret i32 %1
}

define i32 @zero_x86_avx_movmsk_pd_256() {
; CHECK-LABEL: @zero_x86_avx_movmsk_pd_256(
; CHECK-NEXT:    ret i32 0
;
  %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> zeroinitializer)
  ret i32 %1
}

define i32 @zero_x86_avx2_pmovmskb() {
; CHECK-LABEL: @zero_x86_avx2_pmovmskb(
; CHECK-NEXT:    ret i32 0
;
  %1 = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> zeroinitializer)
  ret i32 %1
}

;
; Constant Folding
;

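; As above, the MMX variant is not folded to a constant; only its constant
; operand is canonicalized (255 -> -1, 256 wraps to 0).
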
define i32 @fold_x86_mmx_pmovmskb() {
; CHECK-LABEL: @fold_x86_mmx_pmovmskb(
; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx bitcast (<8 x i8> <i8 0, i8 -1, i8 -1, i8 127, i8 -127, i8 63, i8 64, i8 0> to x86_mmx))
; CHECK-NEXT:    ret i32 [[TMP1]]
;
  %1 = bitcast <8 x i8> <i8 0, i8 255, i8 -1, i8 127, i8 -127, i8 63, i8 64, i8 256> to x86_mmx
  %2 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %1)
  ret i32 %2
}

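; Negative elements at indices 1 and 3 set the corresponding mask bits: 0b1010 == 10.
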
define i32 @fold_x86_sse_movmsk_ps() {
; CHECK-LABEL: @fold_x86_sse_movmsk_ps(
; CHECK-NEXT:    ret i32 10
;
  %1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> <float 1.0, float -1.0, float 100.0, float -200.0>)
  ret i32 %1
}

define i32 @fold_x86_sse2_movmsk_pd() {
; CHECK-LABEL: @fold_x86_sse2_movmsk_pd(
; CHECK-NEXT:    ret i32 2
;
  %1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> <double 1.0, double -1.0>)
  ret i32 %1
}

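; Each 8-byte half repeats the same pattern; bytes 1, 2 and 4 of each half are
; negative (i8 256 wraps to 0), so each half contributes 0x16: 0x1616 == 5654.
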
define i32 @fold_x86_sse2_pmovmskb_128() {
; CHECK-LABEL: @fold_x86_sse2_pmovmskb_128(
; CHECK-NEXT:    ret i32 5654
;
  %1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> <i8 0, i8 255, i8 -1, i8 127, i8 -127, i8 63, i8 64, i8 256, i8 0, i8 255, i8 -1, i8 127, i8 -127, i8 63, i8 64, i8 256>)
  ret i32 %1
}

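; The sign bit is set for elements 1, 3, 5 and 7 (including -0.0), giving 0b10101010 == 170.
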
define i32 @fold_x86_avx_movmsk_ps_256() {
; CHECK-LABEL: @fold_x86_avx_movmsk_ps_256(
; CHECK-NEXT:    ret i32 170
;
  %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> <float 1.0, float -1.0, float 100.0, float -200.0, float +0.0, float -0.0, float 100000.0, float -5000000.0>)
  ret i32 %1
}

define i32 @fold_x86_avx_movmsk_pd_256() {
; CHECK-LABEL: @fold_x86_avx_movmsk_pd_256(
; CHECK-NEXT:    ret i32 10
;
  %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> <double 1.0, double -1.0, double 100.0, double -200.0>)
  ret i32 %1
}

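; The undef bytes fold to zero mask bits, so only the three repeated patterns in
; the upper 24 bytes contribute: 0x16161600 == 370546176.
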
define i32 @fold_x86_avx2_pmovmskb() {
; CHECK-LABEL: @fold_x86_avx2_pmovmskb(
; CHECK-NEXT:    ret i32 370546176
;
  %1 = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 255, i8 -1, i8 127, i8 -127, i8 63, i8 64, i8 256, i8 0, i8 255, i8 -1, i8 127, i8 -127, i8 63, i8 64, i8 256, i8 0, i8 255, i8 -1, i8 127, i8 -127, i8 63, i8 64, i8 256>)
  ret i32 %1
}

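;
; MOVMSK of a sign-extended bool vector simplifies to a bitcast of the bool
; vector, zero extended to i32 where needed.
;
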
define i32 @sext_sse_movmsk_ps(<4 x i1> %x) {
; CHECK-LABEL: @sext_sse_movmsk_ps(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i1> [[X:%.*]] to i4
; CHECK-NEXT:    [[TMP2:%.*]] = zext i4 [[TMP1]] to i32
; CHECK-NEXT:    ret i32 [[TMP2]]
;
  %sext = sext <4 x i1> %x to <4 x i32>
  %bc = bitcast <4 x i32> %sext to <4 x float>
  %r = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %bc)
  ret i32 %r
}

define i32 @sext_sse2_movmsk_pd(<2 x i1> %x) {
; CHECK-LABEL: @sext_sse2_movmsk_pd(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i1> [[X:%.*]] to i2
; CHECK-NEXT:    [[TMP2:%.*]] = zext i2 [[TMP1]] to i32
; CHECK-NEXT:    ret i32 [[TMP2]]
;
  %sext = sext <2 x i1> %x to <2 x i64>
  %bc = bitcast <2 x i64> %sext to <2 x double>
  %r = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %bc)
  ret i32 %r
}

define i32 @sext_sse2_pmovmskb_128(<16 x i1> %x) {
; CHECK-LABEL: @sext_sse2_pmovmskb_128(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i1> [[X:%.*]] to i16
; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[TMP1]] to i32
; CHECK-NEXT:    ret i32 [[TMP2]]
;
  %sext = sext <16 x i1> %x to <16 x i8>
  %r = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %sext)
  ret i32 %r
}

define i32 @sext_avx_movmsk_ps_256(<8 x i1> %x) {
; CHECK-LABEL: @sext_avx_movmsk_ps_256(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i1> [[X:%.*]] to i8
; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
; CHECK-NEXT:    ret i32 [[TMP2]]
;
  %sext = sext <8 x i1> %x to <8 x i32>
  %bc = bitcast <8 x i32> %sext to <8 x float>
  %r = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %bc)
  ret i32 %r
}

define i32 @sext_avx_movmsk_pd_256(<4 x i1> %x) {
; CHECK-LABEL: @sext_avx_movmsk_pd_256(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i1> [[X:%.*]] to i4
; CHECK-NEXT:    [[TMP2:%.*]] = zext i4 [[TMP1]] to i32
; CHECK-NEXT:    ret i32 [[TMP2]]
;
  %sext = sext <4 x i1> %x to <4 x i64>
  %bc = bitcast <4 x i64> %sext to <4 x double>
  %r = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %bc)
  ret i32 %r
}

define i32 @sext_avx2_pmovmskb(<32 x i1> %x) {
; CHECK-LABEL: @sext_avx2_pmovmskb(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <32 x i1> [[X:%.*]] to i32
; CHECK-NEXT:    ret i32 [[TMP1]]
;
  %sext = sext <32 x i1> %x to <32 x i8>
  %r = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %sext)
  ret i32 %r
}

; Bitcast from sign-extended scalar.

define i32 @sext_sse_movmsk_ps_scalar_source(i1 %x) {
; CHECK-LABEL: @sext_sse_movmsk_ps_scalar_source(
; CHECK-NEXT:    [[SEXT:%.*]] = sext i1 [[X:%.*]] to i128
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i128 [[SEXT]] to <4 x i32>
; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt <4 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4
; CHECK-NEXT:    [[TMP4:%.*]] = zext i4 [[TMP3]] to i32
; CHECK-NEXT:    ret i32 [[TMP4]]
;
  %sext = sext i1 %x to i128
  %bc = bitcast i128 %sext to <4 x float>
  %r = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %bc)
  ret i32 %r
}

; Bitcast from vector type with more elements.

define i32 @sext_sse_movmsk_ps_too_many_elts(<8 x i1> %x) {
; CHECK-LABEL: @sext_sse_movmsk_ps_too_many_elts(
; CHECK-NEXT:    [[SEXT:%.*]] = sext <8 x i1> [[X:%.*]] to <8 x i16>
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[SEXT]] to <4 x i32>
; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt <4 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4
; CHECK-NEXT:    [[TMP4:%.*]] = zext i4 [[TMP3]] to i32
; CHECK-NEXT:    ret i32 [[TMP4]]
;
  %sext = sext <8 x i1> %x to <8 x i16>
  %bc = bitcast <8 x i16> %sext to <4 x float>
  %r = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %bc)
  ret i32 %r
}

; Handle this by doing a bitcasted sign-bit test after the sext.

define i32 @sext_sse_movmsk_ps_must_replicate_bits(<2 x i1> %x) {
; CHECK-LABEL: @sext_sse_movmsk_ps_must_replicate_bits(
; CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[X:%.*]] to <2 x i64>
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[SEXT]] to <4 x i32>
; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt <4 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4
; CHECK-NEXT:    [[TMP4:%.*]] = zext i4 [[TMP3]] to i32
; CHECK-NEXT:    ret i32 [[TMP4]]
;
  %sext = sext <2 x i1> %x to <2 x i64>
  %bc = bitcast <2 x i64> %sext to <4 x float>
  %r = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %bc)
  ret i32 %r
}

declare i32 @llvm.x86.mmx.pmovmskb(x86_mmx)

declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>)
declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>)
declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>)

declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>)
declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>)
declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>)