1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,VEX,AVX1
5; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,VEX,AVX2
6; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
7; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL
8; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
9; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VLDQ
10;
11; 32-bit tests to make sure we're not doing anything stupid.
12; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown
13; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown -mattr=+sse
14; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown -mattr=+sse2
15; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown -mattr=+sse4.1
16
17;
18; Signed Integer to Double
19;
20
21define <2 x float> @sitofp_2i32_to_2f32(<2 x i32> %a) {
22; SSE-LABEL: sitofp_2i32_to_2f32:
23; SSE:       # %bb.0:
24; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
25; SSE-NEXT:    retq
26;
27; AVX-LABEL: sitofp_2i32_to_2f32:
28; AVX:       # %bb.0:
29; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
30; AVX-NEXT:    retq
31  %cvt = sitofp <2 x i32> %a to <2 x float>
32  ret <2 x float> %cvt
33}
34
35define <2 x float> @uitofp_2i32_to_2f32(<2 x i32> %a) {
36; SSE2-LABEL: uitofp_2i32_to_2f32:
37; SSE2:       # %bb.0:
38; SSE2-NEXT:    xorpd %xmm1, %xmm1
39; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
40; SSE2-NEXT:    movapd {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
41; SSE2-NEXT:    orpd %xmm1, %xmm0
42; SSE2-NEXT:    subpd %xmm1, %xmm0
43; SSE2-NEXT:    cvtpd2ps %xmm0, %xmm0
44; SSE2-NEXT:    retq
45;
46; SSE41-LABEL: uitofp_2i32_to_2f32:
47; SSE41:       # %bb.0:
48; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
49; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
50; SSE41-NEXT:    por %xmm1, %xmm0
51; SSE41-NEXT:    subpd %xmm1, %xmm0
52; SSE41-NEXT:    cvtpd2ps %xmm0, %xmm0
53; SSE41-NEXT:    retq
54;
55; VEX-LABEL: uitofp_2i32_to_2f32:
56; VEX:       # %bb.0:
57; VEX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
58; VEX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
59; VEX-NEXT:    vpor %xmm1, %xmm0, %xmm0
60; VEX-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
61; VEX-NEXT:    vcvtpd2ps %xmm0, %xmm0
62; VEX-NEXT:    retq
63;
64; AVX512F-LABEL: uitofp_2i32_to_2f32:
65; AVX512F:       # %bb.0:
66; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
67; AVX512F-NEXT:    vcvtudq2ps %zmm0, %zmm0
68; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
69; AVX512F-NEXT:    vzeroupper
70; AVX512F-NEXT:    retq
71;
72; AVX512VL-LABEL: uitofp_2i32_to_2f32:
73; AVX512VL:       # %bb.0:
74; AVX512VL-NEXT:    vcvtudq2ps %xmm0, %xmm0
75; AVX512VL-NEXT:    retq
76;
77; AVX512DQ-LABEL: uitofp_2i32_to_2f32:
78; AVX512DQ:       # %bb.0:
79; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
80; AVX512DQ-NEXT:    vcvtudq2ps %zmm0, %zmm0
81; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
82; AVX512DQ-NEXT:    vzeroupper
83; AVX512DQ-NEXT:    retq
84;
85; AVX512VLDQ-LABEL: uitofp_2i32_to_2f32:
86; AVX512VLDQ:       # %bb.0:
87; AVX512VLDQ-NEXT:    vcvtudq2ps %xmm0, %xmm0
88; AVX512VLDQ-NEXT:    retq
89  %cvt = uitofp <2 x i32> %a to <2 x float>
90  ret <2 x float> %cvt
91}
92
93define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
94; SSE2-LABEL: sitofp_2i64_to_2f64:
95; SSE2:       # %bb.0:
96; SSE2-NEXT:    movq %xmm0, %rax
97; SSE2-NEXT:    cvtsi2sd %rax, %xmm1
98; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
99; SSE2-NEXT:    movq %xmm0, %rax
100; SSE2-NEXT:    xorps %xmm0, %xmm0
101; SSE2-NEXT:    cvtsi2sd %rax, %xmm0
102; SSE2-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
103; SSE2-NEXT:    movapd %xmm1, %xmm0
104; SSE2-NEXT:    retq
105;
106; SSE41-LABEL: sitofp_2i64_to_2f64:
107; SSE41:       # %bb.0:
108; SSE41-NEXT:    pextrq $1, %xmm0, %rax
109; SSE41-NEXT:    cvtsi2sd %rax, %xmm1
110; SSE41-NEXT:    movq %xmm0, %rax
111; SSE41-NEXT:    xorps %xmm0, %xmm0
112; SSE41-NEXT:    cvtsi2sd %rax, %xmm0
113; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
114; SSE41-NEXT:    retq
115;
116; VEX-LABEL: sitofp_2i64_to_2f64:
117; VEX:       # %bb.0:
118; VEX-NEXT:    vpextrq $1, %xmm0, %rax
119; VEX-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
120; VEX-NEXT:    vmovq %xmm0, %rax
121; VEX-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm0
122; VEX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
123; VEX-NEXT:    retq
124;
125; AVX512F-LABEL: sitofp_2i64_to_2f64:
126; AVX512F:       # %bb.0:
127; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
128; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
129; AVX512F-NEXT:    vmovq %xmm0, %rax
130; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm0
131; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
132; AVX512F-NEXT:    retq
133;
134; AVX512VL-LABEL: sitofp_2i64_to_2f64:
135; AVX512VL:       # %bb.0:
136; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
137; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
138; AVX512VL-NEXT:    vmovq %xmm0, %rax
139; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm0
140; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
141; AVX512VL-NEXT:    retq
142;
143; AVX512DQ-LABEL: sitofp_2i64_to_2f64:
144; AVX512DQ:       # %bb.0:
145; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
146; AVX512DQ-NEXT:    vcvtqq2pd %zmm0, %zmm0
147; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
148; AVX512DQ-NEXT:    vzeroupper
149; AVX512DQ-NEXT:    retq
150;
151; AVX512VLDQ-LABEL: sitofp_2i64_to_2f64:
152; AVX512VLDQ:       # %bb.0:
153; AVX512VLDQ-NEXT:    vcvtqq2pd %xmm0, %xmm0
154; AVX512VLDQ-NEXT:    retq
155  %cvt = sitofp <2 x i64> %a to <2 x double>
156  ret <2 x double> %cvt
157}
158
159define <2 x double> @sitofp_2i32_to_2f64(<4 x i32> %a) {
160; SSE-LABEL: sitofp_2i32_to_2f64:
161; SSE:       # %bb.0:
162; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
163; SSE-NEXT:    retq
164;
165; AVX-LABEL: sitofp_2i32_to_2f64:
166; AVX:       # %bb.0:
167; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
168; AVX-NEXT:    retq
169  %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
170  %cvt = sitofp <2 x i32> %shuf to <2 x double>
171  ret <2 x double> %cvt
172}
173
174define <2 x double> @sitofp_4i32_to_2f64(<4 x i32> %a) {
175; SSE-LABEL: sitofp_4i32_to_2f64:
176; SSE:       # %bb.0:
177; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
178; SSE-NEXT:    retq
179;
180; AVX-LABEL: sitofp_4i32_to_2f64:
181; AVX:       # %bb.0:
182; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
183; AVX-NEXT:    retq
184  %cvt = sitofp <4 x i32> %a to <4 x double>
185  %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
186  ret <2 x double> %shuf
187}
188
189define <2 x double> @sitofp_2i16_to_2f64(<8 x i16> %a) {
190; SSE2-LABEL: sitofp_2i16_to_2f64:
191; SSE2:       # %bb.0:
192; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
193; SSE2-NEXT:    psrad $16, %xmm0
194; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
195; SSE2-NEXT:    retq
196;
197; SSE41-LABEL: sitofp_2i16_to_2f64:
198; SSE41:       # %bb.0:
199; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
200; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
201; SSE41-NEXT:    retq
202;
203; AVX-LABEL: sitofp_2i16_to_2f64:
204; AVX:       # %bb.0:
205; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
206; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
207; AVX-NEXT:    retq
208  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
209  %cvt = sitofp <2 x i16> %shuf to <2 x double>
210  ret <2 x double> %cvt
211}
212
213define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) {
214; SSE2-LABEL: sitofp_8i16_to_2f64:
215; SSE2:       # %bb.0:
216; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
217; SSE2-NEXT:    psrad $16, %xmm0
218; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
219; SSE2-NEXT:    retq
220;
221; SSE41-LABEL: sitofp_8i16_to_2f64:
222; SSE41:       # %bb.0:
223; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
224; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
225; SSE41-NEXT:    retq
226;
227; VEX-LABEL: sitofp_8i16_to_2f64:
228; VEX:       # %bb.0:
229; VEX-NEXT:    vpmovsxwd %xmm0, %xmm0
230; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
231; VEX-NEXT:    retq
232;
233; AVX512-LABEL: sitofp_8i16_to_2f64:
234; AVX512:       # %bb.0:
235; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
236; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
237; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
238; AVX512-NEXT:    vzeroupper
239; AVX512-NEXT:    retq
240  %cvt = sitofp <8 x i16> %a to <8 x double>
241  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
242  ret <2 x double> %shuf
243}
244
245define <2 x double> @sitofp_2i8_to_2f64(<16 x i8> %a) {
246; SSE2-LABEL: sitofp_2i8_to_2f64:
247; SSE2:       # %bb.0:
248; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
249; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
250; SSE2-NEXT:    psrad $24, %xmm0
251; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
252; SSE2-NEXT:    retq
253;
254; SSE41-LABEL: sitofp_2i8_to_2f64:
255; SSE41:       # %bb.0:
256; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
257; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
258; SSE41-NEXT:    retq
259;
260; AVX-LABEL: sitofp_2i8_to_2f64:
261; AVX:       # %bb.0:
262; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
263; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
264; AVX-NEXT:    retq
265  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
266  %cvt = sitofp <2 x i8> %shuf to <2 x double>
267  ret <2 x double> %cvt
268}
269
270define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
271; SSE2-LABEL: sitofp_16i8_to_2f64:
272; SSE2:       # %bb.0:
273; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
274; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
275; SSE2-NEXT:    psrad $24, %xmm0
276; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
277; SSE2-NEXT:    retq
278;
279; SSE41-LABEL: sitofp_16i8_to_2f64:
280; SSE41:       # %bb.0:
281; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
282; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
283; SSE41-NEXT:    retq
284;
285; VEX-LABEL: sitofp_16i8_to_2f64:
286; VEX:       # %bb.0:
287; VEX-NEXT:    vpmovsxbd %xmm0, %xmm0
288; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
289; VEX-NEXT:    retq
290;
291; AVX512-LABEL: sitofp_16i8_to_2f64:
292; AVX512:       # %bb.0:
293; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
294; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
295; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
296; AVX512-NEXT:    vzeroupper
297; AVX512-NEXT:    retq
298  %cvt = sitofp <16 x i8> %a to <16 x double>
299  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
300  ret <2 x double> %shuf
301}
302
303define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
304; SSE2-LABEL: sitofp_4i64_to_4f64:
305; SSE2:       # %bb.0:
306; SSE2-NEXT:    movq %xmm0, %rax
307; SSE2-NEXT:    cvtsi2sd %rax, %xmm2
308; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
309; SSE2-NEXT:    movq %xmm0, %rax
310; SSE2-NEXT:    xorps %xmm0, %xmm0
311; SSE2-NEXT:    cvtsi2sd %rax, %xmm0
312; SSE2-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
313; SSE2-NEXT:    movq %xmm1, %rax
314; SSE2-NEXT:    cvtsi2sd %rax, %xmm3
315; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
316; SSE2-NEXT:    movq %xmm0, %rax
317; SSE2-NEXT:    xorps %xmm0, %xmm0
318; SSE2-NEXT:    cvtsi2sd %rax, %xmm0
319; SSE2-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0]
320; SSE2-NEXT:    movapd %xmm2, %xmm0
321; SSE2-NEXT:    movapd %xmm3, %xmm1
322; SSE2-NEXT:    retq
323;
324; SSE41-LABEL: sitofp_4i64_to_4f64:
325; SSE41:       # %bb.0:
326; SSE41-NEXT:    pextrq $1, %xmm0, %rax
327; SSE41-NEXT:    cvtsi2sd %rax, %xmm2
328; SSE41-NEXT:    movq %xmm0, %rax
329; SSE41-NEXT:    xorps %xmm0, %xmm0
330; SSE41-NEXT:    cvtsi2sd %rax, %xmm0
331; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
332; SSE41-NEXT:    pextrq $1, %xmm1, %rax
333; SSE41-NEXT:    xorps %xmm2, %xmm2
334; SSE41-NEXT:    cvtsi2sd %rax, %xmm2
335; SSE41-NEXT:    movq %xmm1, %rax
336; SSE41-NEXT:    xorps %xmm1, %xmm1
337; SSE41-NEXT:    cvtsi2sd %rax, %xmm1
338; SSE41-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
339; SSE41-NEXT:    retq
340;
341; AVX1-LABEL: sitofp_4i64_to_4f64:
342; AVX1:       # %bb.0:
343; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
344; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
345; AVX1-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
346; AVX1-NEXT:    vmovq %xmm1, %rax
347; AVX1-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
348; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
349; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
350; AVX1-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
351; AVX1-NEXT:    vmovq %xmm0, %rax
352; AVX1-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
353; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
354; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
355; AVX1-NEXT:    retq
356;
357; AVX2-LABEL: sitofp_4i64_to_4f64:
358; AVX2:       # %bb.0:
359; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
360; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
361; AVX2-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
362; AVX2-NEXT:    vmovq %xmm1, %rax
363; AVX2-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
364; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
365; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
366; AVX2-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
367; AVX2-NEXT:    vmovq %xmm0, %rax
368; AVX2-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
369; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
370; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
371; AVX2-NEXT:    retq
372;
373; AVX512F-LABEL: sitofp_4i64_to_4f64:
374; AVX512F:       # %bb.0:
375; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
376; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
377; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
378; AVX512F-NEXT:    vmovq %xmm1, %rax
379; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
380; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
381; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
382; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
383; AVX512F-NEXT:    vmovq %xmm0, %rax
384; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
385; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
386; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
387; AVX512F-NEXT:    retq
388;
389; AVX512VL-LABEL: sitofp_4i64_to_4f64:
390; AVX512VL:       # %bb.0:
391; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
392; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
393; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
394; AVX512VL-NEXT:    vmovq %xmm1, %rax
395; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
396; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
397; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
398; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
399; AVX512VL-NEXT:    vmovq %xmm0, %rax
400; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
401; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
402; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
403; AVX512VL-NEXT:    retq
404;
405; AVX512DQ-LABEL: sitofp_4i64_to_4f64:
406; AVX512DQ:       # %bb.0:
407; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
408; AVX512DQ-NEXT:    vcvtqq2pd %zmm0, %zmm0
409; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
410; AVX512DQ-NEXT:    retq
411;
412; AVX512VLDQ-LABEL: sitofp_4i64_to_4f64:
413; AVX512VLDQ:       # %bb.0:
414; AVX512VLDQ-NEXT:    vcvtqq2pd %ymm0, %ymm0
415; AVX512VLDQ-NEXT:    retq
416  %cvt = sitofp <4 x i64> %a to <4 x double>
417  ret <4 x double> %cvt
418}
419
420define <4 x double> @sitofp_4i32_to_4f64(<4 x i32> %a) {
421; SSE-LABEL: sitofp_4i32_to_4f64:
422; SSE:       # %bb.0:
423; SSE-NEXT:    cvtdq2pd %xmm0, %xmm2
424; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
425; SSE-NEXT:    cvtdq2pd %xmm0, %xmm1
426; SSE-NEXT:    movaps %xmm2, %xmm0
427; SSE-NEXT:    retq
428;
429; AVX-LABEL: sitofp_4i32_to_4f64:
430; AVX:       # %bb.0:
431; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
432; AVX-NEXT:    retq
433  %cvt = sitofp <4 x i32> %a to <4 x double>
434  ret <4 x double> %cvt
435}
436
437define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) {
438; SSE2-LABEL: sitofp_4i16_to_4f64:
439; SSE2:       # %bb.0:
440; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
441; SSE2-NEXT:    psrad $16, %xmm1
442; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm0
443; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
444; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
445; SSE2-NEXT:    retq
446;
447; SSE41-LABEL: sitofp_4i16_to_4f64:
448; SSE41:       # %bb.0:
449; SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
450; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
451; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
452; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
453; SSE41-NEXT:    retq
454;
455; AVX-LABEL: sitofp_4i16_to_4f64:
456; AVX:       # %bb.0:
457; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
458; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
459; AVX-NEXT:    retq
460  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
461  %cvt = sitofp <4 x i16> %shuf to <4 x double>
462  ret <4 x double> %cvt
463}
464
465define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) {
466; SSE2-LABEL: sitofp_8i16_to_4f64:
467; SSE2:       # %bb.0:
468; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
469; SSE2-NEXT:    psrad $16, %xmm1
470; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm0
471; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
472; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
473; SSE2-NEXT:    retq
474;
475; SSE41-LABEL: sitofp_8i16_to_4f64:
476; SSE41:       # %bb.0:
477; SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
478; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
479; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
480; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
481; SSE41-NEXT:    retq
482;
483; VEX-LABEL: sitofp_8i16_to_4f64:
484; VEX:       # %bb.0:
485; VEX-NEXT:    vpmovsxwd %xmm0, %xmm0
486; VEX-NEXT:    vcvtdq2pd %xmm0, %ymm0
487; VEX-NEXT:    retq
488;
489; AVX512-LABEL: sitofp_8i16_to_4f64:
490; AVX512:       # %bb.0:
491; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
492; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
493; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
494; AVX512-NEXT:    retq
495  %cvt = sitofp <8 x i16> %a to <8 x double>
496  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
497  ret <4 x double> %shuf
498}
499
500define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) {
501; SSE2-LABEL: sitofp_4i8_to_4f64:
502; SSE2:       # %bb.0:
503; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
504; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
505; SSE2-NEXT:    psrad $24, %xmm1
506; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm0
507; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
508; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
509; SSE2-NEXT:    retq
510;
511; SSE41-LABEL: sitofp_4i8_to_4f64:
512; SSE41:       # %bb.0:
513; SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
514; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
515; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
516; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
517; SSE41-NEXT:    retq
518;
519; AVX-LABEL: sitofp_4i8_to_4f64:
520; AVX:       # %bb.0:
521; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
522; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
523; AVX-NEXT:    retq
524  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
525  %cvt = sitofp <4 x i8> %shuf to <4 x double>
526  ret <4 x double> %cvt
527}
528
529define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
530; SSE2-LABEL: sitofp_16i8_to_4f64:
531; SSE2:       # %bb.0:
532; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
533; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
534; SSE2-NEXT:    psrad $24, %xmm1
535; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm0
536; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
537; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
538; SSE2-NEXT:    retq
539;
540; SSE41-LABEL: sitofp_16i8_to_4f64:
541; SSE41:       # %bb.0:
542; SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
543; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
544; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
545; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
546; SSE41-NEXT:    retq
547;
548; VEX-LABEL: sitofp_16i8_to_4f64:
549; VEX:       # %bb.0:
550; VEX-NEXT:    vpmovsxbd %xmm0, %xmm0
551; VEX-NEXT:    vcvtdq2pd %xmm0, %ymm0
552; VEX-NEXT:    retq
553;
554; AVX512-LABEL: sitofp_16i8_to_4f64:
555; AVX512:       # %bb.0:
556; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
557; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
558; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
559; AVX512-NEXT:    retq
560  %cvt = sitofp <16 x i8> %a to <16 x double>
561  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
562  ret <4 x double> %shuf
563}
564
565;
566; Unsigned Integer to Double
567;
568
569define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) {
570; SSE2-LABEL: uitofp_2i64_to_2f64:
571; SSE2:       # %bb.0:
572; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [4294967295,4294967295]
573; SSE2-NEXT:    pand %xmm0, %xmm1
574; SSE2-NEXT:    por {{.*}}(%rip), %xmm1
575; SSE2-NEXT:    psrlq $32, %xmm0
576; SSE2-NEXT:    por {{.*}}(%rip), %xmm0
577; SSE2-NEXT:    subpd {{.*}}(%rip), %xmm0
578; SSE2-NEXT:    addpd %xmm1, %xmm0
579; SSE2-NEXT:    retq
580;
581; SSE41-LABEL: uitofp_2i64_to_2f64:
582; SSE41:       # %bb.0:
583; SSE41-NEXT:    pxor %xmm1, %xmm1
584; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
585; SSE41-NEXT:    por {{.*}}(%rip), %xmm1
586; SSE41-NEXT:    psrlq $32, %xmm0
587; SSE41-NEXT:    por {{.*}}(%rip), %xmm0
588; SSE41-NEXT:    subpd {{.*}}(%rip), %xmm0
589; SSE41-NEXT:    addpd %xmm1, %xmm0
590; SSE41-NEXT:    retq
591;
592; AVX1-LABEL: uitofp_2i64_to_2f64:
593; AVX1:       # %bb.0:
594; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
595; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
596; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
597; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
598; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
599; AVX1-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
600; AVX1-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
601; AVX1-NEXT:    retq
602;
603; AVX2-LABEL: uitofp_2i64_to_2f64:
604; AVX2:       # %bb.0:
605; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
606; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
607; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
608; AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
609; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
610; AVX2-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
611; AVX2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
612; AVX2-NEXT:    retq
613;
614; AVX512F-LABEL: uitofp_2i64_to_2f64:
615; AVX512F:       # %bb.0:
616; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
617; AVX512F-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
618; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
619; AVX512F-NEXT:    vpsrlq $32, %xmm0, %xmm0
620; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
621; AVX512F-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
622; AVX512F-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
623; AVX512F-NEXT:    retq
624;
625; AVX512VL-LABEL: uitofp_2i64_to_2f64:
626; AVX512VL:       # %bb.0:
627; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
628; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
629; AVX512VL-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
630; AVX512VL-NEXT:    vpsrlq $32, %xmm0, %xmm0
631; AVX512VL-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
632; AVX512VL-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
633; AVX512VL-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
634; AVX512VL-NEXT:    retq
635;
636; AVX512DQ-LABEL: uitofp_2i64_to_2f64:
637; AVX512DQ:       # %bb.0:
638; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
639; AVX512DQ-NEXT:    vcvtuqq2pd %zmm0, %zmm0
640; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
641; AVX512DQ-NEXT:    vzeroupper
642; AVX512DQ-NEXT:    retq
643;
644; AVX512VLDQ-LABEL: uitofp_2i64_to_2f64:
645; AVX512VLDQ:       # %bb.0:
646; AVX512VLDQ-NEXT:    vcvtuqq2pd %xmm0, %xmm0
647; AVX512VLDQ-NEXT:    retq
648  %cvt = uitofp <2 x i64> %a to <2 x double>
649  ret <2 x double> %cvt
650}
651
652define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
653; SSE2-LABEL: uitofp_2i32_to_2f64:
654; SSE2:       # %bb.0:
655; SSE2-NEXT:    xorpd %xmm1, %xmm1
656; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
657; SSE2-NEXT:    movapd {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
658; SSE2-NEXT:    orpd %xmm1, %xmm0
659; SSE2-NEXT:    subpd %xmm1, %xmm0
660; SSE2-NEXT:    retq
661;
662; SSE41-LABEL: uitofp_2i32_to_2f64:
663; SSE41:       # %bb.0:
664; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
665; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
666; SSE41-NEXT:    por %xmm1, %xmm0
667; SSE41-NEXT:    subpd %xmm1, %xmm0
668; SSE41-NEXT:    retq
669;
670; VEX-LABEL: uitofp_2i32_to_2f64:
671; VEX:       # %bb.0:
672; VEX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
673; VEX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
674; VEX-NEXT:    vpor %xmm1, %xmm0, %xmm0
675; VEX-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
676; VEX-NEXT:    retq
677;
678; AVX512F-LABEL: uitofp_2i32_to_2f64:
679; AVX512F:       # %bb.0:
680; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
681; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
682; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
683; AVX512F-NEXT:    vzeroupper
684; AVX512F-NEXT:    retq
685;
686; AVX512VL-LABEL: uitofp_2i32_to_2f64:
687; AVX512VL:       # %bb.0:
688; AVX512VL-NEXT:    vcvtudq2pd %xmm0, %xmm0
689; AVX512VL-NEXT:    retq
690;
691; AVX512DQ-LABEL: uitofp_2i32_to_2f64:
692; AVX512DQ:       # %bb.0:
693; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
694; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
695; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
696; AVX512DQ-NEXT:    vzeroupper
697; AVX512DQ-NEXT:    retq
698;
699; AVX512VLDQ-LABEL: uitofp_2i32_to_2f64:
700; AVX512VLDQ:       # %bb.0:
701; AVX512VLDQ-NEXT:    vcvtudq2pd %xmm0, %xmm0
702; AVX512VLDQ-NEXT:    retq
703  %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
704  %cvt = uitofp <2 x i32> %shuf to <2 x double>
705  ret <2 x double> %cvt
706}
707
708define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
709; SSE2-LABEL: uitofp_4i32_to_2f64:
710; SSE2:       # %bb.0:
711; SSE2-NEXT:    xorpd %xmm1, %xmm1
712; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
713; SSE2-NEXT:    movapd {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
714; SSE2-NEXT:    orpd %xmm1, %xmm0
715; SSE2-NEXT:    subpd %xmm1, %xmm0
716; SSE2-NEXT:    retq
717;
718; SSE41-LABEL: uitofp_4i32_to_2f64:
719; SSE41:       # %bb.0:
720; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
721; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
722; SSE41-NEXT:    por %xmm1, %xmm0
723; SSE41-NEXT:    subpd %xmm1, %xmm0
724; SSE41-NEXT:    retq
725;
726; AVX1-LABEL: uitofp_4i32_to_2f64:
727; AVX1:       # %bb.0:
728; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
729; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
730; AVX1-NEXT:    # xmm1 = mem[0,0]
731; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
732; AVX1-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
733; AVX1-NEXT:    retq
734;
735; AVX2-LABEL: uitofp_4i32_to_2f64:
736; AVX2:       # %bb.0:
737; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
738; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
739; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
740; AVX2-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
741; AVX2-NEXT:    retq
742;
743; AVX512F-LABEL: uitofp_4i32_to_2f64:
744; AVX512F:       # %bb.0:
745; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
746; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
747; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
748; AVX512F-NEXT:    vzeroupper
749; AVX512F-NEXT:    retq
750;
751; AVX512VL-LABEL: uitofp_4i32_to_2f64:
752; AVX512VL:       # %bb.0:
753; AVX512VL-NEXT:    vcvtudq2pd %xmm0, %xmm0
754; AVX512VL-NEXT:    retq
755;
756; AVX512DQ-LABEL: uitofp_4i32_to_2f64:
757; AVX512DQ:       # %bb.0:
758; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
759; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
760; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
761; AVX512DQ-NEXT:    vzeroupper
762; AVX512DQ-NEXT:    retq
763;
764; AVX512VLDQ-LABEL: uitofp_4i32_to_2f64:
765; AVX512VLDQ:       # %bb.0:
766; AVX512VLDQ-NEXT:    vcvtudq2pd %xmm0, %xmm0
767; AVX512VLDQ-NEXT:    retq
768  %cvt = uitofp <4 x i32> %a to <4 x double>
769  %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
770  ret <2 x double> %shuf
771}
772
773define <2 x double> @uitofp_2i16_to_2f64(<8 x i16> %a) {
774; SSE2-LABEL: uitofp_2i16_to_2f64:
775; SSE2:       # %bb.0:
776; SSE2-NEXT:    pxor %xmm1, %xmm1
777; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
778; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
779; SSE2-NEXT:    retq
780;
781; SSE41-LABEL: uitofp_2i16_to_2f64:
782; SSE41:       # %bb.0:
783; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
784; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
785; SSE41-NEXT:    retq
786;
787; AVX-LABEL: uitofp_2i16_to_2f64:
788; AVX:       # %bb.0:
789; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
790; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
791; AVX-NEXT:    retq
792  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
793  %cvt = uitofp <2 x i16> %shuf to <2 x double>
794  ret <2 x double> %cvt
795}
796
797define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) {
798; SSE2-LABEL: uitofp_8i16_to_2f64:
799; SSE2:       # %bb.0:
800; SSE2-NEXT:    pxor %xmm1, %xmm1
801; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
802; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
803; SSE2-NEXT:    retq
804;
805; SSE41-LABEL: uitofp_8i16_to_2f64:
806; SSE41:       # %bb.0:
807; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
808; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
809; SSE41-NEXT:    retq
810;
811; VEX-LABEL: uitofp_8i16_to_2f64:
812; VEX:       # %bb.0:
813; VEX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
814; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
815; VEX-NEXT:    retq
816;
817; AVX512-LABEL: uitofp_8i16_to_2f64:
818; AVX512:       # %bb.0:
819; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
820; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
821; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
822; AVX512-NEXT:    vzeroupper
823; AVX512-NEXT:    retq
824  %cvt = uitofp <8 x i16> %a to <8 x double>
825  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
826  ret <2 x double> %shuf
827}
828
829define <2 x double> @uitofp_2i8_to_2f64(<16 x i8> %a) {
830; SSE2-LABEL: uitofp_2i8_to_2f64:
831; SSE2:       # %bb.0:
832; SSE2-NEXT:    pxor %xmm1, %xmm1
833; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
834; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
835; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
836; SSE2-NEXT:    retq
837;
838; SSE41-LABEL: uitofp_2i8_to_2f64:
839; SSE41:       # %bb.0:
840; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
841; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
842; SSE41-NEXT:    retq
843;
844; AVX-LABEL: uitofp_2i8_to_2f64:
845; AVX:       # %bb.0:
846; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
847; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
848; AVX-NEXT:    retq
849  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
850  %cvt = uitofp <2 x i8> %shuf to <2 x double>
851  ret <2 x double> %cvt
852}
853
854define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
855; SSE2-LABEL: uitofp_16i8_to_2f64:
856; SSE2:       # %bb.0:
857; SSE2-NEXT:    pxor %xmm1, %xmm1
858; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
859; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
860; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
861; SSE2-NEXT:    retq
862;
863; SSE41-LABEL: uitofp_16i8_to_2f64:
864; SSE41:       # %bb.0:
865; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
866; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
867; SSE41-NEXT:    retq
868;
869; VEX-LABEL: uitofp_16i8_to_2f64:
870; VEX:       # %bb.0:
871; VEX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
872; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
873; VEX-NEXT:    retq
874;
875; AVX512-LABEL: uitofp_16i8_to_2f64:
876; AVX512:       # %bb.0:
877; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
878; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
879; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
880; AVX512-NEXT:    vzeroupper
881; AVX512-NEXT:    retq
882  %cvt = uitofp <16 x i8> %a to <16 x double>
883  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
884  ret <2 x double> %shuf
885}
886
887define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
888; SSE2-LABEL: uitofp_4i64_to_4f64:
889; SSE2:       # %bb.0:
890; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
891; SSE2-NEXT:    movdqa %xmm0, %xmm3
892; SSE2-NEXT:    pand %xmm2, %xmm3
893; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
894; SSE2-NEXT:    por %xmm4, %xmm3
895; SSE2-NEXT:    psrlq $32, %xmm0
896; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
897; SSE2-NEXT:    por %xmm5, %xmm0
898; SSE2-NEXT:    movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
899; SSE2-NEXT:    subpd %xmm6, %xmm0
900; SSE2-NEXT:    addpd %xmm3, %xmm0
901; SSE2-NEXT:    pand %xmm1, %xmm2
902; SSE2-NEXT:    por %xmm4, %xmm2
903; SSE2-NEXT:    psrlq $32, %xmm1
904; SSE2-NEXT:    por %xmm5, %xmm1
905; SSE2-NEXT:    subpd %xmm6, %xmm1
906; SSE2-NEXT:    addpd %xmm2, %xmm1
907; SSE2-NEXT:    retq
908;
909; SSE41-LABEL: uitofp_4i64_to_4f64:
910; SSE41:       # %bb.0:
911; SSE41-NEXT:    pxor %xmm2, %xmm2
912; SSE41-NEXT:    movdqa %xmm0, %xmm3
913; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
914; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
915; SSE41-NEXT:    por %xmm4, %xmm3
916; SSE41-NEXT:    psrlq $32, %xmm0
917; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
918; SSE41-NEXT:    por %xmm5, %xmm0
919; SSE41-NEXT:    movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
920; SSE41-NEXT:    subpd %xmm6, %xmm0
921; SSE41-NEXT:    addpd %xmm3, %xmm0
922; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
923; SSE41-NEXT:    por %xmm4, %xmm2
924; SSE41-NEXT:    psrlq $32, %xmm1
925; SSE41-NEXT:    por %xmm5, %xmm1
926; SSE41-NEXT:    subpd %xmm6, %xmm1
927; SSE41-NEXT:    addpd %xmm2, %xmm1
928; SSE41-NEXT:    retq
929;
930; AVX1-LABEL: uitofp_4i64_to_4f64:
931; AVX1:       # %bb.0:
932; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
933; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
934; AVX1-NEXT:    vorps {{.*}}(%rip), %ymm1, %ymm1
935; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
936; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
937; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
938; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
939; AVX1-NEXT:    vorpd {{.*}}(%rip), %ymm0, %ymm0
940; AVX1-NEXT:    vsubpd {{.*}}(%rip), %ymm0, %ymm0
941; AVX1-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
942; AVX1-NEXT:    retq
943;
944; AVX2-LABEL: uitofp_4i64_to_4f64:
945; AVX2:       # %bb.0:
946; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
947; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
948; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
949; AVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
950; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
951; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
952; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
953; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
954; AVX2-NEXT:    vsubpd %ymm2, %ymm0, %ymm0
955; AVX2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
956; AVX2-NEXT:    retq
957;
958; AVX512F-LABEL: uitofp_4i64_to_4f64:
959; AVX512F:       # %bb.0:
960; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
961; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
962; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
963; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
964; AVX512F-NEXT:    vpsrlq $32, %ymm0, %ymm0
965; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
966; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
967; AVX512F-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
968; AVX512F-NEXT:    vsubpd %ymm2, %ymm0, %ymm0
969; AVX512F-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
970; AVX512F-NEXT:    retq
971;
972; AVX512VL-LABEL: uitofp_4i64_to_4f64:
973; AVX512VL:       # %bb.0:
974; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
975; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
976; AVX512VL-NEXT:    vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1
977; AVX512VL-NEXT:    vpsrlq $32, %ymm0, %ymm0
978; AVX512VL-NEXT:    vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0
979; AVX512VL-NEXT:    vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0
980; AVX512VL-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
981; AVX512VL-NEXT:    retq
982;
983; AVX512DQ-LABEL: uitofp_4i64_to_4f64:
984; AVX512DQ:       # %bb.0:
985; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
986; AVX512DQ-NEXT:    vcvtuqq2pd %zmm0, %zmm0
987; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
988; AVX512DQ-NEXT:    retq
989;
990; AVX512VLDQ-LABEL: uitofp_4i64_to_4f64:
991; AVX512VLDQ:       # %bb.0:
992; AVX512VLDQ-NEXT:    vcvtuqq2pd %ymm0, %ymm0
993; AVX512VLDQ-NEXT:    retq
994  %cvt = uitofp <4 x i64> %a to <4 x double>
995  ret <4 x double> %cvt
996}
997
998define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
999; SSE2-LABEL: uitofp_4i32_to_4f64:
1000; SSE2:       # %bb.0:
1001; SSE2-NEXT:    movapd %xmm0, %xmm1
1002; SSE2-NEXT:    xorpd %xmm2, %xmm2
1003; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1004; SSE2-NEXT:    movapd {{.*#+}} xmm3 = [4.503599627370496E+15,4.503599627370496E+15]
1005; SSE2-NEXT:    orpd %xmm3, %xmm0
1006; SSE2-NEXT:    subpd %xmm3, %xmm0
1007; SSE2-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1008; SSE2-NEXT:    orpd %xmm3, %xmm1
1009; SSE2-NEXT:    subpd %xmm3, %xmm1
1010; SSE2-NEXT:    retq
1011;
1012; SSE41-LABEL: uitofp_4i32_to_4f64:
1013; SSE41:       # %bb.0:
1014; SSE41-NEXT:    movdqa %xmm0, %xmm1
1015; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1016; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15]
1017; SSE41-NEXT:    por %xmm2, %xmm0
1018; SSE41-NEXT:    subpd %xmm2, %xmm0
1019; SSE41-NEXT:    pxor %xmm3, %xmm3
1020; SSE41-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
1021; SSE41-NEXT:    por %xmm2, %xmm1
1022; SSE41-NEXT:    subpd %xmm2, %xmm1
1023; SSE41-NEXT:    retq
1024;
1025; AVX1-LABEL: uitofp_4i32_to_4f64:
1026; AVX1:       # %bb.0:
1027; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1028; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1029; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1030; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1031; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15]
1032; AVX1-NEXT:    vorpd %ymm1, %ymm0, %ymm0
1033; AVX1-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
1034; AVX1-NEXT:    retq
1035;
1036; AVX2-LABEL: uitofp_4i32_to_4f64:
1037; AVX2:       # %bb.0:
1038; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1039; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15]
1040; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
1041; AVX2-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
1042; AVX2-NEXT:    retq
1043;
1044; AVX512F-LABEL: uitofp_4i32_to_4f64:
1045; AVX512F:       # %bb.0:
1046; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1047; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
1048; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1049; AVX512F-NEXT:    retq
1050;
1051; AVX512VL-LABEL: uitofp_4i32_to_4f64:
1052; AVX512VL:       # %bb.0:
1053; AVX512VL-NEXT:    vcvtudq2pd %xmm0, %ymm0
1054; AVX512VL-NEXT:    retq
1055;
1056; AVX512DQ-LABEL: uitofp_4i32_to_4f64:
1057; AVX512DQ:       # %bb.0:
1058; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1059; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
1060; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1061; AVX512DQ-NEXT:    retq
1062;
1063; AVX512VLDQ-LABEL: uitofp_4i32_to_4f64:
1064; AVX512VLDQ:       # %bb.0:
1065; AVX512VLDQ-NEXT:    vcvtudq2pd %xmm0, %ymm0
1066; AVX512VLDQ-NEXT:    retq
1067  %cvt = uitofp <4 x i32> %a to <4 x double>
1068  ret <4 x double> %cvt
1069}
1070
1071define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) {
1072; SSE2-LABEL: uitofp_4i16_to_4f64:
1073; SSE2:       # %bb.0:
1074; SSE2-NEXT:    pxor %xmm1, %xmm1
1075; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1076; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm2
1077; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1078; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm1
1079; SSE2-NEXT:    movaps %xmm2, %xmm0
1080; SSE2-NEXT:    retq
1081;
1082; SSE41-LABEL: uitofp_4i16_to_4f64:
1083; SSE41:       # %bb.0:
1084; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1085; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
1086; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1087; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
1088; SSE41-NEXT:    retq
1089;
1090; AVX-LABEL: uitofp_4i16_to_4f64:
1091; AVX:       # %bb.0:
1092; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1093; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
1094; AVX-NEXT:    retq
1095  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1096  %cvt = uitofp <4 x i16> %shuf to <4 x double>
1097  ret <4 x double> %cvt
1098}
1099
1100define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) {
1101; SSE2-LABEL: uitofp_8i16_to_4f64:
1102; SSE2:       # %bb.0:
1103; SSE2-NEXT:    pxor %xmm1, %xmm1
1104; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1105; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm2
1106; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1107; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm1
1108; SSE2-NEXT:    movaps %xmm2, %xmm0
1109; SSE2-NEXT:    retq
1110;
1111; SSE41-LABEL: uitofp_8i16_to_4f64:
1112; SSE41:       # %bb.0:
1113; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1114; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
1115; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1116; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
1117; SSE41-NEXT:    retq
1118;
1119; VEX-LABEL: uitofp_8i16_to_4f64:
1120; VEX:       # %bb.0:
1121; VEX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1122; VEX-NEXT:    vcvtdq2pd %xmm0, %ymm0
1123; VEX-NEXT:    retq
1124;
1125; AVX512-LABEL: uitofp_8i16_to_4f64:
1126; AVX512:       # %bb.0:
1127; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1128; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
1129; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1130; AVX512-NEXT:    retq
1131  %cvt = uitofp <8 x i16> %a to <8 x double>
1132  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1133  ret <4 x double> %shuf
1134}
1135
1136define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) {
1137; SSE2-LABEL: uitofp_4i8_to_4f64:
1138; SSE2:       # %bb.0:
1139; SSE2-NEXT:    pxor %xmm1, %xmm1
1140; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1141; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1142; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm2
1143; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1144; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm1
1145; SSE2-NEXT:    movaps %xmm2, %xmm0
1146; SSE2-NEXT:    retq
1147;
1148; SSE41-LABEL: uitofp_4i8_to_4f64:
1149; SSE41:       # %bb.0:
1150; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1151; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
1152; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1153; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
1154; SSE41-NEXT:    retq
1155;
1156; AVX-LABEL: uitofp_4i8_to_4f64:
1157; AVX:       # %bb.0:
1158; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1159; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
1160; AVX-NEXT:    retq
1161  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1162  %cvt = uitofp <4 x i8> %shuf to <4 x double>
1163  ret <4 x double> %cvt
1164}
1165
1166define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) {
1167; SSE2-LABEL: uitofp_16i8_to_4f64:
1168; SSE2:       # %bb.0:
1169; SSE2-NEXT:    pxor %xmm1, %xmm1
1170; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1171; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1172; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm2
1173; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1174; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm1
1175; SSE2-NEXT:    movaps %xmm2, %xmm0
1176; SSE2-NEXT:    retq
1177;
1178; SSE41-LABEL: uitofp_16i8_to_4f64:
1179; SSE41:       # %bb.0:
1180; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1181; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
1182; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1183; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
1184; SSE41-NEXT:    retq
1185;
1186; VEX-LABEL: uitofp_16i8_to_4f64:
1187; VEX:       # %bb.0:
1188; VEX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1189; VEX-NEXT:    vcvtdq2pd %xmm0, %ymm0
1190; VEX-NEXT:    retq
1191;
1192; AVX512-LABEL: uitofp_16i8_to_4f64:
1193; AVX512:       # %bb.0:
1194; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1195; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
1196; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1197; AVX512-NEXT:    retq
1198  %cvt = uitofp <16 x i8> %a to <16 x double>
1199  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1200  ret <4 x double> %shuf
1201}
1202
1203;
1204; Signed Integer to Float
1205;
1206
1207define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
1208; SSE2-LABEL: sitofp_2i64_to_4f32:
1209; SSE2:       # %bb.0:
1210; SSE2-NEXT:    movq %xmm0, %rax
1211; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
1212; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1213; SSE2-NEXT:    movq %xmm0, %rax
1214; SSE2-NEXT:    xorps %xmm0, %xmm0
1215; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
1216; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1217; SSE2-NEXT:    movaps %xmm1, %xmm0
1218; SSE2-NEXT:    retq
1219;
1220; SSE41-LABEL: sitofp_2i64_to_4f32:
1221; SSE41:       # %bb.0:
1222; SSE41-NEXT:    pextrq $1, %xmm0, %rax
1223; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
1224; SSE41-NEXT:    movq %xmm0, %rax
1225; SSE41-NEXT:    xorps %xmm0, %xmm0
1226; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
1227; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
1228; SSE41-NEXT:    retq
1229;
1230; VEX-LABEL: sitofp_2i64_to_4f32:
1231; VEX:       # %bb.0:
1232; VEX-NEXT:    vpextrq $1, %xmm0, %rax
1233; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
1234; VEX-NEXT:    vmovq %xmm0, %rax
1235; VEX-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
1236; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
1237; VEX-NEXT:    retq
1238;
1239; AVX512F-LABEL: sitofp_2i64_to_4f32:
1240; AVX512F:       # %bb.0:
1241; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
1242; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
1243; AVX512F-NEXT:    vmovq %xmm0, %rax
1244; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
1245; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
1246; AVX512F-NEXT:    retq
1247;
1248; AVX512VL-LABEL: sitofp_2i64_to_4f32:
1249; AVX512VL:       # %bb.0:
1250; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
1251; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
1252; AVX512VL-NEXT:    vmovq %xmm0, %rax
1253; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
1254; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
1255; AVX512VL-NEXT:    retq
1256;
1257; AVX512DQ-LABEL: sitofp_2i64_to_4f32:
1258; AVX512DQ:       # %bb.0:
1259; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1260; AVX512DQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
1261; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1262; AVX512DQ-NEXT:    vzeroupper
1263; AVX512DQ-NEXT:    retq
1264;
1265; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32:
1266; AVX512VLDQ:       # %bb.0:
1267; AVX512VLDQ-NEXT:    vcvtqq2ps %xmm0, %xmm0
1268; AVX512VLDQ-NEXT:    retq
1269  %cvt = sitofp <2 x i64> %a to <2 x float>
1270  %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1271  ret <4 x float> %ext
1272}
1273
1274define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) {
1275; SSE2-LABEL: sitofp_2i64_to_4f32_zero:
1276; SSE2:       # %bb.0:
1277; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1278; SSE2-NEXT:    movq %xmm1, %rax
1279; SSE2-NEXT:    xorps %xmm1, %xmm1
1280; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
1281; SSE2-NEXT:    movq %xmm0, %rax
1282; SSE2-NEXT:    xorps %xmm0, %xmm0
1283; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
1284; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1285; SSE2-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
1286; SSE2-NEXT:    retq
1287;
1288; SSE41-LABEL: sitofp_2i64_to_4f32_zero:
1289; SSE41:       # %bb.0:
1290; SSE41-NEXT:    movq %xmm0, %rax
1291; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
1292; SSE41-NEXT:    pextrq $1, %xmm0, %rax
1293; SSE41-NEXT:    xorps %xmm0, %xmm0
1294; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
1295; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],zero,zero
1296; SSE41-NEXT:    movaps %xmm1, %xmm0
1297; SSE41-NEXT:    retq
1298;
1299; VEX-LABEL: sitofp_2i64_to_4f32_zero:
1300; VEX:       # %bb.0:
1301; VEX-NEXT:    vmovq %xmm0, %rax
1302; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
1303; VEX-NEXT:    vpextrq $1, %xmm0, %rax
1304; VEX-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
1305; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
1306; VEX-NEXT:    retq
1307;
1308; AVX512F-LABEL: sitofp_2i64_to_4f32_zero:
1309; AVX512F:       # %bb.0:
1310; AVX512F-NEXT:    vmovq %xmm0, %rax
1311; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
1312; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
1313; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
1314; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
1315; AVX512F-NEXT:    retq
1316;
1317; AVX512VL-LABEL: sitofp_2i64_to_4f32_zero:
1318; AVX512VL:       # %bb.0:
1319; AVX512VL-NEXT:    vmovq %xmm0, %rax
1320; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
1321; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
1322; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
1323; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
1324; AVX512VL-NEXT:    retq
1325;
1326; AVX512DQ-LABEL: sitofp_2i64_to_4f32_zero:
1327; AVX512DQ:       # %bb.0:
1328; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1329; AVX512DQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
1330; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
1331; AVX512DQ-NEXT:    vzeroupper
1332; AVX512DQ-NEXT:    retq
1333;
1334; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32_zero:
1335; AVX512VLDQ:       # %bb.0:
1336; AVX512VLDQ-NEXT:    vcvtqq2ps %xmm0, %xmm0
1337; AVX512VLDQ-NEXT:    retq
1338  %cvt = sitofp <2 x i64> %a to <2 x float>
1339  %ext = shufflevector <2 x float> %cvt, <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1340  ret <4 x float> %ext
1341}
1342
1343define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
1344; SSE2-LABEL: sitofp_4i64_to_4f32_undef:
1345; SSE2:       # %bb.0:
1346; SSE2-NEXT:    movq %xmm0, %rax
1347; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
1348; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1349; SSE2-NEXT:    movq %xmm0, %rax
1350; SSE2-NEXT:    xorps %xmm0, %xmm0
1351; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
1352; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1353; SSE2-NEXT:    movq {{.*#+}} xmm0 = xmm1[0],zero
1354; SSE2-NEXT:    retq
1355;
1356; SSE41-LABEL: sitofp_4i64_to_4f32_undef:
1357; SSE41:       # %bb.0:
1358; SSE41-NEXT:    pextrq $1, %xmm0, %rax
1359; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
1360; SSE41-NEXT:    movq %xmm0, %rax
1361; SSE41-NEXT:    xorps %xmm0, %xmm0
1362; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
1363; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
1364; SSE41-NEXT:    retq
1365;
1366; VEX-LABEL: sitofp_4i64_to_4f32_undef:
1367; VEX:       # %bb.0:
1368; VEX-NEXT:    vpextrq $1, %xmm0, %rax
1369; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
1370; VEX-NEXT:    vmovq %xmm0, %rax
1371; VEX-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
1372; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
1373; VEX-NEXT:    retq
1374;
1375; AVX512F-LABEL: sitofp_4i64_to_4f32_undef:
1376; AVX512F:       # %bb.0:
1377; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
1378; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
1379; AVX512F-NEXT:    vmovq %xmm0, %rax
1380; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
1381; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
1382; AVX512F-NEXT:    retq
1383;
1384; AVX512VL-LABEL: sitofp_4i64_to_4f32_undef:
1385; AVX512VL:       # %bb.0:
1386; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
1387; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
1388; AVX512VL-NEXT:    vmovq %xmm0, %rax
1389; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
1390; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
1391; AVX512VL-NEXT:    retq
1392;
1393; AVX512DQ-LABEL: sitofp_4i64_to_4f32_undef:
1394; AVX512DQ:       # %bb.0:
1395; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1396; AVX512DQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
1397; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1398; AVX512DQ-NEXT:    vzeroupper
1399; AVX512DQ-NEXT:    retq
1400;
1401; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32_undef:
1402; AVX512VLDQ:       # %bb.0:
1403; AVX512VLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1404; AVX512VLDQ-NEXT:    vcvtqq2ps %ymm0, %xmm0
1405; AVX512VLDQ-NEXT:    vzeroupper
1406; AVX512VLDQ-NEXT:    retq
1407  %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1408  %cvt = sitofp <4 x i64> %ext to <4 x float>
1409  ret <4 x float> %cvt
1410}
1411
1412define <4 x float> @sitofp_4i32_to_4f32(<4 x i32> %a) {
1413; SSE-LABEL: sitofp_4i32_to_4f32:
1414; SSE:       # %bb.0:
1415; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
1416; SSE-NEXT:    retq
1417;
1418; AVX-LABEL: sitofp_4i32_to_4f32:
1419; AVX:       # %bb.0:
1420; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
1421; AVX-NEXT:    retq
1422  %cvt = sitofp <4 x i32> %a to <4 x float>
1423  ret <4 x float> %cvt
1424}
1425
1426define <4 x float> @sitofp_4i16_to_4f32(<8 x i16> %a) {
1427; SSE2-LABEL: sitofp_4i16_to_4f32:
1428; SSE2:       # %bb.0:
1429; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1430; SSE2-NEXT:    psrad $16, %xmm0
1431; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
1432; SSE2-NEXT:    retq
1433;
1434; SSE41-LABEL: sitofp_4i16_to_4f32:
1435; SSE41:       # %bb.0:
1436; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
1437; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
1438; SSE41-NEXT:    retq
1439;
1440; AVX-LABEL: sitofp_4i16_to_4f32:
1441; AVX:       # %bb.0:
1442; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
1443; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
1444; AVX-NEXT:    retq
1445  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1446  %cvt = sitofp <4 x i16> %shuf to <4 x float>
1447  ret <4 x float> %cvt
1448}
1449
1450define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) {
1451; SSE2-LABEL: sitofp_8i16_to_4f32:
1452; SSE2:       # %bb.0:
1453; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1454; SSE2-NEXT:    psrad $16, %xmm0
1455; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
1456; SSE2-NEXT:    retq
1457;
1458; SSE41-LABEL: sitofp_8i16_to_4f32:
1459; SSE41:       # %bb.0:
1460; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
1461; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
1462; SSE41-NEXT:    retq
1463;
1464; AVX1-LABEL: sitofp_8i16_to_4f32:
1465; AVX1:       # %bb.0:
1466; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
1467; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1468; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
1469; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1470; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
1471; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1472; AVX1-NEXT:    vzeroupper
1473; AVX1-NEXT:    retq
1474;
1475; AVX2-LABEL: sitofp_8i16_to_4f32:
1476; AVX2:       # %bb.0:
1477; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
1478; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
1479; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1480; AVX2-NEXT:    vzeroupper
1481; AVX2-NEXT:    retq
1482;
1483; AVX512-LABEL: sitofp_8i16_to_4f32:
1484; AVX512:       # %bb.0:
1485; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
1486; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
1487; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1488; AVX512-NEXT:    vzeroupper
1489; AVX512-NEXT:    retq
1490  %cvt = sitofp <8 x i16> %a to <8 x float>
1491  %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1492  ret <4 x float> %shuf
1493}
1494
1495define <4 x float> @sitofp_4i8_to_4f32(<16 x i8> %a) {
1496; SSE2-LABEL: sitofp_4i8_to_4f32:
1497; SSE2:       # %bb.0:
1498; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1499; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1500; SSE2-NEXT:    psrad $24, %xmm0
1501; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
1502; SSE2-NEXT:    retq
1503;
1504; SSE41-LABEL: sitofp_4i8_to_4f32:
1505; SSE41:       # %bb.0:
1506; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
1507; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
1508; SSE41-NEXT:    retq
1509;
1510; AVX-LABEL: sitofp_4i8_to_4f32:
1511; AVX:       # %bb.0:
1512; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
1513; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
1514; AVX-NEXT:    retq
1515  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1516  %cvt = sitofp <4 x i8> %shuf to <4 x float>
1517  ret <4 x float> %cvt
1518}
1519
1520define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
1521; SSE2-LABEL: sitofp_16i8_to_4f32:
1522; SSE2:       # %bb.0:
1523; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1524; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1525; SSE2-NEXT:    psrad $24, %xmm0
1526; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
1527; SSE2-NEXT:    retq
1528;
1529; SSE41-LABEL: sitofp_16i8_to_4f32:
1530; SSE41:       # %bb.0:
1531; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
1532; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
1533; SSE41-NEXT:    retq
1534;
1535; AVX1-LABEL: sitofp_16i8_to_4f32:
1536; AVX1:       # %bb.0:
1537; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
1538; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1539; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
1540; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1541; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
1542; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1543; AVX1-NEXT:    vzeroupper
1544; AVX1-NEXT:    retq
1545;
1546; AVX2-LABEL: sitofp_16i8_to_4f32:
1547; AVX2:       # %bb.0:
1548; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
1549; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
1550; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1551; AVX2-NEXT:    vzeroupper
1552; AVX2-NEXT:    retq
1553;
1554; AVX512-LABEL: sitofp_16i8_to_4f32:
1555; AVX512:       # %bb.0:
1556; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
1557; AVX512-NEXT:    vcvtdq2ps %zmm0, %zmm0
1558; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1559; AVX512-NEXT:    vzeroupper
1560; AVX512-NEXT:    retq
1561  %cvt = sitofp <16 x i8> %a to <16 x float>
1562  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1563  ret <4 x float> %shuf
1564}
1565
1566define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
1567; SSE2-LABEL: sitofp_4i64_to_4f32:
1568; SSE2:       # %bb.0:
1569; SSE2-NEXT:    movq %xmm1, %rax
1570; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
1571; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1572; SSE2-NEXT:    movq %xmm1, %rax
1573; SSE2-NEXT:    xorps %xmm1, %xmm1
1574; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
1575; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1576; SSE2-NEXT:    movq %xmm0, %rax
1577; SSE2-NEXT:    xorps %xmm1, %xmm1
1578; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
1579; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1580; SSE2-NEXT:    movq %xmm0, %rax
1581; SSE2-NEXT:    xorps %xmm0, %xmm0
1582; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
1583; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1584; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1585; SSE2-NEXT:    movaps %xmm1, %xmm0
1586; SSE2-NEXT:    retq
1587;
1588; SSE41-LABEL: sitofp_4i64_to_4f32:
1589; SSE41:       # %bb.0:
1590; SSE41-NEXT:    pextrq $1, %xmm0, %rax
1591; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
1592; SSE41-NEXT:    movq %xmm0, %rax
1593; SSE41-NEXT:    xorps %xmm0, %xmm0
1594; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
1595; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
1596; SSE41-NEXT:    movq %xmm1, %rax
1597; SSE41-NEXT:    xorps %xmm2, %xmm2
1598; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
1599; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
1600; SSE41-NEXT:    pextrq $1, %xmm1, %rax
1601; SSE41-NEXT:    xorps %xmm1, %xmm1
1602; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
1603; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
1604; SSE41-NEXT:    retq
1605;
1606; AVX1-LABEL: sitofp_4i64_to_4f32:
1607; AVX1:       # %bb.0:
1608; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
1609; AVX1-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
1610; AVX1-NEXT:    vmovq %xmm0, %rax
1611; AVX1-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
1612; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
1613; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1614; AVX1-NEXT:    vmovq %xmm0, %rax
1615; AVX1-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
1616; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
1617; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
1618; AVX1-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
1619; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1620; AVX1-NEXT:    vzeroupper
1621; AVX1-NEXT:    retq
1622;
1623; AVX2-LABEL: sitofp_4i64_to_4f32:
1624; AVX2:       # %bb.0:
1625; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
1626; AVX2-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
1627; AVX2-NEXT:    vmovq %xmm0, %rax
1628; AVX2-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
1629; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
1630; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
1631; AVX2-NEXT:    vmovq %xmm0, %rax
1632; AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
1633; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
1634; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
1635; AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
1636; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1637; AVX2-NEXT:    vzeroupper
1638; AVX2-NEXT:    retq
1639;
1640; AVX512F-LABEL: sitofp_4i64_to_4f32:
1641; AVX512F:       # %bb.0:
1642; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
1643; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
1644; AVX512F-NEXT:    vmovq %xmm0, %rax
1645; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
1646; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
1647; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
1648; AVX512F-NEXT:    vmovq %xmm0, %rax
1649; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
1650; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
1651; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
1652; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
1653; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1654; AVX512F-NEXT:    vzeroupper
1655; AVX512F-NEXT:    retq
1656;
1657; AVX512VL-LABEL: sitofp_4i64_to_4f32:
1658; AVX512VL:       # %bb.0:
1659; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
1660; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
1661; AVX512VL-NEXT:    vmovq %xmm0, %rax
1662; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
1663; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
1664; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
1665; AVX512VL-NEXT:    vmovq %xmm0, %rax
1666; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
1667; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
1668; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
1669; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
1670; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1671; AVX512VL-NEXT:    vzeroupper
1672; AVX512VL-NEXT:    retq
1673;
1674; AVX512DQ-LABEL: sitofp_4i64_to_4f32:
1675; AVX512DQ:       # %bb.0:
1676; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1677; AVX512DQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
1678; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1679; AVX512DQ-NEXT:    vzeroupper
1680; AVX512DQ-NEXT:    retq
1681;
1682; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32:
1683; AVX512VLDQ:       # %bb.0:
1684; AVX512VLDQ-NEXT:    vcvtqq2ps %ymm0, %xmm0
1685; AVX512VLDQ-NEXT:    vzeroupper
1686; AVX512VLDQ-NEXT:    retq
1687  %cvt = sitofp <4 x i64> %a to <4 x float>
1688  ret <4 x float> %cvt
1689}
1690
1691define <8 x float> @sitofp_8i32_to_8f32(<8 x i32> %a) {
1692; SSE-LABEL: sitofp_8i32_to_8f32:
1693; SSE:       # %bb.0:
1694; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
1695; SSE-NEXT:    cvtdq2ps %xmm1, %xmm1
1696; SSE-NEXT:    retq
1697;
1698; AVX-LABEL: sitofp_8i32_to_8f32:
1699; AVX:       # %bb.0:
1700; AVX-NEXT:    vcvtdq2ps %ymm0, %ymm0
1701; AVX-NEXT:    retq
1702  %cvt = sitofp <8 x i32> %a to <8 x float>
1703  ret <8 x float> %cvt
1704}
1705
1706define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) {
1707; SSE2-LABEL: sitofp_8i16_to_8f32:
1708; SSE2:       # %bb.0:
1709; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1710; SSE2-NEXT:    psrad $16, %xmm1
1711; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm2
1712; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1713; SSE2-NEXT:    psrad $16, %xmm0
1714; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm1
1715; SSE2-NEXT:    movaps %xmm2, %xmm0
1716; SSE2-NEXT:    retq
1717;
1718; SSE41-LABEL: sitofp_8i16_to_8f32:
1719; SSE41:       # %bb.0:
1720; SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
1721; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm2
1722; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1723; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
1724; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm1
1725; SSE41-NEXT:    movaps %xmm2, %xmm0
1726; SSE41-NEXT:    retq
1727;
1728; AVX1-LABEL: sitofp_8i16_to_8f32:
1729; AVX1:       # %bb.0:
1730; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
1731; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1732; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
1733; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1734; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
1735; AVX1-NEXT:    retq
1736;
1737; AVX2-LABEL: sitofp_8i16_to_8f32:
1738; AVX2:       # %bb.0:
1739; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
1740; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
1741; AVX2-NEXT:    retq
1742;
1743; AVX512-LABEL: sitofp_8i16_to_8f32:
1744; AVX512:       # %bb.0:
1745; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
1746; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
1747; AVX512-NEXT:    retq
1748  %cvt = sitofp <8 x i16> %a to <8 x float>
1749  ret <8 x float> %cvt
1750}
1751
1752define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) {
1753; SSE2-LABEL: sitofp_8i8_to_8f32:
1754; SSE2:       # %bb.0:
1755; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1756; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1757; SSE2-NEXT:    psrad $24, %xmm0
1758; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
1759; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
1760; SSE2-NEXT:    psrad $24, %xmm1
1761; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm1
1762; SSE2-NEXT:    retq
1763;
1764; SSE41-LABEL: sitofp_8i8_to_8f32:
1765; SSE41:       # %bb.0:
1766; SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
1767; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm2
1768; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1769; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
1770; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm1
1771; SSE41-NEXT:    movaps %xmm2, %xmm0
1772; SSE41-NEXT:    retq
1773;
1774; AVX1-LABEL: sitofp_8i8_to_8f32:
1775; AVX1:       # %bb.0:
1776; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
1777; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1778; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
1779; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1780; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
1781; AVX1-NEXT:    retq
1782;
1783; AVX2-LABEL: sitofp_8i8_to_8f32:
1784; AVX2:       # %bb.0:
1785; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
1786; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
1787; AVX2-NEXT:    retq
1788;
1789; AVX512-LABEL: sitofp_8i8_to_8f32:
1790; AVX512:       # %bb.0:
1791; AVX512-NEXT:    vpmovsxbd %xmm0, %ymm0
1792; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
1793; AVX512-NEXT:    retq
1794  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1795  %cvt = sitofp <8 x i8> %shuf to <8 x float>
1796  ret <8 x float> %cvt
1797}
1798
1799define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) {
1800; SSE2-LABEL: sitofp_16i8_to_8f32:
1801; SSE2:       # %bb.0:
1802; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1803; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1804; SSE2-NEXT:    psrad $24, %xmm0
1805; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
1806; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
1807; SSE2-NEXT:    psrad $24, %xmm1
1808; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm1
1809; SSE2-NEXT:    retq
1810;
1811; SSE41-LABEL: sitofp_16i8_to_8f32:
1812; SSE41:       # %bb.0:
1813; SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
1814; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm2
1815; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1816; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
1817; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm1
1818; SSE41-NEXT:    movaps %xmm2, %xmm0
1819; SSE41-NEXT:    retq
1820;
1821; AVX1-LABEL: sitofp_16i8_to_8f32:
1822; AVX1:       # %bb.0:
1823; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
1824; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1825; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
1826; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1827; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
1828; AVX1-NEXT:    retq
1829;
1830; AVX2-LABEL: sitofp_16i8_to_8f32:
1831; AVX2:       # %bb.0:
1832; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
1833; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
1834; AVX2-NEXT:    retq
1835;
1836; AVX512-LABEL: sitofp_16i8_to_8f32:
1837; AVX512:       # %bb.0:
1838; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
1839; AVX512-NEXT:    vcvtdq2ps %zmm0, %zmm0
1840; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1841; AVX512-NEXT:    retq
1842  %cvt = sitofp <16 x i8> %a to <16 x float>
1843  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1844  ret <8 x float> %shuf
1845}
1846
1847;
1848; Unsigned Integer to Float
1849;
1850
1851define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
1852; SSE2-LABEL: uitofp_2i64_to_4f32:
1853; SSE2:       # %bb.0:
1854; SSE2-NEXT:    movdqa %xmm0, %xmm1
1855; SSE2-NEXT:    movq %xmm0, %rax
1856; SSE2-NEXT:    testq %rax, %rax
1857; SSE2-NEXT:    js .LBB41_1
1858; SSE2-NEXT:  # %bb.2:
1859; SSE2-NEXT:    xorps %xmm0, %xmm0
1860; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
1861; SSE2-NEXT:    jmp .LBB41_3
1862; SSE2-NEXT:  .LBB41_1:
1863; SSE2-NEXT:    movq %rax, %rcx
1864; SSE2-NEXT:    shrq %rcx
1865; SSE2-NEXT:    andl $1, %eax
1866; SSE2-NEXT:    orq %rcx, %rax
1867; SSE2-NEXT:    xorps %xmm0, %xmm0
1868; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
1869; SSE2-NEXT:    addss %xmm0, %xmm0
1870; SSE2-NEXT:  .LBB41_3:
1871; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1872; SSE2-NEXT:    movq %xmm1, %rax
1873; SSE2-NEXT:    testq %rax, %rax
1874; SSE2-NEXT:    js .LBB41_4
1875; SSE2-NEXT:  # %bb.5:
1876; SSE2-NEXT:    xorps %xmm1, %xmm1
1877; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
1878; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1879; SSE2-NEXT:    retq
1880; SSE2-NEXT:  .LBB41_4:
1881; SSE2-NEXT:    movq %rax, %rcx
1882; SSE2-NEXT:    shrq %rcx
1883; SSE2-NEXT:    andl $1, %eax
1884; SSE2-NEXT:    orq %rcx, %rax
1885; SSE2-NEXT:    xorps %xmm1, %xmm1
1886; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
1887; SSE2-NEXT:    addss %xmm1, %xmm1
1888; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1889; SSE2-NEXT:    retq
1890;
1891; SSE41-LABEL: uitofp_2i64_to_4f32:
1892; SSE41:       # %bb.0:
1893; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,1]
1894; SSE41-NEXT:    pand %xmm0, %xmm1
1895; SSE41-NEXT:    movdqa %xmm0, %xmm2
1896; SSE41-NEXT:    pxor %xmm3, %xmm3
1897; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
1898; SSE41-NEXT:    movdqa %xmm0, %xmm4
1899; SSE41-NEXT:    psrlq $1, %xmm4
1900; SSE41-NEXT:    por %xmm1, %xmm4
1901; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
1902; SSE41-NEXT:    pextrq $1, %xmm2, %rax
1903; SSE41-NEXT:    xorps %xmm0, %xmm0
1904; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
1905; SSE41-NEXT:    movq %xmm2, %rax
1906; SSE41-NEXT:    xorps %xmm1, %xmm1
1907; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
1908; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],zero,zero
1909; SSE41-NEXT:    movaps %xmm1, %xmm2
1910; SSE41-NEXT:    addps %xmm1, %xmm2
1911; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
1912; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm1
1913; SSE41-NEXT:    movaps %xmm1, %xmm0
1914; SSE41-NEXT:    retq
1915;
1916; VEX-LABEL: uitofp_2i64_to_4f32:
1917; VEX:       # %bb.0:
1918; VEX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
1919; VEX-NEXT:    vpsrlq $1, %xmm0, %xmm2
1920; VEX-NEXT:    vpor %xmm1, %xmm2, %xmm1
1921; VEX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm1
1922; VEX-NEXT:    vpextrq $1, %xmm1, %rax
1923; VEX-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
1924; VEX-NEXT:    vmovq %xmm1, %rax
1925; VEX-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
1926; VEX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
1927; VEX-NEXT:    vaddps %xmm1, %xmm1, %xmm2
1928; VEX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
1929; VEX-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
1930; VEX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
1931; VEX-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
1932; VEX-NEXT:    retq
1933;
1934; AVX512F-LABEL: uitofp_2i64_to_4f32:
1935; AVX512F:       # %bb.0:
1936; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
1937; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
1938; AVX512F-NEXT:    vmovq %xmm0, %rax
1939; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm0
1940; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
1941; AVX512F-NEXT:    retq
1942;
1943; AVX512VL-LABEL: uitofp_2i64_to_4f32:
1944; AVX512VL:       # %bb.0:
1945; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
1946; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
1947; AVX512VL-NEXT:    vmovq %xmm0, %rax
1948; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm0
1949; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
1950; AVX512VL-NEXT:    retq
1951;
1952; AVX512DQ-LABEL: uitofp_2i64_to_4f32:
1953; AVX512DQ:       # %bb.0:
1954; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1955; AVX512DQ-NEXT:    vcvtuqq2ps %zmm0, %ymm0
1956; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1957; AVX512DQ-NEXT:    vzeroupper
1958; AVX512DQ-NEXT:    retq
1959;
1960; AVX512VLDQ-LABEL: uitofp_2i64_to_4f32:
1961; AVX512VLDQ:       # %bb.0:
1962; AVX512VLDQ-NEXT:    vcvtuqq2ps %xmm0, %xmm0
1963; AVX512VLDQ-NEXT:    retq
1964  %cvt = uitofp <2 x i64> %a to <2 x float>
1965  %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1966  ret <4 x float> %ext
1967}
1968
1969define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
1970; SSE2-LABEL: uitofp_2i64_to_2f32:
1971; SSE2:       # %bb.0:
1972; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1973; SSE2-NEXT:    movq %xmm1, %rax
1974; SSE2-NEXT:    testq %rax, %rax
1975; SSE2-NEXT:    js .LBB42_1
1976; SSE2-NEXT:  # %bb.2:
1977; SSE2-NEXT:    xorps %xmm1, %xmm1
1978; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
1979; SSE2-NEXT:    jmp .LBB42_3
1980; SSE2-NEXT:  .LBB42_1:
1981; SSE2-NEXT:    movq %rax, %rcx
1982; SSE2-NEXT:    shrq %rcx
1983; SSE2-NEXT:    andl $1, %eax
1984; SSE2-NEXT:    orq %rcx, %rax
1985; SSE2-NEXT:    xorps %xmm1, %xmm1
1986; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
1987; SSE2-NEXT:    addss %xmm1, %xmm1
1988; SSE2-NEXT:  .LBB42_3:
1989; SSE2-NEXT:    movq %xmm0, %rax
1990; SSE2-NEXT:    testq %rax, %rax
1991; SSE2-NEXT:    js .LBB42_4
1992; SSE2-NEXT:  # %bb.5:
1993; SSE2-NEXT:    xorps %xmm0, %xmm0
1994; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
1995; SSE2-NEXT:    jmp .LBB42_6
1996; SSE2-NEXT:  .LBB42_4:
1997; SSE2-NEXT:    movq %rax, %rcx
1998; SSE2-NEXT:    shrq %rcx
1999; SSE2-NEXT:    andl $1, %eax
2000; SSE2-NEXT:    orq %rcx, %rax
2001; SSE2-NEXT:    xorps %xmm0, %xmm0
2002; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
2003; SSE2-NEXT:    addss %xmm0, %xmm0
2004; SSE2-NEXT:  .LBB42_6:
2005; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2006; SSE2-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
2007; SSE2-NEXT:    retq
2008;
2009; SSE41-LABEL: uitofp_2i64_to_2f32:
2010; SSE41:       # %bb.0:
2011; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,1]
2012; SSE41-NEXT:    pand %xmm0, %xmm1
2013; SSE41-NEXT:    movdqa %xmm0, %xmm2
2014; SSE41-NEXT:    pxor %xmm3, %xmm3
2015; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
2016; SSE41-NEXT:    movdqa %xmm0, %xmm4
2017; SSE41-NEXT:    psrlq $1, %xmm4
2018; SSE41-NEXT:    por %xmm1, %xmm4
2019; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
2020; SSE41-NEXT:    pextrq $1, %xmm2, %rax
2021; SSE41-NEXT:    xorps %xmm0, %xmm0
2022; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
2023; SSE41-NEXT:    movq %xmm2, %rax
2024; SSE41-NEXT:    xorps %xmm1, %xmm1
2025; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
2026; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],zero,zero
2027; SSE41-NEXT:    movaps %xmm1, %xmm2
2028; SSE41-NEXT:    addps %xmm1, %xmm2
2029; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
2030; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm1
2031; SSE41-NEXT:    movq {{.*#+}} xmm0 = xmm1[0],zero
2032; SSE41-NEXT:    retq
2033;
2034; VEX-LABEL: uitofp_2i64_to_2f32:
2035; VEX:       # %bb.0:
2036; VEX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
2037; VEX-NEXT:    vpsrlq $1, %xmm0, %xmm2
2038; VEX-NEXT:    vpor %xmm1, %xmm2, %xmm1
2039; VEX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm1
2040; VEX-NEXT:    vpextrq $1, %xmm1, %rax
2041; VEX-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
2042; VEX-NEXT:    vmovq %xmm1, %rax
2043; VEX-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
2044; VEX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
2045; VEX-NEXT:    vaddps %xmm1, %xmm1, %xmm2
2046; VEX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
2047; VEX-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
2048; VEX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
2049; VEX-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
2050; VEX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
2051; VEX-NEXT:    retq
2052;
2053; AVX512F-LABEL: uitofp_2i64_to_2f32:
2054; AVX512F:       # %bb.0:
2055; AVX512F-NEXT:    vmovq %xmm0, %rax
2056; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
2057; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
2058; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm0
2059; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
2060; AVX512F-NEXT:    retq
2061;
2062; AVX512VL-LABEL: uitofp_2i64_to_2f32:
2063; AVX512VL:       # %bb.0:
2064; AVX512VL-NEXT:    vmovq %xmm0, %rax
2065; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
2066; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
2067; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm0
2068; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
2069; AVX512VL-NEXT:    retq
2070;
2071; AVX512DQ-LABEL: uitofp_2i64_to_2f32:
2072; AVX512DQ:       # %bb.0:
2073; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2074; AVX512DQ-NEXT:    vcvtuqq2ps %zmm0, %ymm0
2075; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
2076; AVX512DQ-NEXT:    vzeroupper
2077; AVX512DQ-NEXT:    retq
2078;
2079; AVX512VLDQ-LABEL: uitofp_2i64_to_2f32:
2080; AVX512VLDQ:       # %bb.0:
2081; AVX512VLDQ-NEXT:    vcvtuqq2ps %xmm0, %xmm0
2082; AVX512VLDQ-NEXT:    retq
2083  %cvt = uitofp <2 x i64> %a to <2 x float>
2084  %ext = shufflevector <2 x float> %cvt, <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2085  ret <4 x float> %ext
2086}
2087
2088define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
2089; SSE2-LABEL: uitofp_4i64_to_4f32_undef:
2090; SSE2:       # %bb.0:
2091; SSE2-NEXT:    movq %xmm0, %rax
2092; SSE2-NEXT:    testq %rax, %rax
2093; SSE2-NEXT:    js .LBB43_1
2094; SSE2-NEXT:  # %bb.2:
2095; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
2096; SSE2-NEXT:    jmp .LBB43_3
2097; SSE2-NEXT:  .LBB43_1:
2098; SSE2-NEXT:    movq %rax, %rcx
2099; SSE2-NEXT:    shrq %rcx
2100; SSE2-NEXT:    andl $1, %eax
2101; SSE2-NEXT:    orq %rcx, %rax
2102; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
2103; SSE2-NEXT:    addss %xmm1, %xmm1
2104; SSE2-NEXT:  .LBB43_3:
2105; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
2106; SSE2-NEXT:    movq %xmm0, %rax
2107; SSE2-NEXT:    testq %rax, %rax
2108; SSE2-NEXT:    js .LBB43_4
2109; SSE2-NEXT:  # %bb.5:
2110; SSE2-NEXT:    xorps %xmm0, %xmm0
2111; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
2112; SSE2-NEXT:    jmp .LBB43_6
2113; SSE2-NEXT:  .LBB43_4:
2114; SSE2-NEXT:    movq %rax, %rcx
2115; SSE2-NEXT:    shrq %rcx
2116; SSE2-NEXT:    andl $1, %eax
2117; SSE2-NEXT:    orq %rcx, %rax
2118; SSE2-NEXT:    xorps %xmm0, %xmm0
2119; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
2120; SSE2-NEXT:    addss %xmm0, %xmm0
2121; SSE2-NEXT:  .LBB43_6:
2122; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2123; SSE2-NEXT:    movq {{.*#+}} xmm0 = xmm1[0],zero
2124; SSE2-NEXT:    retq
2125;
2126; SSE41-LABEL: uitofp_4i64_to_4f32_undef:
2127; SSE41:       # %bb.0:
2128; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,1]
2129; SSE41-NEXT:    pand %xmm0, %xmm1
2130; SSE41-NEXT:    movdqa %xmm0, %xmm2
2131; SSE41-NEXT:    psrlq $1, %xmm2
2132; SSE41-NEXT:    por %xmm1, %xmm2
2133; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
2134; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm0
2135; SSE41-NEXT:    pextrq $1, %xmm0, %rax
2136; SSE41-NEXT:    cvtsi2ss %rax, %xmm3
2137; SSE41-NEXT:    movq %xmm0, %rax
2138; SSE41-NEXT:    xorps %xmm2, %xmm2
2139; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
2140; SSE41-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],zero,zero
2141; SSE41-NEXT:    movaps %xmm2, %xmm3
2142; SSE41-NEXT:    addps %xmm2, %xmm3
2143; SSE41-NEXT:    movdqa %xmm1, %xmm0
2144; SSE41-NEXT:    blendvps %xmm0, %xmm3, %xmm2
2145; SSE41-NEXT:    movaps %xmm2, %xmm0
2146; SSE41-NEXT:    retq
2147;
2148; AVX1-LABEL: uitofp_4i64_to_4f32_undef:
2149; AVX1:       # %bb.0:
2150; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
2151; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm1
2152; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm2
2153; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
2154; AVX1-NEXT:    vblendvpd %ymm0, %ymm1, %ymm0, %ymm1
2155; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
2156; AVX1-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
2157; AVX1-NEXT:    vmovq %xmm1, %rax
2158; AVX1-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm3
2159; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
2160; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
2161; AVX1-NEXT:    vmovq %xmm1, %rax
2162; AVX1-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm3
2163; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
2164; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
2165; AVX1-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm1
2166; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
2167; AVX1-NEXT:    vaddps %xmm1, %xmm1, %xmm2
2168; AVX1-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
2169; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
2170; AVX1-NEXT:    vzeroupper
2171; AVX1-NEXT:    retq
2172;
2173; AVX2-LABEL: uitofp_4i64_to_4f32_undef:
2174; AVX2:       # %bb.0:
2175; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
2176; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1]
2177; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm1
2178; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm2
2179; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
2180; AVX2-NEXT:    vblendvpd %ymm0, %ymm1, %ymm0, %ymm1
2181; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
2182; AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
2183; AVX2-NEXT:    vmovq %xmm1, %rax
2184; AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm3
2185; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
2186; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
2187; AVX2-NEXT:    vmovq %xmm1, %rax
2188; AVX2-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm3
2189; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
2190; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
2191; AVX2-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm1
2192; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
2193; AVX2-NEXT:    vaddps %xmm1, %xmm1, %xmm2
2194; AVX2-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
2195; AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
2196; AVX2-NEXT:    vzeroupper
2197; AVX2-NEXT:    retq
2198;
2199; AVX512F-LABEL: uitofp_4i64_to_4f32_undef:
2200; AVX512F:       # %bb.0:
2201; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
2202; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
2203; AVX512F-NEXT:    vmovq %xmm0, %rax
2204; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm0
2205; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
2206; AVX512F-NEXT:    retq
2207;
2208; AVX512VL-LABEL: uitofp_4i64_to_4f32_undef:
2209; AVX512VL:       # %bb.0:
2210; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
2211; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
2212; AVX512VL-NEXT:    vmovq %xmm0, %rax
2213; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm0
2214; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
2215; AVX512VL-NEXT:    retq
2216;
2217; AVX512DQ-LABEL: uitofp_4i64_to_4f32_undef:
2218; AVX512DQ:       # %bb.0:
2219; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2220; AVX512DQ-NEXT:    vcvtuqq2ps %zmm0, %ymm0
2221; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2222; AVX512DQ-NEXT:    vzeroupper
2223; AVX512DQ-NEXT:    retq
2224;
2225; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32_undef:
2226; AVX512VLDQ:       # %bb.0:
2227; AVX512VLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
2228; AVX512VLDQ-NEXT:    vcvtuqq2ps %ymm0, %xmm0
2229; AVX512VLDQ-NEXT:    vzeroupper
2230; AVX512VLDQ-NEXT:    retq
2231  %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2232  %cvt = uitofp <4 x i64> %ext to <4 x float>
2233  ret <4 x float> %cvt
2234}
2235
2236define <4 x float> @uitofp_4i32_to_4f32(<4 x i32> %a) {
2237; SSE2-LABEL: uitofp_4i32_to_4f32:
2238; SSE2:       # %bb.0:
2239; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
2240; SSE2-NEXT:    pand %xmm0, %xmm1
2241; SSE2-NEXT:    por {{.*}}(%rip), %xmm1
2242; SSE2-NEXT:    psrld $16, %xmm0
2243; SSE2-NEXT:    por {{.*}}(%rip), %xmm0
2244; SSE2-NEXT:    subps {{.*}}(%rip), %xmm0
2245; SSE2-NEXT:    addps %xmm1, %xmm0
2246; SSE2-NEXT:    retq
2247;
2248; SSE41-LABEL: uitofp_4i32_to_4f32:
2249; SSE41:       # %bb.0:
2250; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200]
2251; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
2252; SSE41-NEXT:    psrld $16, %xmm0
2253; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
2254; SSE41-NEXT:    subps {{.*}}(%rip), %xmm0
2255; SSE41-NEXT:    addps %xmm1, %xmm0
2256; SSE41-NEXT:    retq
2257;
2258; AVX1-LABEL: uitofp_4i32_to_4f32:
2259; AVX1:       # %bb.0:
2260; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
2261; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
2262; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
2263; AVX1-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
2264; AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
2265; AVX1-NEXT:    retq
2266;
2267; AVX2-LABEL: uitofp_4i32_to_4f32:
2268; AVX2:       # %bb.0:
2269; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200]
2270; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
2271; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
2272; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928]
2273; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
2274; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
2275; AVX2-NEXT:    vsubps %xmm2, %xmm0, %xmm0
2276; AVX2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
2277; AVX2-NEXT:    retq
2278;
2279; AVX512F-LABEL: uitofp_4i32_to_4f32:
2280; AVX512F:       # %bb.0:
2281; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2282; AVX512F-NEXT:    vcvtudq2ps %zmm0, %zmm0
2283; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
2284; AVX512F-NEXT:    vzeroupper
2285; AVX512F-NEXT:    retq
2286;
2287; AVX512VL-LABEL: uitofp_4i32_to_4f32:
2288; AVX512VL:       # %bb.0:
2289; AVX512VL-NEXT:    vcvtudq2ps %xmm0, %xmm0
2290; AVX512VL-NEXT:    retq
2291;
2292; AVX512DQ-LABEL: uitofp_4i32_to_4f32:
2293; AVX512DQ:       # %bb.0:
2294; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2295; AVX512DQ-NEXT:    vcvtudq2ps %zmm0, %zmm0
2296; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
2297; AVX512DQ-NEXT:    vzeroupper
2298; AVX512DQ-NEXT:    retq
2299;
2300; AVX512VLDQ-LABEL: uitofp_4i32_to_4f32:
2301; AVX512VLDQ:       # %bb.0:
2302; AVX512VLDQ-NEXT:    vcvtudq2ps %xmm0, %xmm0
2303; AVX512VLDQ-NEXT:    retq
2304  %cvt = uitofp <4 x i32> %a to <4 x float>
2305  ret <4 x float> %cvt
2306}
2307
2308define <4 x float> @uitofp_4i16_to_4f32(<8 x i16> %a) {
2309; SSE2-LABEL: uitofp_4i16_to_4f32:
2310; SSE2:       # %bb.0:
2311; SSE2-NEXT:    pxor %xmm1, %xmm1
2312; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2313; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
2314; SSE2-NEXT:    retq
2315;
2316; SSE41-LABEL: uitofp_4i16_to_4f32:
2317; SSE41:       # %bb.0:
2318; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2319; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
2320; SSE41-NEXT:    retq
2321;
2322; AVX-LABEL: uitofp_4i16_to_4f32:
2323; AVX:       # %bb.0:
2324; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2325; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
2326; AVX-NEXT:    retq
2327  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2328  %cvt = uitofp <4 x i16> %shuf to <4 x float>
2329  ret <4 x float> %cvt
2330}
2331
2332define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) {
2333; SSE2-LABEL: uitofp_8i16_to_4f32:
2334; SSE2:       # %bb.0:
2335; SSE2-NEXT:    pxor %xmm1, %xmm1
2336; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2337; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
2338; SSE2-NEXT:    retq
2339;
2340; SSE41-LABEL: uitofp_8i16_to_4f32:
2341; SSE41:       # %bb.0:
2342; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2343; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
2344; SSE41-NEXT:    retq
2345;
2346; AVX1-LABEL: uitofp_8i16_to_4f32:
2347; AVX1:       # %bb.0:
2348; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2349; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2350; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2351; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2352; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
2353; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2354; AVX1-NEXT:    vzeroupper
2355; AVX1-NEXT:    retq
2356;
2357; AVX2-LABEL: uitofp_8i16_to_4f32:
2358; AVX2:       # %bb.0:
2359; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2360; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
2361; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2362; AVX2-NEXT:    vzeroupper
2363; AVX2-NEXT:    retq
2364;
2365; AVX512-LABEL: uitofp_8i16_to_4f32:
2366; AVX512:       # %bb.0:
2367; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2368; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
2369; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2370; AVX512-NEXT:    vzeroupper
2371; AVX512-NEXT:    retq
2372  %cvt = uitofp <8 x i16> %a to <8 x float>
2373  %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2374  ret <4 x float> %shuf
2375}
2376
2377define <4 x float> @uitofp_4i8_to_4f32(<16 x i8> %a) {
2378; SSE2-LABEL: uitofp_4i8_to_4f32:
2379; SSE2:       # %bb.0:
2380; SSE2-NEXT:    pxor %xmm1, %xmm1
2381; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2382; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2383; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
2384; SSE2-NEXT:    retq
2385;
2386; SSE41-LABEL: uitofp_4i8_to_4f32:
2387; SSE41:       # %bb.0:
2388; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2389; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
2390; SSE41-NEXT:    retq
2391;
2392; AVX-LABEL: uitofp_4i8_to_4f32:
2393; AVX:       # %bb.0:
2394; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2395; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
2396; AVX-NEXT:    retq
2397  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2398  %cvt = uitofp <4 x i8> %shuf to <4 x float>
2399  ret <4 x float> %cvt
2400}
2401
2402define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
2403; SSE2-LABEL: uitofp_16i8_to_4f32:
2404; SSE2:       # %bb.0:
2405; SSE2-NEXT:    pxor %xmm1, %xmm1
2406; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2407; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2408; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
2409; SSE2-NEXT:    retq
2410;
2411; SSE41-LABEL: uitofp_16i8_to_4f32:
2412; SSE41:       # %bb.0:
2413; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2414; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
2415; SSE41-NEXT:    retq
2416;
2417; AVX1-LABEL: uitofp_16i8_to_4f32:
2418; AVX1:       # %bb.0:
2419; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2420; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
2421; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2422; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2423; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
2424; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2425; AVX1-NEXT:    vzeroupper
2426; AVX1-NEXT:    retq
2427;
2428; AVX2-LABEL: uitofp_16i8_to_4f32:
2429; AVX2:       # %bb.0:
2430; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
2431; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
2432; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2433; AVX2-NEXT:    vzeroupper
2434; AVX2-NEXT:    retq
2435;
2436; AVX512-LABEL: uitofp_16i8_to_4f32:
2437; AVX512:       # %bb.0:
2438; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2439; AVX512-NEXT:    vcvtdq2ps %zmm0, %zmm0
2440; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
2441; AVX512-NEXT:    vzeroupper
2442; AVX512-NEXT:    retq
2443  %cvt = uitofp <16 x i8> %a to <16 x float>
2444  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2445  ret <4 x float> %shuf
2446}
2447
2448define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
2449; SSE2-LABEL: uitofp_4i64_to_4f32:
2450; SSE2:       # %bb.0:
2451; SSE2-NEXT:    movq %xmm1, %rax
2452; SSE2-NEXT:    testq %rax, %rax
2453; SSE2-NEXT:    js .LBB49_1
2454; SSE2-NEXT:  # %bb.2:
2455; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
2456; SSE2-NEXT:    jmp .LBB49_3
2457; SSE2-NEXT:  .LBB49_1:
2458; SSE2-NEXT:    movq %rax, %rcx
2459; SSE2-NEXT:    shrq %rcx
2460; SSE2-NEXT:    andl $1, %eax
2461; SSE2-NEXT:    orq %rcx, %rax
2462; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
2463; SSE2-NEXT:    addss %xmm2, %xmm2
2464; SSE2-NEXT:  .LBB49_3:
2465; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
2466; SSE2-NEXT:    movq %xmm1, %rax
2467; SSE2-NEXT:    testq %rax, %rax
2468; SSE2-NEXT:    js .LBB49_4
2469; SSE2-NEXT:  # %bb.5:
2470; SSE2-NEXT:    cvtsi2ss %rax, %xmm3
2471; SSE2-NEXT:    jmp .LBB49_6
2472; SSE2-NEXT:  .LBB49_4:
2473; SSE2-NEXT:    movq %rax, %rcx
2474; SSE2-NEXT:    shrq %rcx
2475; SSE2-NEXT:    andl $1, %eax
2476; SSE2-NEXT:    orq %rcx, %rax
2477; SSE2-NEXT:    cvtsi2ss %rax, %xmm3
2478; SSE2-NEXT:    addss %xmm3, %xmm3
2479; SSE2-NEXT:  .LBB49_6:
2480; SSE2-NEXT:    movq %xmm0, %rax
2481; SSE2-NEXT:    testq %rax, %rax
2482; SSE2-NEXT:    js .LBB49_7
2483; SSE2-NEXT:  # %bb.8:
2484; SSE2-NEXT:    xorps %xmm1, %xmm1
2485; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
2486; SSE2-NEXT:    jmp .LBB49_9
2487; SSE2-NEXT:  .LBB49_7:
2488; SSE2-NEXT:    movq %rax, %rcx
2489; SSE2-NEXT:    shrq %rcx
2490; SSE2-NEXT:    andl $1, %eax
2491; SSE2-NEXT:    orq %rcx, %rax
2492; SSE2-NEXT:    xorps %xmm1, %xmm1
2493; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
2494; SSE2-NEXT:    addss %xmm1, %xmm1
2495; SSE2-NEXT:  .LBB49_9:
2496; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2497; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
2498; SSE2-NEXT:    movq %xmm0, %rax
2499; SSE2-NEXT:    testq %rax, %rax
2500; SSE2-NEXT:    js .LBB49_10
2501; SSE2-NEXT:  # %bb.11:
2502; SSE2-NEXT:    xorps %xmm0, %xmm0
2503; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
2504; SSE2-NEXT:    jmp .LBB49_12
2505; SSE2-NEXT:  .LBB49_10:
2506; SSE2-NEXT:    movq %rax, %rcx
2507; SSE2-NEXT:    shrq %rcx
2508; SSE2-NEXT:    andl $1, %eax
2509; SSE2-NEXT:    orq %rcx, %rax
2510; SSE2-NEXT:    xorps %xmm0, %xmm0
2511; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
2512; SSE2-NEXT:    addss %xmm0, %xmm0
2513; SSE2-NEXT:  .LBB49_12:
2514; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2515; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2516; SSE2-NEXT:    movaps %xmm1, %xmm0
2517; SSE2-NEXT:    retq
2518;
2519; SSE41-LABEL: uitofp_4i64_to_4f32:
2520; SSE41:       # %bb.0:
2521; SSE41-NEXT:    movdqa %xmm1, %xmm2
2522; SSE41-NEXT:    movdqa %xmm0, %xmm1
2523; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [1,1]
2524; SSE41-NEXT:    pand %xmm4, %xmm0
2525; SSE41-NEXT:    movdqa %xmm1, %xmm3
2526; SSE41-NEXT:    psrlq $1, %xmm3
2527; SSE41-NEXT:    por %xmm0, %xmm3
2528; SSE41-NEXT:    movdqa %xmm1, %xmm5
2529; SSE41-NEXT:    movdqa %xmm1, %xmm0
2530; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm5
2531; SSE41-NEXT:    pextrq $1, %xmm5, %rax
2532; SSE41-NEXT:    xorps %xmm0, %xmm0
2533; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
2534; SSE41-NEXT:    movq %xmm5, %rax
2535; SSE41-NEXT:    xorps %xmm3, %xmm3
2536; SSE41-NEXT:    cvtsi2ss %rax, %xmm3
2537; SSE41-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[2,3]
2538; SSE41-NEXT:    pand %xmm2, %xmm4
2539; SSE41-NEXT:    movdqa %xmm2, %xmm5
2540; SSE41-NEXT:    psrlq $1, %xmm5
2541; SSE41-NEXT:    por %xmm4, %xmm5
2542; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
2543; SSE41-NEXT:    movaps %xmm2, %xmm0
2544; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm2
2545; SSE41-NEXT:    movq %xmm2, %rax
2546; SSE41-NEXT:    xorps %xmm0, %xmm0
2547; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
2548; SSE41-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0],xmm3[3]
2549; SSE41-NEXT:    pextrq $1, %xmm2, %rax
2550; SSE41-NEXT:    xorps %xmm0, %xmm0
2551; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
2552; SSE41-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm0[0]
2553; SSE41-NEXT:    movaps %xmm3, %xmm2
2554; SSE41-NEXT:    addps %xmm3, %xmm2
2555; SSE41-NEXT:    movaps %xmm1, %xmm0
2556; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm3
2557; SSE41-NEXT:    movaps %xmm3, %xmm0
2558; SSE41-NEXT:    retq
2559;
2560; AVX1-LABEL: uitofp_4i64_to_4f32:
2561; AVX1:       # %bb.0:
2562; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm1
2563; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2564; AVX1-NEXT:    vpsrlq $1, %xmm2, %xmm3
2565; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
2566; AVX1-NEXT:    vandpd {{.*}}(%rip), %ymm0, %ymm3
2567; AVX1-NEXT:    vorpd %ymm3, %ymm1, %ymm1
2568; AVX1-NEXT:    vblendvpd %ymm0, %ymm1, %ymm0, %ymm1
2569; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
2570; AVX1-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm3
2571; AVX1-NEXT:    vmovq %xmm1, %rax
2572; AVX1-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm4
2573; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
2574; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
2575; AVX1-NEXT:    vmovq %xmm1, %rax
2576; AVX1-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
2577; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
2578; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
2579; AVX1-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm1
2580; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
2581; AVX1-NEXT:    vaddps %xmm1, %xmm1, %xmm3
2582; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
2583; AVX1-NEXT:    vblendvps %xmm0, %xmm3, %xmm1, %xmm0
2584; AVX1-NEXT:    vzeroupper
2585; AVX1-NEXT:    retq
2586;
2587; AVX2-LABEL: uitofp_4i64_to_4f32:
2588; AVX2:       # %bb.0:
2589; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1]
2590; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm1
2591; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm2
2592; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
2593; AVX2-NEXT:    vblendvpd %ymm0, %ymm1, %ymm0, %ymm1
2594; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
2595; AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
2596; AVX2-NEXT:    vmovq %xmm1, %rax
2597; AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm3
2598; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
2599; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
2600; AVX2-NEXT:    vmovq %xmm1, %rax
2601; AVX2-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm3
2602; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
2603; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
2604; AVX2-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm1
2605; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
2606; AVX2-NEXT:    vaddps %xmm1, %xmm1, %xmm2
2607; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
2608; AVX2-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0
2609; AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
2610; AVX2-NEXT:    vzeroupper
2611; AVX2-NEXT:    retq
2612;
2613; AVX512F-LABEL: uitofp_4i64_to_4f32:
2614; AVX512F:       # %bb.0:
2615; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
2616; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
2617; AVX512F-NEXT:    vmovq %xmm0, %rax
2618; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm2
2619; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
2620; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
2621; AVX512F-NEXT:    vmovq %xmm0, %rax
2622; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm2
2623; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
2624; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
2625; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm0
2626; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
2627; AVX512F-NEXT:    vzeroupper
2628; AVX512F-NEXT:    retq
2629;
2630; AVX512VL-LABEL: uitofp_4i64_to_4f32:
2631; AVX512VL:       # %bb.0:
2632; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
2633; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
2634; AVX512VL-NEXT:    vmovq %xmm0, %rax
2635; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm2
2636; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
2637; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
2638; AVX512VL-NEXT:    vmovq %xmm0, %rax
2639; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm2
2640; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
2641; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
2642; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm0
2643; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
2644; AVX512VL-NEXT:    vzeroupper
2645; AVX512VL-NEXT:    retq
2646;
2647; AVX512DQ-LABEL: uitofp_4i64_to_4f32:
2648; AVX512DQ:       # %bb.0:
2649; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
2650; AVX512DQ-NEXT:    vcvtuqq2ps %zmm0, %ymm0
2651; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2652; AVX512DQ-NEXT:    vzeroupper
2653; AVX512DQ-NEXT:    retq
2654;
2655; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32:
2656; AVX512VLDQ:       # %bb.0:
2657; AVX512VLDQ-NEXT:    vcvtuqq2ps %ymm0, %xmm0
2658; AVX512VLDQ-NEXT:    vzeroupper
2659; AVX512VLDQ-NEXT:    retq
2660  %cvt = uitofp <4 x i64> %a to <4 x float>
2661  ret <4 x float> %cvt
2662}
2663
2664define <8 x float> @uitofp_8i32_to_8f32(<8 x i32> %a) {
2665; SSE2-LABEL: uitofp_8i32_to_8f32:
2666; SSE2:       # %bb.0:
2667; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
2668; SSE2-NEXT:    movdqa %xmm0, %xmm3
2669; SSE2-NEXT:    pand %xmm2, %xmm3
2670; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200]
2671; SSE2-NEXT:    por %xmm4, %xmm3
2672; SSE2-NEXT:    psrld $16, %xmm0
2673; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928]
2674; SSE2-NEXT:    por %xmm5, %xmm0
2675; SSE2-NEXT:    movaps {{.*#+}} xmm6 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
2676; SSE2-NEXT:    subps %xmm6, %xmm0
2677; SSE2-NEXT:    addps %xmm3, %xmm0
2678; SSE2-NEXT:    pand %xmm1, %xmm2
2679; SSE2-NEXT:    por %xmm4, %xmm2
2680; SSE2-NEXT:    psrld $16, %xmm1
2681; SSE2-NEXT:    por %xmm5, %xmm1
2682; SSE2-NEXT:    subps %xmm6, %xmm1
2683; SSE2-NEXT:    addps %xmm2, %xmm1
2684; SSE2-NEXT:    retq
2685;
2686; SSE41-LABEL: uitofp_8i32_to_8f32:
2687; SSE41:       # %bb.0:
2688; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200]
2689; SSE41-NEXT:    movdqa %xmm0, %xmm3
2690; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
2691; SSE41-NEXT:    psrld $16, %xmm0
2692; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [1392508928,1392508928,1392508928,1392508928]
2693; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
2694; SSE41-NEXT:    movaps {{.*#+}} xmm5 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
2695; SSE41-NEXT:    subps %xmm5, %xmm0
2696; SSE41-NEXT:    addps %xmm3, %xmm0
2697; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
2698; SSE41-NEXT:    psrld $16, %xmm1
2699; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
2700; SSE41-NEXT:    subps %xmm5, %xmm1
2701; SSE41-NEXT:    addps %xmm2, %xmm1
2702; SSE41-NEXT:    retq
2703;
2704; AVX1-LABEL: uitofp_8i32_to_8f32:
2705; AVX1:       # %bb.0:
2706; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
2707; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2708; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
2709; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2710; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
2711; AVX1-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
2712; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
2713; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
2714; AVX1-NEXT:    vaddps %ymm0, %ymm1, %ymm0
2715; AVX1-NEXT:    retq
2716;
2717; AVX2-LABEL: uitofp_8i32_to_8f32:
2718; AVX2:       # %bb.0:
2719; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
2720; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
2721; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
2722; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
2723; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
2724; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
2725; AVX2-NEXT:    vsubps %ymm2, %ymm0, %ymm0
2726; AVX2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
2727; AVX2-NEXT:    retq
2728;
2729; AVX512F-LABEL: uitofp_8i32_to_8f32:
2730; AVX512F:       # %bb.0:
2731; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
2732; AVX512F-NEXT:    vcvtudq2ps %zmm0, %zmm0
2733; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2734; AVX512F-NEXT:    retq
2735;
2736; AVX512VL-LABEL: uitofp_8i32_to_8f32:
2737; AVX512VL:       # %bb.0:
2738; AVX512VL-NEXT:    vcvtudq2ps %ymm0, %ymm0
2739; AVX512VL-NEXT:    retq
2740;
2741; AVX512DQ-LABEL: uitofp_8i32_to_8f32:
2742; AVX512DQ:       # %bb.0:
2743; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
2744; AVX512DQ-NEXT:    vcvtudq2ps %zmm0, %zmm0
2745; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2746; AVX512DQ-NEXT:    retq
2747;
2748; AVX512VLDQ-LABEL: uitofp_8i32_to_8f32:
2749; AVX512VLDQ:       # %bb.0:
2750; AVX512VLDQ-NEXT:    vcvtudq2ps %ymm0, %ymm0
2751; AVX512VLDQ-NEXT:    retq
2752  %cvt = uitofp <8 x i32> %a to <8 x float>
2753  ret <8 x float> %cvt
2754}
2755
2756define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) {
2757; SSE2-LABEL: uitofp_8i16_to_8f32:
2758; SSE2:       # %bb.0:
2759; SSE2-NEXT:    pxor %xmm1, %xmm1
2760; SSE2-NEXT:    movdqa %xmm0, %xmm2
2761; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
2762; SSE2-NEXT:    cvtdq2ps %xmm2, %xmm2
2763; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2764; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm1
2765; SSE2-NEXT:    movaps %xmm2, %xmm0
2766; SSE2-NEXT:    retq
2767;
2768; SSE41-LABEL: uitofp_8i16_to_8f32:
2769; SSE41:       # %bb.0:
2770; SSE41-NEXT:    pxor %xmm1, %xmm1
2771; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2772; SSE41-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2773; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm1
2774; SSE41-NEXT:    cvtdq2ps %xmm2, %xmm0
2775; SSE41-NEXT:    retq
2776;
2777; AVX1-LABEL: uitofp_8i16_to_8f32:
2778; AVX1:       # %bb.0:
2779; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2780; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2781; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2782; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2783; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
2784; AVX1-NEXT:    retq
2785;
2786; AVX2-LABEL: uitofp_8i16_to_8f32:
2787; AVX2:       # %bb.0:
2788; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2789; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
2790; AVX2-NEXT:    retq
2791;
2792; AVX512-LABEL: uitofp_8i16_to_8f32:
2793; AVX512:       # %bb.0:
2794; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2795; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
2796; AVX512-NEXT:    retq
2797  %cvt = uitofp <8 x i16> %a to <8 x float>
2798  ret <8 x float> %cvt
2799}
2800
2801define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) {
2802; SSE2-LABEL: uitofp_8i8_to_8f32:
2803; SSE2:       # %bb.0:
2804; SSE2-NEXT:    pxor %xmm1, %xmm1
2805; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2806; SSE2-NEXT:    movdqa %xmm0, %xmm2
2807; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
2808; SSE2-NEXT:    cvtdq2ps %xmm2, %xmm2
2809; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2810; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm1
2811; SSE2-NEXT:    movaps %xmm2, %xmm0
2812; SSE2-NEXT:    retq
2813;
2814; SSE41-LABEL: uitofp_8i8_to_8f32:
2815; SSE41:       # %bb.0:
2816; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2817; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm2
2818; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
2819; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2820; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm1
2821; SSE41-NEXT:    movaps %xmm2, %xmm0
2822; SSE41-NEXT:    retq
2823;
2824; AVX1-LABEL: uitofp_8i8_to_8f32:
2825; AVX1:       # %bb.0:
2826; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2827; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
2828; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2829; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2830; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
2831; AVX1-NEXT:    retq
2832;
2833; AVX2-LABEL: uitofp_8i8_to_8f32:
2834; AVX2:       # %bb.0:
2835; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
2836; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
2837; AVX2-NEXT:    retq
2838;
2839; AVX512-LABEL: uitofp_8i8_to_8f32:
2840; AVX512:       # %bb.0:
2841; AVX512-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
2842; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
2843; AVX512-NEXT:    retq
2844  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2845  %cvt = uitofp <8 x i8> %shuf to <8 x float>
2846  ret <8 x float> %cvt
2847}
2848
2849define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) {
2850; SSE2-LABEL: uitofp_16i8_to_8f32:
2851; SSE2:       # %bb.0:
2852; SSE2-NEXT:    pxor %xmm1, %xmm1
2853; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2854; SSE2-NEXT:    movdqa %xmm0, %xmm2
2855; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
2856; SSE2-NEXT:    cvtdq2ps %xmm2, %xmm2
2857; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2858; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm1
2859; SSE2-NEXT:    movaps %xmm2, %xmm0
2860; SSE2-NEXT:    retq
2861;
2862; SSE41-LABEL: uitofp_16i8_to_8f32:
2863; SSE41:       # %bb.0:
2864; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2865; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm2
2866; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
2867; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2868; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm1
2869; SSE41-NEXT:    movaps %xmm2, %xmm0
2870; SSE41-NEXT:    retq
2871;
2872; AVX1-LABEL: uitofp_16i8_to_8f32:
2873; AVX1:       # %bb.0:
2874; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2875; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
2876; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2877; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2878; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
2879; AVX1-NEXT:    retq
2880;
2881; AVX2-LABEL: uitofp_16i8_to_8f32:
2882; AVX2:       # %bb.0:
2883; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
2884; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
2885; AVX2-NEXT:    retq
2886;
2887; AVX512-LABEL: uitofp_16i8_to_8f32:
2888; AVX512:       # %bb.0:
2889; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2890; AVX512-NEXT:    vcvtdq2ps %zmm0, %zmm0
2891; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2892; AVX512-NEXT:    retq
2893  %cvt = uitofp <16 x i8> %a to <16 x float>
2894  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2895  ret <8 x float> %shuf
2896}
2897
2898;
2899; Load Signed Integer to Double
2900;
2901
2902define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) {
2903; SSE2-LABEL: sitofp_load_2i64_to_2f64:
2904; SSE2:       # %bb.0:
2905; SSE2-NEXT:    movdqa (%rdi), %xmm1
2906; SSE2-NEXT:    movq %xmm1, %rax
2907; SSE2-NEXT:    cvtsi2sd %rax, %xmm0
2908; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
2909; SSE2-NEXT:    movq %xmm1, %rax
2910; SSE2-NEXT:    xorps %xmm1, %xmm1
2911; SSE2-NEXT:    cvtsi2sd %rax, %xmm1
2912; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2913; SSE2-NEXT:    retq
2914;
2915; SSE41-LABEL: sitofp_load_2i64_to_2f64:
2916; SSE41:       # %bb.0:
2917; SSE41-NEXT:    movdqa (%rdi), %xmm0
2918; SSE41-NEXT:    pextrq $1, %xmm0, %rax
2919; SSE41-NEXT:    cvtsi2sd %rax, %xmm1
2920; SSE41-NEXT:    movq %xmm0, %rax
2921; SSE41-NEXT:    xorps %xmm0, %xmm0
2922; SSE41-NEXT:    cvtsi2sd %rax, %xmm0
2923; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2924; SSE41-NEXT:    retq
2925;
2926; VEX-LABEL: sitofp_load_2i64_to_2f64:
2927; VEX:       # %bb.0:
2928; VEX-NEXT:    vmovdqa (%rdi), %xmm0
2929; VEX-NEXT:    vpextrq $1, %xmm0, %rax
2930; VEX-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
2931; VEX-NEXT:    vmovq %xmm0, %rax
2932; VEX-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm0
2933; VEX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2934; VEX-NEXT:    retq
2935;
2936; AVX512F-LABEL: sitofp_load_2i64_to_2f64:
2937; AVX512F:       # %bb.0:
2938; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
2939; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
2940; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
2941; AVX512F-NEXT:    vmovq %xmm0, %rax
2942; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm0
2943; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2944; AVX512F-NEXT:    retq
2945;
2946; AVX512VL-LABEL: sitofp_load_2i64_to_2f64:
2947; AVX512VL:       # %bb.0:
2948; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
2949; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
2950; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
2951; AVX512VL-NEXT:    vmovq %xmm0, %rax
2952; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm0
2953; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2954; AVX512VL-NEXT:    retq
2955;
2956; AVX512DQ-LABEL: sitofp_load_2i64_to_2f64:
2957; AVX512DQ:       # %bb.0:
2958; AVX512DQ-NEXT:    vmovaps (%rdi), %xmm0
2959; AVX512DQ-NEXT:    vcvtqq2pd %zmm0, %zmm0
2960; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
2961; AVX512DQ-NEXT:    vzeroupper
2962; AVX512DQ-NEXT:    retq
2963;
2964; AVX512VLDQ-LABEL: sitofp_load_2i64_to_2f64:
2965; AVX512VLDQ:       # %bb.0:
2966; AVX512VLDQ-NEXT:    vcvtqq2pd (%rdi), %xmm0
2967; AVX512VLDQ-NEXT:    retq
2968  %ld = load <2 x i64>, <2 x i64> *%a
2969  %cvt = sitofp <2 x i64> %ld to <2 x double>
2970  ret <2 x double> %cvt
2971}
2972
2973define <2 x double> @sitofp_load_2i32_to_2f64(<2 x i32> *%a) {
2974; SSE-LABEL: sitofp_load_2i32_to_2f64:
2975; SSE:       # %bb.0:
2976; SSE-NEXT:    cvtdq2pd (%rdi), %xmm0
2977; SSE-NEXT:    retq
2978;
2979; AVX-LABEL: sitofp_load_2i32_to_2f64:
2980; AVX:       # %bb.0:
2981; AVX-NEXT:    vcvtdq2pd (%rdi), %xmm0
2982; AVX-NEXT:    retq
2983  %ld = load <2 x i32>, <2 x i32> *%a
2984  %cvt = sitofp <2 x i32> %ld to <2 x double>
2985  ret <2 x double> %cvt
2986}
2987
2988define <2 x double> @sitofp_volatile_load_4i32_to_2f64(<4 x i32> *%a) {
2989; SSE-LABEL: sitofp_volatile_load_4i32_to_2f64:
2990; SSE:       # %bb.0:
2991; SSE-NEXT:    movaps (%rdi), %xmm0
2992; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
2993; SSE-NEXT:    retq
2994;
2995; AVX-LABEL: sitofp_volatile_load_4i32_to_2f64:
2996; AVX:       # %bb.0:
2997; AVX-NEXT:    vmovaps (%rdi), %xmm0
2998; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
2999; AVX-NEXT:    retq
3000  %ld = load volatile <4 x i32>, <4 x i32> *%a
3001  %b = shufflevector <4 x i32> %ld, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
3002  %cvt = sitofp <2 x i32> %b to <2 x double>
3003  ret <2 x double> %cvt
3004}
3005
3006define <2 x double> @sitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
3007; SSE-LABEL: sitofp_load_4i32_to_2f64_2:
3008; SSE:       # %bb.0:
3009; SSE-NEXT:    cvtdq2pd (%rdi), %xmm0
3010; SSE-NEXT:    retq
3011;
3012; AVX-LABEL: sitofp_load_4i32_to_2f64_2:
3013; AVX:       # %bb.0:
3014; AVX-NEXT:    vcvtdq2pd (%rdi), %xmm0
3015; AVX-NEXT:    retq
3016  %a = load <4 x i32>, <4 x i32>* %x
3017  %b = sitofp <4 x i32> %a to <4 x double>
3018  %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> <i32 0, i32 1>
3019  ret <2 x double> %c
3020}
3021
3022define <2 x double> @sitofp_volatile_load_4i32_to_2f64_2(<4 x i32>* %x) {
3023; SSE-LABEL: sitofp_volatile_load_4i32_to_2f64_2:
3024; SSE:       # %bb.0:
3025; SSE-NEXT:    movaps (%rdi), %xmm0
3026; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
3027; SSE-NEXT:    retq
3028;
3029; AVX-LABEL: sitofp_volatile_load_4i32_to_2f64_2:
3030; AVX:       # %bb.0:
3031; AVX-NEXT:    vmovaps (%rdi), %xmm0
3032; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
3033; AVX-NEXT:    retq
3034  %a = load volatile <4 x i32>, <4 x i32>* %x
3035  %b = sitofp <4 x i32> %a to <4 x double>
3036  %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> <i32 0, i32 1>
3037  ret <2 x double> %c
3038}
3039
3040define <2 x double> @sitofp_load_2i16_to_2f64(<2 x i16> *%a) {
3041; SSE2-LABEL: sitofp_load_2i16_to_2f64:
3042; SSE2:       # %bb.0:
3043; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
3044; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
3045; SSE2-NEXT:    psrad $16, %xmm0
3046; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
3047; SSE2-NEXT:    retq
3048;
3049; SSE41-LABEL: sitofp_load_2i16_to_2f64:
3050; SSE41:       # %bb.0:
3051; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
3052; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
3053; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
3054; SSE41-NEXT:    retq
3055;
3056; AVX-LABEL: sitofp_load_2i16_to_2f64:
3057; AVX:       # %bb.0:
3058; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
3059; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
3060; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
3061; AVX-NEXT:    retq
3062  %ld = load <2 x i16>, <2 x i16> *%a
3063  %cvt = sitofp <2 x i16> %ld to <2 x double>
3064  ret <2 x double> %cvt
3065}
3066
3067define <2 x double> @sitofp_load_2i8_to_2f64(<2 x i8> *%a) {
3068; SSE2-LABEL: sitofp_load_2i8_to_2f64:
3069; SSE2:       # %bb.0:
3070; SSE2-NEXT:    movzwl (%rdi), %eax
3071; SSE2-NEXT:    movd %eax, %xmm0
3072; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3073; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
3074; SSE2-NEXT:    psrad $24, %xmm0
3075; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
3076; SSE2-NEXT:    retq
3077;
3078; SSE41-LABEL: sitofp_load_2i8_to_2f64:
3079; SSE41:       # %bb.0:
3080; SSE41-NEXT:    movzwl (%rdi), %eax
3081; SSE41-NEXT:    movd %eax, %xmm0
3082; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
3083; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
3084; SSE41-NEXT:    retq
3085;
3086; AVX-LABEL: sitofp_load_2i8_to_2f64:
3087; AVX:       # %bb.0:
3088; AVX-NEXT:    movzwl (%rdi), %eax
3089; AVX-NEXT:    vmovd %eax, %xmm0
3090; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
3091; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
3092; AVX-NEXT:    retq
3093  %ld = load <2 x i8>, <2 x i8> *%a
3094  %cvt = sitofp <2 x i8> %ld to <2 x double>
3095  ret <2 x double> %cvt
3096}
3097
3098define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
3099; SSE2-LABEL: sitofp_load_4i64_to_4f64:
3100; SSE2:       # %bb.0:
3101; SSE2-NEXT:    movdqa (%rdi), %xmm1
3102; SSE2-NEXT:    movdqa 16(%rdi), %xmm2
3103; SSE2-NEXT:    movq %xmm1, %rax
3104; SSE2-NEXT:    cvtsi2sd %rax, %xmm0
3105; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
3106; SSE2-NEXT:    movq %xmm1, %rax
3107; SSE2-NEXT:    xorps %xmm1, %xmm1
3108; SSE2-NEXT:    cvtsi2sd %rax, %xmm1
3109; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3110; SSE2-NEXT:    movq %xmm2, %rax
3111; SSE2-NEXT:    xorps %xmm1, %xmm1
3112; SSE2-NEXT:    cvtsi2sd %rax, %xmm1
3113; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
3114; SSE2-NEXT:    movq %xmm2, %rax
3115; SSE2-NEXT:    xorps %xmm2, %xmm2
3116; SSE2-NEXT:    cvtsi2sd %rax, %xmm2
3117; SSE2-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
3118; SSE2-NEXT:    retq
3119;
3120; SSE41-LABEL: sitofp_load_4i64_to_4f64:
3121; SSE41:       # %bb.0:
3122; SSE41-NEXT:    movdqa (%rdi), %xmm0
3123; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
3124; SSE41-NEXT:    pextrq $1, %xmm0, %rax
3125; SSE41-NEXT:    cvtsi2sd %rax, %xmm2
3126; SSE41-NEXT:    movq %xmm0, %rax
3127; SSE41-NEXT:    xorps %xmm0, %xmm0
3128; SSE41-NEXT:    cvtsi2sd %rax, %xmm0
3129; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3130; SSE41-NEXT:    pextrq $1, %xmm1, %rax
3131; SSE41-NEXT:    xorps %xmm2, %xmm2
3132; SSE41-NEXT:    cvtsi2sd %rax, %xmm2
3133; SSE41-NEXT:    movq %xmm1, %rax
3134; SSE41-NEXT:    xorps %xmm1, %xmm1
3135; SSE41-NEXT:    cvtsi2sd %rax, %xmm1
3136; SSE41-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
3137; SSE41-NEXT:    retq
3138;
3139; VEX-LABEL: sitofp_load_4i64_to_4f64:
3140; VEX:       # %bb.0:
3141; VEX-NEXT:    vmovapd (%rdi), %xmm0
3142; VEX-NEXT:    vmovdqa 16(%rdi), %xmm1
3143; VEX-NEXT:    vpextrq $1, %xmm1, %rax
3144; VEX-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
3145; VEX-NEXT:    vmovq %xmm1, %rax
3146; VEX-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
3147; VEX-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
3148; VEX-NEXT:    vpextrq $1, %xmm0, %rax
3149; VEX-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
3150; VEX-NEXT:    vmovq %xmm0, %rax
3151; VEX-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
3152; VEX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3153; VEX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3154; VEX-NEXT:    retq
3155;
3156; AVX512F-LABEL: sitofp_load_4i64_to_4f64:
3157; AVX512F:       # %bb.0:
3158; AVX512F-NEXT:    vmovapd (%rdi), %xmm0
3159; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
3160; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
3161; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
3162; AVX512F-NEXT:    vmovq %xmm1, %rax
3163; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
3164; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
3165; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
3166; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
3167; AVX512F-NEXT:    vmovq %xmm0, %rax
3168; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
3169; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3170; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3171; AVX512F-NEXT:    retq
3172;
3173; AVX512VL-LABEL: sitofp_load_4i64_to_4f64:
3174; AVX512VL:       # %bb.0:
3175; AVX512VL-NEXT:    vmovapd (%rdi), %xmm0
3176; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
3177; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
3178; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
3179; AVX512VL-NEXT:    vmovq %xmm1, %rax
3180; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
3181; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
3182; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
3183; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
3184; AVX512VL-NEXT:    vmovq %xmm0, %rax
3185; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
3186; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3187; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3188; AVX512VL-NEXT:    retq
3189;
3190; AVX512DQ-LABEL: sitofp_load_4i64_to_4f64:
3191; AVX512DQ:       # %bb.0:
3192; AVX512DQ-NEXT:    vmovaps (%rdi), %ymm0
3193; AVX512DQ-NEXT:    vcvtqq2pd %zmm0, %zmm0
3194; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3195; AVX512DQ-NEXT:    retq
3196;
3197; AVX512VLDQ-LABEL: sitofp_load_4i64_to_4f64:
3198; AVX512VLDQ:       # %bb.0:
3199; AVX512VLDQ-NEXT:    vcvtqq2pd (%rdi), %ymm0
3200; AVX512VLDQ-NEXT:    retq
3201  %ld = load <4 x i64>, <4 x i64> *%a
3202  %cvt = sitofp <4 x i64> %ld to <4 x double>
3203  ret <4 x double> %cvt
3204}
3205
3206define <4 x double> @sitofp_load_4i32_to_4f64(<4 x i32> *%a) {
3207; SSE-LABEL: sitofp_load_4i32_to_4f64:
3208; SSE:       # %bb.0:
3209; SSE-NEXT:    movdqa (%rdi), %xmm1
3210; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
3211; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
3212; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
3213; SSE-NEXT:    retq
3214;
3215; AVX-LABEL: sitofp_load_4i32_to_4f64:
3216; AVX:       # %bb.0:
3217; AVX-NEXT:    vcvtdq2pd (%rdi), %ymm0
3218; AVX-NEXT:    retq
3219  %ld = load <4 x i32>, <4 x i32> *%a
3220  %cvt = sitofp <4 x i32> %ld to <4 x double>
3221  ret <4 x double> %cvt
3222}
3223
3224define <4 x double> @sitofp_load_4i16_to_4f64(<4 x i16> *%a) {
3225; SSE2-LABEL: sitofp_load_4i16_to_4f64:
3226; SSE2:       # %bb.0:
3227; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3228; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3229; SSE2-NEXT:    psrad $16, %xmm1
3230; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm0
3231; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
3232; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
3233; SSE2-NEXT:    retq
3234;
3235; SSE41-LABEL: sitofp_load_4i16_to_4f64:
3236; SSE41:       # %bb.0:
3237; SSE41-NEXT:    pmovsxwd (%rdi), %xmm1
3238; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
3239; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
3240; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
3241; SSE41-NEXT:    retq
3242;
3243; AVX-LABEL: sitofp_load_4i16_to_4f64:
3244; AVX:       # %bb.0:
3245; AVX-NEXT:    vpmovsxwd (%rdi), %xmm0
3246; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
3247; AVX-NEXT:    retq
3248  %ld = load <4 x i16>, <4 x i16> *%a
3249  %cvt = sitofp <4 x i16> %ld to <4 x double>
3250  ret <4 x double> %cvt
3251}
3252
3253define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) {
3254; SSE2-LABEL: sitofp_load_4i8_to_4f64:
3255; SSE2:       # %bb.0:
3256; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
3257; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3258; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3259; SSE2-NEXT:    psrad $24, %xmm1
3260; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm0
3261; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
3262; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
3263; SSE2-NEXT:    retq
3264;
3265; SSE41-LABEL: sitofp_load_4i8_to_4f64:
3266; SSE41:       # %bb.0:
3267; SSE41-NEXT:    pmovsxbd (%rdi), %xmm1
3268; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
3269; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
3270; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
3271; SSE41-NEXT:    retq
3272;
3273; AVX-LABEL: sitofp_load_4i8_to_4f64:
3274; AVX:       # %bb.0:
3275; AVX-NEXT:    vpmovsxbd (%rdi), %xmm0
3276; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
3277; AVX-NEXT:    retq
3278  %ld = load <4 x i8>, <4 x i8> *%a
3279  %cvt = sitofp <4 x i8> %ld to <4 x double>
3280  ret <4 x double> %cvt
3281}
3282
3283;
3284; Load Unsigned Integer to Double
3285;
3286
3287define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) {
3288; SSE2-LABEL: uitofp_load_2i64_to_2f64:
3289; SSE2:       # %bb.0:
3290; SSE2-NEXT:    movdqa (%rdi), %xmm0
3291; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [4294967295,4294967295]
3292; SSE2-NEXT:    pand %xmm0, %xmm1
3293; SSE2-NEXT:    por {{.*}}(%rip), %xmm1
3294; SSE2-NEXT:    psrlq $32, %xmm0
3295; SSE2-NEXT:    por {{.*}}(%rip), %xmm0
3296; SSE2-NEXT:    subpd {{.*}}(%rip), %xmm0
3297; SSE2-NEXT:    addpd %xmm1, %xmm0
3298; SSE2-NEXT:    retq
3299;
3300; SSE41-LABEL: uitofp_load_2i64_to_2f64:
3301; SSE41:       # %bb.0:
3302; SSE41-NEXT:    movdqa (%rdi), %xmm0
3303; SSE41-NEXT:    pxor %xmm1, %xmm1
3304; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
3305; SSE41-NEXT:    por {{.*}}(%rip), %xmm1
3306; SSE41-NEXT:    psrlq $32, %xmm0
3307; SSE41-NEXT:    por {{.*}}(%rip), %xmm0
3308; SSE41-NEXT:    subpd {{.*}}(%rip), %xmm0
3309; SSE41-NEXT:    addpd %xmm1, %xmm0
3310; SSE41-NEXT:    retq
3311;
3312; AVX1-LABEL: uitofp_load_2i64_to_2f64:
3313; AVX1:       # %bb.0:
3314; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
3315; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3316; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
3317; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
3318; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
3319; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
3320; AVX1-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
3321; AVX1-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
3322; AVX1-NEXT:    retq
3323;
3324; AVX2-LABEL: uitofp_load_2i64_to_2f64:
3325; AVX2:       # %bb.0:
3326; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
3327; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3328; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
3329; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
3330; AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
3331; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
3332; AVX2-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
3333; AVX2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
3334; AVX2-NEXT:    retq
3335;
3336; AVX512F-LABEL: uitofp_load_2i64_to_2f64:
3337; AVX512F:       # %bb.0:
3338; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
3339; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3340; AVX512F-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
3341; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
3342; AVX512F-NEXT:    vpsrlq $32, %xmm0, %xmm0
3343; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
3344; AVX512F-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
3345; AVX512F-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
3346; AVX512F-NEXT:    retq
3347;
3348; AVX512VL-LABEL: uitofp_load_2i64_to_2f64:
3349; AVX512VL:       # %bb.0:
3350; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
3351; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3352; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
3353; AVX512VL-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
3354; AVX512VL-NEXT:    vpsrlq $32, %xmm0, %xmm0
3355; AVX512VL-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
3356; AVX512VL-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
3357; AVX512VL-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
3358; AVX512VL-NEXT:    retq
3359;
3360; AVX512DQ-LABEL: uitofp_load_2i64_to_2f64:
3361; AVX512DQ:       # %bb.0:
3362; AVX512DQ-NEXT:    vmovaps (%rdi), %xmm0
3363; AVX512DQ-NEXT:    vcvtuqq2pd %zmm0, %zmm0
3364; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
3365; AVX512DQ-NEXT:    vzeroupper
3366; AVX512DQ-NEXT:    retq
3367;
3368; AVX512VLDQ-LABEL: uitofp_load_2i64_to_2f64:
3369; AVX512VLDQ:       # %bb.0:
3370; AVX512VLDQ-NEXT:    vcvtuqq2pd (%rdi), %xmm0
3371; AVX512VLDQ-NEXT:    retq
3372  %ld = load <2 x i64>, <2 x i64> *%a
3373  %cvt = uitofp <2 x i64> %ld to <2 x double>
3374  ret <2 x double> %cvt
3375}
3376
3377define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) {
3378; SSE2-LABEL: uitofp_load_2i32_to_2f64:
3379; SSE2:       # %bb.0:
3380; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
3381; SSE2-NEXT:    xorpd %xmm1, %xmm1
3382; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3383; SSE2-NEXT:    movapd {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
3384; SSE2-NEXT:    orpd %xmm1, %xmm0
3385; SSE2-NEXT:    subpd %xmm1, %xmm0
3386; SSE2-NEXT:    retq
3387;
3388; SSE41-LABEL: uitofp_load_2i32_to_2f64:
3389; SSE41:       # %bb.0:
3390; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
3391; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
3392; SSE41-NEXT:    por %xmm1, %xmm0
3393; SSE41-NEXT:    subpd %xmm1, %xmm0
3394; SSE41-NEXT:    retq
3395;
3396; VEX-LABEL: uitofp_load_2i32_to_2f64:
3397; VEX:       # %bb.0:
3398; VEX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
3399; VEX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
3400; VEX-NEXT:    vpor %xmm1, %xmm0, %xmm0
3401; VEX-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
3402; VEX-NEXT:    retq
3403;
3404; AVX512F-LABEL: uitofp_load_2i32_to_2f64:
3405; AVX512F:       # %bb.0:
3406; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
3407; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
3408; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
3409; AVX512F-NEXT:    vzeroupper
3410; AVX512F-NEXT:    retq
3411;
3412; AVX512VL-LABEL: uitofp_load_2i32_to_2f64:
3413; AVX512VL:       # %bb.0:
3414; AVX512VL-NEXT:    vcvtudq2pd (%rdi), %xmm0
3415; AVX512VL-NEXT:    retq
3416;
3417; AVX512DQ-LABEL: uitofp_load_2i32_to_2f64:
3418; AVX512DQ:       # %bb.0:
3419; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
3420; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
3421; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
3422; AVX512DQ-NEXT:    vzeroupper
3423; AVX512DQ-NEXT:    retq
3424;
3425; AVX512VLDQ-LABEL: uitofp_load_2i32_to_2f64:
3426; AVX512VLDQ:       # %bb.0:
3427; AVX512VLDQ-NEXT:    vcvtudq2pd (%rdi), %xmm0
3428; AVX512VLDQ-NEXT:    retq
3429  %ld = load <2 x i32>, <2 x i32> *%a
3430  %cvt = uitofp <2 x i32> %ld to <2 x double>
3431  ret <2 x double> %cvt
3432}
3433
3434define <2 x double> @uitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
3435; SSE2-LABEL: uitofp_load_4i32_to_2f64_2:
3436; SSE2:       # %bb.0:
3437; SSE2-NEXT:    movapd (%rdi), %xmm0
3438; SSE2-NEXT:    xorpd %xmm1, %xmm1
3439; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3440; SSE2-NEXT:    movapd {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
3441; SSE2-NEXT:    orpd %xmm1, %xmm0
3442; SSE2-NEXT:    subpd %xmm1, %xmm0
3443; SSE2-NEXT:    retq
3444;
3445; SSE41-LABEL: uitofp_load_4i32_to_2f64_2:
3446; SSE41:       # %bb.0:
3447; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
3448; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
3449; SSE41-NEXT:    por %xmm1, %xmm0
3450; SSE41-NEXT:    subpd %xmm1, %xmm0
3451; SSE41-NEXT:    retq
3452;
3453; AVX1-LABEL: uitofp_load_4i32_to_2f64_2:
3454; AVX1:       # %bb.0:
3455; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
3456; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
3457; AVX1-NEXT:    # xmm1 = mem[0,0]
3458; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
3459; AVX1-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
3460; AVX1-NEXT:    retq
3461;
3462; AVX2-LABEL: uitofp_load_4i32_to_2f64_2:
3463; AVX2:       # %bb.0:
3464; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
3465; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
3466; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
3467; AVX2-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
3468; AVX2-NEXT:    vzeroupper
3469; AVX2-NEXT:    retq
3470;
3471; AVX512F-LABEL: uitofp_load_4i32_to_2f64_2:
3472; AVX512F:       # %bb.0:
3473; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
3474; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
3475; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
3476; AVX512F-NEXT:    vzeroupper
3477; AVX512F-NEXT:    retq
3478;
3479; AVX512VL-LABEL: uitofp_load_4i32_to_2f64_2:
3480; AVX512VL:       # %bb.0:
3481; AVX512VL-NEXT:    vcvtudq2pd (%rdi), %xmm0
3482; AVX512VL-NEXT:    retq
3483;
3484; AVX512DQ-LABEL: uitofp_load_4i32_to_2f64_2:
3485; AVX512DQ:       # %bb.0:
3486; AVX512DQ-NEXT:    vmovaps (%rdi), %xmm0
3487; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
3488; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
3489; AVX512DQ-NEXT:    vzeroupper
3490; AVX512DQ-NEXT:    retq
3491;
3492; AVX512VLDQ-LABEL: uitofp_load_4i32_to_2f64_2:
3493; AVX512VLDQ:       # %bb.0:
3494; AVX512VLDQ-NEXT:    vcvtudq2pd (%rdi), %xmm0
3495; AVX512VLDQ-NEXT:    retq
3496  %a = load <4 x i32>, <4 x i32>* %x
3497  %b = uitofp <4 x i32> %a to <4 x double>
3498  %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> <i32 0, i32 1>
3499  ret <2 x double> %c
3500}
3501
3502define <2 x double> @uitofp_volatile_load_4i32_to_2f64_2(<4 x i32>* %x) {
3503; SSE2-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
3504; SSE2:       # %bb.0:
3505; SSE2-NEXT:    movapd (%rdi), %xmm0
3506; SSE2-NEXT:    xorpd %xmm1, %xmm1
3507; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3508; SSE2-NEXT:    movapd {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
3509; SSE2-NEXT:    orpd %xmm1, %xmm0
3510; SSE2-NEXT:    subpd %xmm1, %xmm0
3511; SSE2-NEXT:    retq
3512;
3513; SSE41-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
3514; SSE41:       # %bb.0:
3515; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
3516; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
3517; SSE41-NEXT:    por %xmm1, %xmm0
3518; SSE41-NEXT:    subpd %xmm1, %xmm0
3519; SSE41-NEXT:    retq
3520;
3521; AVX1-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
3522; AVX1:       # %bb.0:
3523; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
3524; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
3525; AVX1-NEXT:    # xmm1 = mem[0,0]
3526; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
3527; AVX1-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
3528; AVX1-NEXT:    retq
3529;
3530; AVX2-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
3531; AVX2:       # %bb.0:
3532; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
3533; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
3534; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
3535; AVX2-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
3536; AVX2-NEXT:    vzeroupper
3537; AVX2-NEXT:    retq
3538;
3539; AVX512F-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
3540; AVX512F:       # %bb.0:
3541; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
3542; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
3543; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
3544; AVX512F-NEXT:    vzeroupper
3545; AVX512F-NEXT:    retq
3546;
3547; AVX512VL-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
3548; AVX512VL:       # %bb.0:
3549; AVX512VL-NEXT:    vmovaps (%rdi), %xmm0
3550; AVX512VL-NEXT:    vcvtudq2pd %xmm0, %xmm0
3551; AVX512VL-NEXT:    retq
3552;
3553; AVX512DQ-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
3554; AVX512DQ:       # %bb.0:
3555; AVX512DQ-NEXT:    vmovaps (%rdi), %xmm0
3556; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
3557; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
3558; AVX512DQ-NEXT:    vzeroupper
3559; AVX512DQ-NEXT:    retq
3560;
3561; AVX512VLDQ-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
3562; AVX512VLDQ:       # %bb.0:
3563; AVX512VLDQ-NEXT:    vmovaps (%rdi), %xmm0
3564; AVX512VLDQ-NEXT:    vcvtudq2pd %xmm0, %xmm0
3565; AVX512VLDQ-NEXT:    retq
3566  %a = load volatile <4 x i32>, <4 x i32>* %x
3567  %b = uitofp <4 x i32> %a to <4 x double>
3568  %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> <i32 0, i32 1>
3569  ret <2 x double> %c
3570}
3571
3572define <2 x double> @uitofp_load_2i16_to_2f64(<2 x i16> *%a) {
3573; SSE2-LABEL: uitofp_load_2i16_to_2f64:
3574; SSE2:       # %bb.0:
3575; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
3576; SSE2-NEXT:    pxor %xmm1, %xmm1
3577; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3578; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
3579; SSE2-NEXT:    retq
3580;
3581; SSE41-LABEL: uitofp_load_2i16_to_2f64:
3582; SSE41:       # %bb.0:
3583; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
3584; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
3585; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
3586; SSE41-NEXT:    retq
3587;
3588; AVX-LABEL: uitofp_load_2i16_to_2f64:
3589; AVX:       # %bb.0:
3590; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
3591; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
3592; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
3593; AVX-NEXT:    retq
3594  %ld = load <2 x i16>, <2 x i16> *%a
3595  %cvt = uitofp <2 x i16> %ld to <2 x double>
3596  ret <2 x double> %cvt
3597}
3598
3599define <2 x double> @uitofp_load_2i8_to_2f64(<2 x i8> *%a) {
3600; SSE2-LABEL: uitofp_load_2i8_to_2f64:
3601; SSE2:       # %bb.0:
3602; SSE2-NEXT:    movzwl (%rdi), %eax
3603; SSE2-NEXT:    movd %eax, %xmm0
3604; SSE2-NEXT:    pxor %xmm1, %xmm1
3605; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3606; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3607; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
3608; SSE2-NEXT:    retq
3609;
3610; SSE41-LABEL: uitofp_load_2i8_to_2f64:
3611; SSE41:       # %bb.0:
3612; SSE41-NEXT:    movzwl (%rdi), %eax
3613; SSE41-NEXT:    movd %eax, %xmm0
3614; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
3615; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
3616; SSE41-NEXT:    retq
3617;
3618; AVX-LABEL: uitofp_load_2i8_to_2f64:
3619; AVX:       # %bb.0:
3620; AVX-NEXT:    movzwl (%rdi), %eax
3621; AVX-NEXT:    vmovd %eax, %xmm0
3622; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
3623; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
3624; AVX-NEXT:    retq
3625  %ld = load <2 x i8>, <2 x i8> *%a
3626  %cvt = uitofp <2 x i8> %ld to <2 x double>
3627  ret <2 x double> %cvt
3628}
3629
3630define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) {
3631; SSE2-LABEL: uitofp_load_4i64_to_4f64:
3632; SSE2:       # %bb.0:
3633; SSE2-NEXT:    movdqa (%rdi), %xmm0
3634; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
3635; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
3636; SSE2-NEXT:    movdqa %xmm0, %xmm3
3637; SSE2-NEXT:    pand %xmm2, %xmm3
3638; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
3639; SSE2-NEXT:    por %xmm4, %xmm3
3640; SSE2-NEXT:    psrlq $32, %xmm0
3641; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
3642; SSE2-NEXT:    por %xmm5, %xmm0
3643; SSE2-NEXT:    movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
3644; SSE2-NEXT:    subpd %xmm6, %xmm0
3645; SSE2-NEXT:    addpd %xmm3, %xmm0
3646; SSE2-NEXT:    pand %xmm1, %xmm2
3647; SSE2-NEXT:    por %xmm4, %xmm2
3648; SSE2-NEXT:    psrlq $32, %xmm1
3649; SSE2-NEXT:    por %xmm5, %xmm1
3650; SSE2-NEXT:    subpd %xmm6, %xmm1
3651; SSE2-NEXT:    addpd %xmm2, %xmm1
3652; SSE2-NEXT:    retq
3653;
3654; SSE41-LABEL: uitofp_load_4i64_to_4f64:
3655; SSE41:       # %bb.0:
3656; SSE41-NEXT:    movdqa (%rdi), %xmm0
3657; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
3658; SSE41-NEXT:    pxor %xmm2, %xmm2
3659; SSE41-NEXT:    movdqa %xmm0, %xmm3
3660; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
3661; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
3662; SSE41-NEXT:    por %xmm4, %xmm3
3663; SSE41-NEXT:    psrlq $32, %xmm0
3664; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
3665; SSE41-NEXT:    por %xmm5, %xmm0
3666; SSE41-NEXT:    movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
3667; SSE41-NEXT:    subpd %xmm6, %xmm0
3668; SSE41-NEXT:    addpd %xmm3, %xmm0
3669; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
3670; SSE41-NEXT:    por %xmm4, %xmm2
3671; SSE41-NEXT:    psrlq $32, %xmm1
3672; SSE41-NEXT:    por %xmm5, %xmm1
3673; SSE41-NEXT:    subpd %xmm6, %xmm1
3674; SSE41-NEXT:    addpd %xmm2, %xmm1
3675; SSE41-NEXT:    retq
3676;
3677; AVX1-LABEL: uitofp_load_4i64_to_4f64:
3678; AVX1:       # %bb.0:
3679; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
3680; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4],ymm0[5],mem[6],ymm0[7]
3681; AVX1-NEXT:    vorps {{.*}}(%rip), %ymm0, %ymm0
3682; AVX1-NEXT:    vmovdqa (%rdi), %xmm1
3683; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm2
3684; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm1
3685; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm2
3686; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
3687; AVX1-NEXT:    vorpd {{.*}}(%rip), %ymm1, %ymm1
3688; AVX1-NEXT:    vsubpd {{.*}}(%rip), %ymm1, %ymm1
3689; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
3690; AVX1-NEXT:    retq
3691;
3692; AVX2-LABEL: uitofp_load_4i64_to_4f64:
3693; AVX2:       # %bb.0:
3694; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
3695; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3696; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
3697; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
3698; AVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
3699; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
3700; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
3701; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
3702; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
3703; AVX2-NEXT:    vsubpd %ymm2, %ymm0, %ymm0
3704; AVX2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
3705; AVX2-NEXT:    retq
3706;
3707; AVX512F-LABEL: uitofp_load_4i64_to_4f64:
3708; AVX512F:       # %bb.0:
3709; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
3710; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3711; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
3712; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
3713; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
3714; AVX512F-NEXT:    vpsrlq $32, %ymm0, %ymm0
3715; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
3716; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
3717; AVX512F-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
3718; AVX512F-NEXT:    vsubpd %ymm2, %ymm0, %ymm0
3719; AVX512F-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
3720; AVX512F-NEXT:    retq
3721;
3722; AVX512VL-LABEL: uitofp_load_4i64_to_4f64:
3723; AVX512VL:       # %bb.0:
3724; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
3725; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3726; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
3727; AVX512VL-NEXT:    vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1
3728; AVX512VL-NEXT:    vpsrlq $32, %ymm0, %ymm0
3729; AVX512VL-NEXT:    vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0
3730; AVX512VL-NEXT:    vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0
3731; AVX512VL-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
3732; AVX512VL-NEXT:    retq
3733;
3734; AVX512DQ-LABEL: uitofp_load_4i64_to_4f64:
3735; AVX512DQ:       # %bb.0:
3736; AVX512DQ-NEXT:    vmovaps (%rdi), %ymm0
3737; AVX512DQ-NEXT:    vcvtuqq2pd %zmm0, %zmm0
3738; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3739; AVX512DQ-NEXT:    retq
3740;
3741; AVX512VLDQ-LABEL: uitofp_load_4i64_to_4f64:
3742; AVX512VLDQ:       # %bb.0:
3743; AVX512VLDQ-NEXT:    vcvtuqq2pd (%rdi), %ymm0
3744; AVX512VLDQ-NEXT:    retq
3745  %ld = load <4 x i64>, <4 x i64> *%a
3746  %cvt = uitofp <4 x i64> %ld to <4 x double>
3747  ret <4 x double> %cvt
3748}
3749
3750define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
3751; SSE2-LABEL: uitofp_load_4i32_to_4f64:
3752; SSE2:       # %bb.0:
3753; SSE2-NEXT:    movapd (%rdi), %xmm1
3754; SSE2-NEXT:    xorpd %xmm2, %xmm2
3755; SSE2-NEXT:    movapd %xmm1, %xmm0
3756; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
3757; SSE2-NEXT:    movapd {{.*#+}} xmm3 = [4.503599627370496E+15,4.503599627370496E+15]
3758; SSE2-NEXT:    orpd %xmm3, %xmm0
3759; SSE2-NEXT:    subpd %xmm3, %xmm0
3760; SSE2-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3761; SSE2-NEXT:    orpd %xmm3, %xmm1
3762; SSE2-NEXT:    subpd %xmm3, %xmm1
3763; SSE2-NEXT:    retq
3764;
3765; SSE41-LABEL: uitofp_load_4i32_to_4f64:
3766; SSE41:       # %bb.0:
3767; SSE41-NEXT:    movdqa (%rdi), %xmm1
3768; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
3769; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15]
3770; SSE41-NEXT:    por %xmm2, %xmm0
3771; SSE41-NEXT:    subpd %xmm2, %xmm0
3772; SSE41-NEXT:    pxor %xmm3, %xmm3
3773; SSE41-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
3774; SSE41-NEXT:    por %xmm2, %xmm1
3775; SSE41-NEXT:    subpd %xmm2, %xmm1
3776; SSE41-NEXT:    retq
3777;
3778; AVX1-LABEL: uitofp_load_4i32_to_4f64:
3779; AVX1:       # %bb.0:
3780; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
3781; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3782; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3783; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
3784; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3785; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15]
3786; AVX1-NEXT:    vorpd %ymm1, %ymm0, %ymm0
3787; AVX1-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
3788; AVX1-NEXT:    retq
3789;
3790; AVX2-LABEL: uitofp_load_4i32_to_4f64:
3791; AVX2:       # %bb.0:
3792; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
3793; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15]
3794; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
3795; AVX2-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
3796; AVX2-NEXT:    retq
3797;
3798; AVX512F-LABEL: uitofp_load_4i32_to_4f64:
3799; AVX512F:       # %bb.0:
3800; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
3801; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
3802; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3803; AVX512F-NEXT:    retq
3804;
3805; AVX512VL-LABEL: uitofp_load_4i32_to_4f64:
3806; AVX512VL:       # %bb.0:
3807; AVX512VL-NEXT:    vcvtudq2pd (%rdi), %ymm0
3808; AVX512VL-NEXT:    retq
3809;
3810; AVX512DQ-LABEL: uitofp_load_4i32_to_4f64:
3811; AVX512DQ:       # %bb.0:
3812; AVX512DQ-NEXT:    vmovaps (%rdi), %xmm0
3813; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
3814; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3815; AVX512DQ-NEXT:    retq
3816;
3817; AVX512VLDQ-LABEL: uitofp_load_4i32_to_4f64:
3818; AVX512VLDQ:       # %bb.0:
3819; AVX512VLDQ-NEXT:    vcvtudq2pd (%rdi), %ymm0
3820; AVX512VLDQ-NEXT:    retq
3821  %ld = load <4 x i32>, <4 x i32> *%a
3822  %cvt = uitofp <4 x i32> %ld to <4 x double>
3823  ret <4 x double> %cvt
3824}
3825
3826define <4 x double> @uitofp_load_4i16_to_4f64(<4 x i16> *%a) {
3827; SSE2-LABEL: uitofp_load_4i16_to_4f64:
3828; SSE2:       # %bb.0:
3829; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
3830; SSE2-NEXT:    pxor %xmm0, %xmm0
3831; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3832; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm0
3833; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
3834; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
3835; SSE2-NEXT:    retq
3836;
3837; SSE41-LABEL: uitofp_load_4i16_to_4f64:
3838; SSE41:       # %bb.0:
3839; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
3840; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
3841; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
3842; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
3843; SSE41-NEXT:    retq
3844;
3845; AVX-LABEL: uitofp_load_4i16_to_4f64:
3846; AVX:       # %bb.0:
3847; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
3848; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
3849; AVX-NEXT:    retq
3850  %ld = load <4 x i16>, <4 x i16> *%a
3851  %cvt = uitofp <4 x i16> %ld to <4 x double>
3852  ret <4 x double> %cvt
3853}
3854
3855define <4 x double> @uitofp_load_4i8_to_4f64(<4 x i8> *%a) {
3856; SSE2-LABEL: uitofp_load_4i8_to_4f64:
3857; SSE2:       # %bb.0:
3858; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
3859; SSE2-NEXT:    pxor %xmm0, %xmm0
3860; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
3861; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3862; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm0
3863; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
3864; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
3865; SSE2-NEXT:    retq
3866;
3867; SSE41-LABEL: uitofp_load_4i8_to_4f64:
3868; SSE41:       # %bb.0:
3869; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
3870; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
3871; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
3872; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
3873; SSE41-NEXT:    retq
3874;
3875; AVX-LABEL: uitofp_load_4i8_to_4f64:
3876; AVX:       # %bb.0:
3877; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
3878; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
3879; AVX-NEXT:    retq
3880  %ld = load <4 x i8>, <4 x i8> *%a
3881  %cvt = uitofp <4 x i8> %ld to <4 x double>
3882  ret <4 x double> %cvt
3883}
3884
3885;
3886; Load Signed Integer to Float
3887;
3888
3889define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
3890; SSE2-LABEL: sitofp_load_4i64_to_4f32:
3891; SSE2:       # %bb.0:
3892; SSE2-NEXT:    movdqa (%rdi), %xmm1
3893; SSE2-NEXT:    movdqa 16(%rdi), %xmm0
3894; SSE2-NEXT:    movq %xmm0, %rax
3895; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
3896; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
3897; SSE2-NEXT:    movq %xmm0, %rax
3898; SSE2-NEXT:    xorps %xmm0, %xmm0
3899; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
3900; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3901; SSE2-NEXT:    movq %xmm1, %rax
3902; SSE2-NEXT:    xorps %xmm0, %xmm0
3903; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
3904; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
3905; SSE2-NEXT:    movq %xmm1, %rax
3906; SSE2-NEXT:    xorps %xmm1, %xmm1
3907; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
3908; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3909; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3910; SSE2-NEXT:    retq
3911;
3912; SSE41-LABEL: sitofp_load_4i64_to_4f32:
3913; SSE41:       # %bb.0:
3914; SSE41-NEXT:    movdqa (%rdi), %xmm0
3915; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
3916; SSE41-NEXT:    pextrq $1, %xmm0, %rax
3917; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
3918; SSE41-NEXT:    movq %xmm0, %rax
3919; SSE41-NEXT:    xorps %xmm0, %xmm0
3920; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
3921; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
3922; SSE41-NEXT:    movq %xmm1, %rax
3923; SSE41-NEXT:    xorps %xmm2, %xmm2
3924; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
3925; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
3926; SSE41-NEXT:    pextrq $1, %xmm1, %rax
3927; SSE41-NEXT:    xorps %xmm1, %xmm1
3928; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
3929; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
3930; SSE41-NEXT:    retq
3931;
3932; VEX-LABEL: sitofp_load_4i64_to_4f32:
3933; VEX:       # %bb.0:
3934; VEX-NEXT:    vmovdqa (%rdi), %xmm0
3935; VEX-NEXT:    vmovdqa 16(%rdi), %xmm1
3936; VEX-NEXT:    vpextrq $1, %xmm0, %rax
3937; VEX-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
3938; VEX-NEXT:    vmovq %xmm0, %rax
3939; VEX-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
3940; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
3941; VEX-NEXT:    vmovq %xmm1, %rax
3942; VEX-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
3943; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
3944; VEX-NEXT:    vpextrq $1, %xmm1, %rax
3945; VEX-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
3946; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
3947; VEX-NEXT:    retq
3948;
3949; AVX512F-LABEL: sitofp_load_4i64_to_4f32:
3950; AVX512F:       # %bb.0:
3951; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
3952; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
3953; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
3954; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
3955; AVX512F-NEXT:    vmovq %xmm0, %rax
3956; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
3957; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
3958; AVX512F-NEXT:    vmovq %xmm1, %rax
3959; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
3960; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
3961; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
3962; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
3963; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
3964; AVX512F-NEXT:    retq
3965;
3966; AVX512VL-LABEL: sitofp_load_4i64_to_4f32:
3967; AVX512VL:       # %bb.0:
3968; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
3969; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
3970; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
3971; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
3972; AVX512VL-NEXT:    vmovq %xmm0, %rax
3973; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
3974; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
3975; AVX512VL-NEXT:    vmovq %xmm1, %rax
3976; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
3977; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
3978; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
3979; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
3980; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
3981; AVX512VL-NEXT:    retq
3982;
3983; AVX512DQ-LABEL: sitofp_load_4i64_to_4f32:
3984; AVX512DQ:       # %bb.0:
3985; AVX512DQ-NEXT:    vmovaps (%rdi), %ymm0
3986; AVX512DQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
3987; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3988; AVX512DQ-NEXT:    vzeroupper
3989; AVX512DQ-NEXT:    retq
3990;
3991; AVX512VLDQ-LABEL: sitofp_load_4i64_to_4f32:
3992; AVX512VLDQ:       # %bb.0:
3993; AVX512VLDQ-NEXT:    vcvtqq2psy (%rdi), %xmm0
3994; AVX512VLDQ-NEXT:    retq
3995  %ld = load <4 x i64>, <4 x i64> *%a
3996  %cvt = sitofp <4 x i64> %ld to <4 x float>
3997  ret <4 x float> %cvt
3998}
3999
4000define <4 x float> @sitofp_load_4i32_to_4f32(<4 x i32> *%a) {
4001; SSE-LABEL: sitofp_load_4i32_to_4f32:
4002; SSE:       # %bb.0:
4003; SSE-NEXT:    cvtdq2ps (%rdi), %xmm0
4004; SSE-NEXT:    retq
4005;
4006; AVX-LABEL: sitofp_load_4i32_to_4f32:
4007; AVX:       # %bb.0:
4008; AVX-NEXT:    vcvtdq2ps (%rdi), %xmm0
4009; AVX-NEXT:    retq
4010  %ld = load <4 x i32>, <4 x i32> *%a
4011  %cvt = sitofp <4 x i32> %ld to <4 x float>
4012  ret <4 x float> %cvt
4013}
4014
4015define <4 x float> @sitofp_load_4i16_to_4f32(<4 x i16> *%a) {
4016; SSE2-LABEL: sitofp_load_4i16_to_4f32:
4017; SSE2:       # %bb.0:
4018; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
4019; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
4020; SSE2-NEXT:    psrad $16, %xmm0
4021; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
4022; SSE2-NEXT:    retq
4023;
4024; SSE41-LABEL: sitofp_load_4i16_to_4f32:
4025; SSE41:       # %bb.0:
4026; SSE41-NEXT:    pmovsxwd (%rdi), %xmm0
4027; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
4028; SSE41-NEXT:    retq
4029;
4030; AVX-LABEL: sitofp_load_4i16_to_4f32:
4031; AVX:       # %bb.0:
4032; AVX-NEXT:    vpmovsxwd (%rdi), %xmm0
4033; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
4034; AVX-NEXT:    retq
4035  %ld = load <4 x i16>, <4 x i16> *%a
4036  %cvt = sitofp <4 x i16> %ld to <4 x float>
4037  ret <4 x float> %cvt
4038}
4039
4040define <4 x float> @sitofp_load_4i8_to_4f32(<4 x i8> *%a) {
4041; SSE2-LABEL: sitofp_load_4i8_to_4f32:
4042; SSE2:       # %bb.0:
4043; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
4044; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
4045; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
4046; SSE2-NEXT:    psrad $24, %xmm0
4047; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
4048; SSE2-NEXT:    retq
4049;
4050; SSE41-LABEL: sitofp_load_4i8_to_4f32:
4051; SSE41:       # %bb.0:
4052; SSE41-NEXT:    pmovsxbd (%rdi), %xmm0
4053; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
4054; SSE41-NEXT:    retq
4055;
4056; AVX-LABEL: sitofp_load_4i8_to_4f32:
4057; AVX:       # %bb.0:
4058; AVX-NEXT:    vpmovsxbd (%rdi), %xmm0
4059; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
4060; AVX-NEXT:    retq
4061  %ld = load <4 x i8>, <4 x i8> *%a
4062  %cvt = sitofp <4 x i8> %ld to <4 x float>
4063  ret <4 x float> %cvt
4064}
4065
4066define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
4067; SSE2-LABEL: sitofp_load_8i64_to_8f32:
4068; SSE2:       # %bb.0:
4069; SSE2-NEXT:    movdqa (%rdi), %xmm1
4070; SSE2-NEXT:    movdqa 16(%rdi), %xmm0
4071; SSE2-NEXT:    movdqa 32(%rdi), %xmm2
4072; SSE2-NEXT:    movdqa 48(%rdi), %xmm3
4073; SSE2-NEXT:    movq %xmm0, %rax
4074; SSE2-NEXT:    cvtsi2ss %rax, %xmm4
4075; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
4076; SSE2-NEXT:    movq %xmm0, %rax
4077; SSE2-NEXT:    xorps %xmm0, %xmm0
4078; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
4079; SSE2-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
4080; SSE2-NEXT:    movq %xmm1, %rax
4081; SSE2-NEXT:    xorps %xmm0, %xmm0
4082; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
4083; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
4084; SSE2-NEXT:    movq %xmm1, %rax
4085; SSE2-NEXT:    xorps %xmm1, %xmm1
4086; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
4087; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4088; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
4089; SSE2-NEXT:    movq %xmm3, %rax
4090; SSE2-NEXT:    xorps %xmm4, %xmm4
4091; SSE2-NEXT:    cvtsi2ss %rax, %xmm4
4092; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
4093; SSE2-NEXT:    movq %xmm1, %rax
4094; SSE2-NEXT:    xorps %xmm1, %xmm1
4095; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
4096; SSE2-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
4097; SSE2-NEXT:    movq %xmm2, %rax
4098; SSE2-NEXT:    xorps %xmm1, %xmm1
4099; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
4100; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
4101; SSE2-NEXT:    movq %xmm2, %rax
4102; SSE2-NEXT:    xorps %xmm2, %xmm2
4103; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
4104; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4105; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
4106; SSE2-NEXT:    retq
4107;
4108; SSE41-LABEL: sitofp_load_8i64_to_8f32:
4109; SSE41:       # %bb.0:
4110; SSE41-NEXT:    movdqa (%rdi), %xmm0
4111; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
4112; SSE41-NEXT:    movdqa 32(%rdi), %xmm2
4113; SSE41-NEXT:    movdqa 48(%rdi), %xmm3
4114; SSE41-NEXT:    pextrq $1, %xmm0, %rax
4115; SSE41-NEXT:    cvtsi2ss %rax, %xmm4
4116; SSE41-NEXT:    movq %xmm0, %rax
4117; SSE41-NEXT:    xorps %xmm0, %xmm0
4118; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
4119; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
4120; SSE41-NEXT:    movq %xmm1, %rax
4121; SSE41-NEXT:    xorps %xmm4, %xmm4
4122; SSE41-NEXT:    cvtsi2ss %rax, %xmm4
4123; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0],xmm0[3]
4124; SSE41-NEXT:    pextrq $1, %xmm1, %rax
4125; SSE41-NEXT:    xorps %xmm1, %xmm1
4126; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
4127; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
4128; SSE41-NEXT:    pextrq $1, %xmm2, %rax
4129; SSE41-NEXT:    xorps %xmm4, %xmm4
4130; SSE41-NEXT:    cvtsi2ss %rax, %xmm4
4131; SSE41-NEXT:    movq %xmm2, %rax
4132; SSE41-NEXT:    xorps %xmm1, %xmm1
4133; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
4134; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2,3]
4135; SSE41-NEXT:    movq %xmm3, %rax
4136; SSE41-NEXT:    xorps %xmm2, %xmm2
4137; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
4138; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
4139; SSE41-NEXT:    pextrq $1, %xmm3, %rax
4140; SSE41-NEXT:    xorps %xmm2, %xmm2
4141; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
4142; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
4143; SSE41-NEXT:    retq
4144;
4145; VEX-LABEL: sitofp_load_8i64_to_8f32:
4146; VEX:       # %bb.0:
4147; VEX-NEXT:    vmovaps (%rdi), %xmm0
4148; VEX-NEXT:    vmovdqa 16(%rdi), %xmm1
4149; VEX-NEXT:    vmovdqa 32(%rdi), %xmm2
4150; VEX-NEXT:    vmovdqa 48(%rdi), %xmm3
4151; VEX-NEXT:    vpextrq $1, %xmm2, %rax
4152; VEX-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm4
4153; VEX-NEXT:    vmovq %xmm2, %rax
4154; VEX-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm2
4155; VEX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
4156; VEX-NEXT:    vmovq %xmm3, %rax
4157; VEX-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
4158; VEX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
4159; VEX-NEXT:    vpextrq $1, %xmm3, %rax
4160; VEX-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
4161; VEX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
4162; VEX-NEXT:    vpextrq $1, %xmm0, %rax
4163; VEX-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
4164; VEX-NEXT:    vmovq %xmm0, %rax
4165; VEX-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm0
4166; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
4167; VEX-NEXT:    vmovq %xmm1, %rax
4168; VEX-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
4169; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
4170; VEX-NEXT:    vpextrq $1, %xmm1, %rax
4171; VEX-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm1
4172; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
4173; VEX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
4174; VEX-NEXT:    retq
4175;
4176; AVX512F-LABEL: sitofp_load_8i64_to_8f32:
4177; AVX512F:       # %bb.0:
4178; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
4179; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
4180; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
4181; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
4182; AVX512F-NEXT:    vpextrq $1, %xmm2, %rax
4183; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm4
4184; AVX512F-NEXT:    vmovq %xmm2, %rax
4185; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm2
4186; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
4187; AVX512F-NEXT:    vmovq %xmm3, %rax
4188; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
4189; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
4190; AVX512F-NEXT:    vpextrq $1, %xmm3, %rax
4191; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
4192; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
4193; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
4194; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
4195; AVX512F-NEXT:    vmovq %xmm0, %rax
4196; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm0
4197; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
4198; AVX512F-NEXT:    vmovq %xmm1, %rax
4199; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
4200; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
4201; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
4202; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm1
4203; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
4204; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
4205; AVX512F-NEXT:    retq
4206;
4207; AVX512VL-LABEL: sitofp_load_8i64_to_8f32:
4208; AVX512VL:       # %bb.0:
4209; AVX512VL-NEXT:    vmovaps (%rdi), %xmm0
4210; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
4211; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
4212; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
4213; AVX512VL-NEXT:    vpextrq $1, %xmm2, %rax
4214; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm4
4215; AVX512VL-NEXT:    vmovq %xmm2, %rax
4216; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm2
4217; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
4218; AVX512VL-NEXT:    vmovq %xmm3, %rax
4219; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
4220; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
4221; AVX512VL-NEXT:    vpextrq $1, %xmm3, %rax
4222; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
4223; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
4224; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
4225; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
4226; AVX512VL-NEXT:    vmovq %xmm0, %rax
4227; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm0
4228; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
4229; AVX512VL-NEXT:    vmovq %xmm1, %rax
4230; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
4231; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
4232; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
4233; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm1
4234; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
4235; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
4236; AVX512VL-NEXT:    retq
4237;
4238; AVX512DQ-LABEL: sitofp_load_8i64_to_8f32:
4239; AVX512DQ:       # %bb.0:
4240; AVX512DQ-NEXT:    vcvtqq2ps (%rdi), %ymm0
4241; AVX512DQ-NEXT:    retq
4242;
4243; AVX512VLDQ-LABEL: sitofp_load_8i64_to_8f32:
4244; AVX512VLDQ:       # %bb.0:
4245; AVX512VLDQ-NEXT:    vcvtqq2ps (%rdi), %ymm0
4246; AVX512VLDQ-NEXT:    retq
4247  %ld = load <8 x i64>, <8 x i64> *%a
4248  %cvt = sitofp <8 x i64> %ld to <8 x float>
4249  ret <8 x float> %cvt
4250}
4251
4252define <8 x float> @sitofp_load_8i32_to_8f32(<8 x i32> *%a) {
4253; SSE-LABEL: sitofp_load_8i32_to_8f32:
4254; SSE:       # %bb.0:
4255; SSE-NEXT:    cvtdq2ps (%rdi), %xmm0
4256; SSE-NEXT:    cvtdq2ps 16(%rdi), %xmm1
4257; SSE-NEXT:    retq
4258;
4259; AVX-LABEL: sitofp_load_8i32_to_8f32:
4260; AVX:       # %bb.0:
4261; AVX-NEXT:    vcvtdq2ps (%rdi), %ymm0
4262; AVX-NEXT:    retq
4263  %ld = load <8 x i32>, <8 x i32> *%a
4264  %cvt = sitofp <8 x i32> %ld to <8 x float>
4265  ret <8 x float> %cvt
4266}
4267
4268define <8 x float> @sitofp_load_8i16_to_8f32(<8 x i16> *%a) {
4269; SSE2-LABEL: sitofp_load_8i16_to_8f32:
4270; SSE2:       # %bb.0:
4271; SSE2-NEXT:    movdqa (%rdi), %xmm1
4272; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4273; SSE2-NEXT:    psrad $16, %xmm0
4274; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
4275; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
4276; SSE2-NEXT:    psrad $16, %xmm1
4277; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm1
4278; SSE2-NEXT:    retq
4279;
4280; SSE41-LABEL: sitofp_load_8i16_to_8f32:
4281; SSE41:       # %bb.0:
4282; SSE41-NEXT:    pmovsxwd 8(%rdi), %xmm1
4283; SSE41-NEXT:    pmovsxwd (%rdi), %xmm0
4284; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
4285; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm1
4286; SSE41-NEXT:    retq
4287;
4288; AVX1-LABEL: sitofp_load_8i16_to_8f32:
4289; AVX1:       # %bb.0:
4290; AVX1-NEXT:    vpmovsxwd 8(%rdi), %xmm0
4291; AVX1-NEXT:    vpmovsxwd (%rdi), %xmm1
4292; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
4293; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
4294; AVX1-NEXT:    retq
4295;
4296; AVX2-LABEL: sitofp_load_8i16_to_8f32:
4297; AVX2:       # %bb.0:
4298; AVX2-NEXT:    vpmovsxwd (%rdi), %ymm0
4299; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
4300; AVX2-NEXT:    retq
4301;
4302; AVX512-LABEL: sitofp_load_8i16_to_8f32:
4303; AVX512:       # %bb.0:
4304; AVX512-NEXT:    vpmovsxwd (%rdi), %ymm0
4305; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
4306; AVX512-NEXT:    retq
4307  %ld = load <8 x i16>, <8 x i16> *%a
4308  %cvt = sitofp <8 x i16> %ld to <8 x float>
4309  ret <8 x float> %cvt
4310}
4311
4312define <8 x float> @sitofp_load_8i8_to_8f32(<8 x i8> *%a) {
4313; SSE2-LABEL: sitofp_load_8i8_to_8f32:
4314; SSE2:       # %bb.0:
4315; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
4316; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
4317; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4318; SSE2-NEXT:    psrad $24, %xmm0
4319; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
4320; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
4321; SSE2-NEXT:    psrad $24, %xmm1
4322; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm1
4323; SSE2-NEXT:    retq
4324;
4325; SSE41-LABEL: sitofp_load_8i8_to_8f32:
4326; SSE41:       # %bb.0:
4327; SSE41-NEXT:    pmovsxbd 4(%rdi), %xmm1
4328; SSE41-NEXT:    pmovsxbd (%rdi), %xmm0
4329; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
4330; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm1
4331; SSE41-NEXT:    retq
4332;
4333; AVX1-LABEL: sitofp_load_8i8_to_8f32:
4334; AVX1:       # %bb.0:
4335; AVX1-NEXT:    vpmovsxbd 4(%rdi), %xmm0
4336; AVX1-NEXT:    vpmovsxbd (%rdi), %xmm1
4337; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
4338; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
4339; AVX1-NEXT:    retq
4340;
4341; AVX2-LABEL: sitofp_load_8i8_to_8f32:
4342; AVX2:       # %bb.0:
4343; AVX2-NEXT:    vpmovsxbd (%rdi), %ymm0
4344; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
4345; AVX2-NEXT:    retq
4346;
4347; AVX512-LABEL: sitofp_load_8i8_to_8f32:
4348; AVX512:       # %bb.0:
4349; AVX512-NEXT:    vpmovsxbd (%rdi), %ymm0
4350; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
4351; AVX512-NEXT:    retq
4352  %ld = load <8 x i8>, <8 x i8> *%a
4353  %cvt = sitofp <8 x i8> %ld to <8 x float>
4354  ret <8 x float> %cvt
4355}
4356
4357;
4358; Load Unsigned Integer to Float
4359;
4360
4361define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
4362; SSE2-LABEL: uitofp_load_4i64_to_4f32:
4363; SSE2:       # %bb.0:
4364; SSE2-NEXT:    movdqa 16(%rdi), %xmm0
4365; SSE2-NEXT:    movq %xmm0, %rax
4366; SSE2-NEXT:    testq %rax, %rax
4367; SSE2-NEXT:    js .LBB83_1
4368; SSE2-NEXT:  # %bb.2:
4369; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
4370; SSE2-NEXT:    jmp .LBB83_3
4371; SSE2-NEXT:  .LBB83_1:
4372; SSE2-NEXT:    movq %rax, %rcx
4373; SSE2-NEXT:    shrq %rcx
4374; SSE2-NEXT:    andl $1, %eax
4375; SSE2-NEXT:    orq %rcx, %rax
4376; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
4377; SSE2-NEXT:    addss %xmm1, %xmm1
4378; SSE2-NEXT:  .LBB83_3:
4379; SSE2-NEXT:    movdqa (%rdi), %xmm2
4380; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
4381; SSE2-NEXT:    movq %xmm0, %rax
4382; SSE2-NEXT:    testq %rax, %rax
4383; SSE2-NEXT:    js .LBB83_4
4384; SSE2-NEXT:  # %bb.5:
4385; SSE2-NEXT:    cvtsi2ss %rax, %xmm3
4386; SSE2-NEXT:    jmp .LBB83_6
4387; SSE2-NEXT:  .LBB83_4:
4388; SSE2-NEXT:    movq %rax, %rcx
4389; SSE2-NEXT:    shrq %rcx
4390; SSE2-NEXT:    andl $1, %eax
4391; SSE2-NEXT:    orq %rcx, %rax
4392; SSE2-NEXT:    cvtsi2ss %rax, %xmm3
4393; SSE2-NEXT:    addss %xmm3, %xmm3
4394; SSE2-NEXT:  .LBB83_6:
4395; SSE2-NEXT:    movq %xmm2, %rax
4396; SSE2-NEXT:    testq %rax, %rax
4397; SSE2-NEXT:    js .LBB83_7
4398; SSE2-NEXT:  # %bb.8:
4399; SSE2-NEXT:    xorps %xmm0, %xmm0
4400; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
4401; SSE2-NEXT:    jmp .LBB83_9
4402; SSE2-NEXT:  .LBB83_7:
4403; SSE2-NEXT:    movq %rax, %rcx
4404; SSE2-NEXT:    shrq %rcx
4405; SSE2-NEXT:    andl $1, %eax
4406; SSE2-NEXT:    orq %rcx, %rax
4407; SSE2-NEXT:    xorps %xmm0, %xmm0
4408; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
4409; SSE2-NEXT:    addss %xmm0, %xmm0
4410; SSE2-NEXT:  .LBB83_9:
4411; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
4412; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
4413; SSE2-NEXT:    movq %xmm2, %rax
4414; SSE2-NEXT:    testq %rax, %rax
4415; SSE2-NEXT:    js .LBB83_10
4416; SSE2-NEXT:  # %bb.11:
4417; SSE2-NEXT:    xorps %xmm2, %xmm2
4418; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
4419; SSE2-NEXT:    jmp .LBB83_12
4420; SSE2-NEXT:  .LBB83_10:
4421; SSE2-NEXT:    movq %rax, %rcx
4422; SSE2-NEXT:    shrq %rcx
4423; SSE2-NEXT:    andl $1, %eax
4424; SSE2-NEXT:    orq %rcx, %rax
4425; SSE2-NEXT:    xorps %xmm2, %xmm2
4426; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
4427; SSE2-NEXT:    addss %xmm2, %xmm2
4428; SSE2-NEXT:  .LBB83_12:
4429; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4430; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4431; SSE2-NEXT:    retq
4432;
4433; SSE41-LABEL: uitofp_load_4i64_to_4f32:
4434; SSE41:       # %bb.0:
4435; SSE41-NEXT:    movdqa (%rdi), %xmm1
4436; SSE41-NEXT:    movdqa 16(%rdi), %xmm2
4437; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [1,1]
4438; SSE41-NEXT:    movdqa %xmm1, %xmm0
4439; SSE41-NEXT:    pand %xmm4, %xmm0
4440; SSE41-NEXT:    movdqa %xmm1, %xmm3
4441; SSE41-NEXT:    psrlq $1, %xmm3
4442; SSE41-NEXT:    por %xmm0, %xmm3
4443; SSE41-NEXT:    movdqa %xmm1, %xmm5
4444; SSE41-NEXT:    movdqa %xmm1, %xmm0
4445; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm5
4446; SSE41-NEXT:    pextrq $1, %xmm5, %rax
4447; SSE41-NEXT:    xorps %xmm0, %xmm0
4448; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
4449; SSE41-NEXT:    movq %xmm5, %rax
4450; SSE41-NEXT:    xorps %xmm3, %xmm3
4451; SSE41-NEXT:    cvtsi2ss %rax, %xmm3
4452; SSE41-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[2,3]
4453; SSE41-NEXT:    pand %xmm2, %xmm4
4454; SSE41-NEXT:    movdqa %xmm2, %xmm5
4455; SSE41-NEXT:    psrlq $1, %xmm5
4456; SSE41-NEXT:    por %xmm4, %xmm5
4457; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
4458; SSE41-NEXT:    movaps %xmm2, %xmm0
4459; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm2
4460; SSE41-NEXT:    movq %xmm2, %rax
4461; SSE41-NEXT:    xorps %xmm0, %xmm0
4462; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
4463; SSE41-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0],xmm3[3]
4464; SSE41-NEXT:    pextrq $1, %xmm2, %rax
4465; SSE41-NEXT:    xorps %xmm0, %xmm0
4466; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
4467; SSE41-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm0[0]
4468; SSE41-NEXT:    movaps %xmm3, %xmm2
4469; SSE41-NEXT:    addps %xmm3, %xmm2
4470; SSE41-NEXT:    movaps %xmm1, %xmm0
4471; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm3
4472; SSE41-NEXT:    movaps %xmm3, %xmm0
4473; SSE41-NEXT:    retq
4474;
4475; AVX1-LABEL: uitofp_load_4i64_to_4f32:
4476; AVX1:       # %bb.0:
4477; AVX1-NEXT:    vmovapd (%rdi), %ymm0
4478; AVX1-NEXT:    vmovdqa (%rdi), %xmm1
4479; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm2
4480; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm3
4481; AVX1-NEXT:    vpsrlq $1, %xmm2, %xmm4
4482; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
4483; AVX1-NEXT:    vandpd {{.*}}(%rip), %ymm0, %ymm4
4484; AVX1-NEXT:    vorpd %ymm4, %ymm3, %ymm3
4485; AVX1-NEXT:    vblendvpd %ymm0, %ymm3, %ymm0, %ymm0
4486; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
4487; AVX1-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
4488; AVX1-NEXT:    vmovq %xmm0, %rax
4489; AVX1-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
4490; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
4491; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
4492; AVX1-NEXT:    vmovq %xmm0, %rax
4493; AVX1-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
4494; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
4495; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
4496; AVX1-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm0
4497; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0]
4498; AVX1-NEXT:    vaddps %xmm0, %xmm0, %xmm3
4499; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
4500; AVX1-NEXT:    vblendvps %xmm1, %xmm3, %xmm0, %xmm0
4501; AVX1-NEXT:    vzeroupper
4502; AVX1-NEXT:    retq
4503;
4504; AVX2-LABEL: uitofp_load_4i64_to_4f32:
4505; AVX2:       # %bb.0:
4506; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
4507; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1]
4508; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm1
4509; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm2
4510; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
4511; AVX2-NEXT:    vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
4512; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
4513; AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
4514; AVX2-NEXT:    vmovq %xmm0, %rax
4515; AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
4516; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
4517; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
4518; AVX2-NEXT:    vmovq %xmm0, %rax
4519; AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
4520; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
4521; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
4522; AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
4523; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
4524; AVX2-NEXT:    vaddps %xmm0, %xmm0, %xmm1
4525; AVX2-NEXT:    vmovdqa (%rdi), %xmm2
4526; AVX2-NEXT:    vpackssdw 16(%rdi), %xmm2, %xmm2
4527; AVX2-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
4528; AVX2-NEXT:    vzeroupper
4529; AVX2-NEXT:    retq
4530;
4531; AVX512F-LABEL: uitofp_load_4i64_to_4f32:
4532; AVX512F:       # %bb.0:
4533; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
4534; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
4535; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
4536; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm2
4537; AVX512F-NEXT:    vmovq %xmm0, %rax
4538; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm0
4539; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
4540; AVX512F-NEXT:    vmovq %xmm1, %rax
4541; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm2
4542; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
4543; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
4544; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm1
4545; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
4546; AVX512F-NEXT:    retq
4547;
4548; AVX512VL-LABEL: uitofp_load_4i64_to_4f32:
4549; AVX512VL:       # %bb.0:
4550; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
4551; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
4552; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
4553; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm2
4554; AVX512VL-NEXT:    vmovq %xmm0, %rax
4555; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm0
4556; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
4557; AVX512VL-NEXT:    vmovq %xmm1, %rax
4558; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm2
4559; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
4560; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
4561; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm1
4562; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
4563; AVX512VL-NEXT:    retq
4564;
4565; AVX512DQ-LABEL: uitofp_load_4i64_to_4f32:
4566; AVX512DQ:       # %bb.0:
4567; AVX512DQ-NEXT:    vmovaps (%rdi), %ymm0
4568; AVX512DQ-NEXT:    vcvtuqq2ps %zmm0, %ymm0
4569; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
4570; AVX512DQ-NEXT:    vzeroupper
4571; AVX512DQ-NEXT:    retq
4572;
4573; AVX512VLDQ-LABEL: uitofp_load_4i64_to_4f32:
4574; AVX512VLDQ:       # %bb.0:
4575; AVX512VLDQ-NEXT:    vcvtuqq2psy (%rdi), %xmm0
4576; AVX512VLDQ-NEXT:    retq
4577  %ld = load <4 x i64>, <4 x i64> *%a
4578  %cvt = uitofp <4 x i64> %ld to <4 x float>
4579  ret <4 x float> %cvt
4580}
4581
4582define <4 x float> @uitofp_load_4i32_to_4f32(<4 x i32> *%a) {
4583; SSE2-LABEL: uitofp_load_4i32_to_4f32:
4584; SSE2:       # %bb.0:
4585; SSE2-NEXT:    movdqa (%rdi), %xmm0
4586; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
4587; SSE2-NEXT:    pand %xmm0, %xmm1
4588; SSE2-NEXT:    por {{.*}}(%rip), %xmm1
4589; SSE2-NEXT:    psrld $16, %xmm0
4590; SSE2-NEXT:    por {{.*}}(%rip), %xmm0
4591; SSE2-NEXT:    subps {{.*}}(%rip), %xmm0
4592; SSE2-NEXT:    addps %xmm1, %xmm0
4593; SSE2-NEXT:    retq
4594;
4595; SSE41-LABEL: uitofp_load_4i32_to_4f32:
4596; SSE41:       # %bb.0:
4597; SSE41-NEXT:    movdqa (%rdi), %xmm0
4598; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200]
4599; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
4600; SSE41-NEXT:    psrld $16, %xmm0
4601; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
4602; SSE41-NEXT:    subps {{.*}}(%rip), %xmm0
4603; SSE41-NEXT:    addps %xmm1, %xmm0
4604; SSE41-NEXT:    retq
4605;
4606; AVX1-LABEL: uitofp_load_4i32_to_4f32:
4607; AVX1:       # %bb.0:
4608; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
4609; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
4610; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
4611; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
4612; AVX1-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
4613; AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
4614; AVX1-NEXT:    retq
4615;
4616; AVX2-LABEL: uitofp_load_4i32_to_4f32:
4617; AVX2:       # %bb.0:
4618; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
4619; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200]
4620; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
4621; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
4622; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928]
4623; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
4624; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
4625; AVX2-NEXT:    vsubps %xmm2, %xmm0, %xmm0
4626; AVX2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
4627; AVX2-NEXT:    retq
4628;
4629; AVX512F-LABEL: uitofp_load_4i32_to_4f32:
4630; AVX512F:       # %bb.0:
4631; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
4632; AVX512F-NEXT:    vcvtudq2ps %zmm0, %zmm0
4633; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
4634; AVX512F-NEXT:    vzeroupper
4635; AVX512F-NEXT:    retq
4636;
4637; AVX512VL-LABEL: uitofp_load_4i32_to_4f32:
4638; AVX512VL:       # %bb.0:
4639; AVX512VL-NEXT:    vcvtudq2ps (%rdi), %xmm0
4640; AVX512VL-NEXT:    retq
4641;
4642; AVX512DQ-LABEL: uitofp_load_4i32_to_4f32:
4643; AVX512DQ:       # %bb.0:
4644; AVX512DQ-NEXT:    vmovaps (%rdi), %xmm0
4645; AVX512DQ-NEXT:    vcvtudq2ps %zmm0, %zmm0
4646; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
4647; AVX512DQ-NEXT:    vzeroupper
4648; AVX512DQ-NEXT:    retq
4649;
4650; AVX512VLDQ-LABEL: uitofp_load_4i32_to_4f32:
4651; AVX512VLDQ:       # %bb.0:
4652; AVX512VLDQ-NEXT:    vcvtudq2ps (%rdi), %xmm0
4653; AVX512VLDQ-NEXT:    retq
4654  %ld = load <4 x i32>, <4 x i32> *%a
4655  %cvt = uitofp <4 x i32> %ld to <4 x float>
4656  ret <4 x float> %cvt
4657}
4658
4659define <4 x float> @uitofp_load_4i16_to_4f32(<4 x i16> *%a) {
4660; SSE2-LABEL: uitofp_load_4i16_to_4f32:
4661; SSE2:       # %bb.0:
4662; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
4663; SSE2-NEXT:    pxor %xmm1, %xmm1
4664; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4665; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
4666; SSE2-NEXT:    retq
4667;
4668; SSE41-LABEL: uitofp_load_4i16_to_4f32:
4669; SSE41:       # %bb.0:
4670; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
4671; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
4672; SSE41-NEXT:    retq
4673;
4674; AVX-LABEL: uitofp_load_4i16_to_4f32:
4675; AVX:       # %bb.0:
4676; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
4677; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
4678; AVX-NEXT:    retq
4679  %ld = load <4 x i16>, <4 x i16> *%a
4680  %cvt = uitofp <4 x i16> %ld to <4 x float>
4681  ret <4 x float> %cvt
4682}
4683
4684define <4 x float> @uitofp_load_4i8_to_4f32(<4 x i8> *%a) {
4685; SSE2-LABEL: uitofp_load_4i8_to_4f32:
4686; SSE2:       # %bb.0:
4687; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
4688; SSE2-NEXT:    pxor %xmm1, %xmm1
4689; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4690; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4691; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
4692; SSE2-NEXT:    retq
4693;
4694; SSE41-LABEL: uitofp_load_4i8_to_4f32:
4695; SSE41:       # %bb.0:
4696; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
4697; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
4698; SSE41-NEXT:    retq
4699;
4700; AVX-LABEL: uitofp_load_4i8_to_4f32:
4701; AVX:       # %bb.0:
4702; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
4703; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
4704; AVX-NEXT:    retq
4705  %ld = load <4 x i8>, <4 x i8> *%a
4706  %cvt = uitofp <4 x i8> %ld to <4 x float>
4707  ret <4 x float> %cvt
4708}
4709
4710define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
4711; SSE2-LABEL: uitofp_load_8i64_to_8f32:
4712; SSE2:       # %bb.0:
4713; SSE2-NEXT:    movdqa 16(%rdi), %xmm0
4714; SSE2-NEXT:    movq %xmm0, %rax
4715; SSE2-NEXT:    testq %rax, %rax
4716; SSE2-NEXT:    js .LBB87_1
4717; SSE2-NEXT:  # %bb.2:
4718; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
4719; SSE2-NEXT:    jmp .LBB87_3
4720; SSE2-NEXT:  .LBB87_1:
4721; SSE2-NEXT:    movq %rax, %rcx
4722; SSE2-NEXT:    shrq %rcx
4723; SSE2-NEXT:    andl $1, %eax
4724; SSE2-NEXT:    orq %rcx, %rax
4725; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
4726; SSE2-NEXT:    addss %xmm2, %xmm2
4727; SSE2-NEXT:  .LBB87_3:
4728; SSE2-NEXT:    movdqa (%rdi), %xmm3
4729; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
4730; SSE2-NEXT:    movq %xmm0, %rax
4731; SSE2-NEXT:    testq %rax, %rax
4732; SSE2-NEXT:    js .LBB87_4
4733; SSE2-NEXT:  # %bb.5:
4734; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
4735; SSE2-NEXT:    jmp .LBB87_6
4736; SSE2-NEXT:  .LBB87_4:
4737; SSE2-NEXT:    movq %rax, %rcx
4738; SSE2-NEXT:    shrq %rcx
4739; SSE2-NEXT:    andl $1, %eax
4740; SSE2-NEXT:    orq %rcx, %rax
4741; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
4742; SSE2-NEXT:    addss %xmm1, %xmm1
4743; SSE2-NEXT:  .LBB87_6:
4744; SSE2-NEXT:    movq %xmm3, %rax
4745; SSE2-NEXT:    testq %rax, %rax
4746; SSE2-NEXT:    js .LBB87_7
4747; SSE2-NEXT:  # %bb.8:
4748; SSE2-NEXT:    xorps %xmm0, %xmm0
4749; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
4750; SSE2-NEXT:    jmp .LBB87_9
4751; SSE2-NEXT:  .LBB87_7:
4752; SSE2-NEXT:    movq %rax, %rcx
4753; SSE2-NEXT:    shrq %rcx
4754; SSE2-NEXT:    andl $1, %eax
4755; SSE2-NEXT:    orq %rcx, %rax
4756; SSE2-NEXT:    xorps %xmm0, %xmm0
4757; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
4758; SSE2-NEXT:    addss %xmm0, %xmm0
4759; SSE2-NEXT:  .LBB87_9:
4760; SSE2-NEXT:    movdqa 48(%rdi), %xmm6
4761; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
4762; SSE2-NEXT:    movq %xmm3, %rax
4763; SSE2-NEXT:    testq %rax, %rax
4764; SSE2-NEXT:    js .LBB87_10
4765; SSE2-NEXT:  # %bb.11:
4766; SSE2-NEXT:    cvtsi2ss %rax, %xmm4
4767; SSE2-NEXT:    jmp .LBB87_12
4768; SSE2-NEXT:  .LBB87_10:
4769; SSE2-NEXT:    movq %rax, %rcx
4770; SSE2-NEXT:    shrq %rcx
4771; SSE2-NEXT:    andl $1, %eax
4772; SSE2-NEXT:    orq %rcx, %rax
4773; SSE2-NEXT:    cvtsi2ss %rax, %xmm4
4774; SSE2-NEXT:    addss %xmm4, %xmm4
4775; SSE2-NEXT:  .LBB87_12:
4776; SSE2-NEXT:    movq %xmm6, %rax
4777; SSE2-NEXT:    testq %rax, %rax
4778; SSE2-NEXT:    js .LBB87_13
4779; SSE2-NEXT:  # %bb.14:
4780; SSE2-NEXT:    xorps %xmm3, %xmm3
4781; SSE2-NEXT:    cvtsi2ss %rax, %xmm3
4782; SSE2-NEXT:    jmp .LBB87_15
4783; SSE2-NEXT:  .LBB87_13:
4784; SSE2-NEXT:    movq %rax, %rcx
4785; SSE2-NEXT:    shrq %rcx
4786; SSE2-NEXT:    andl $1, %eax
4787; SSE2-NEXT:    orq %rcx, %rax
4788; SSE2-NEXT:    xorps %xmm3, %xmm3
4789; SSE2-NEXT:    cvtsi2ss %rax, %xmm3
4790; SSE2-NEXT:    addss %xmm3, %xmm3
4791; SSE2-NEXT:  .LBB87_15:
4792; SSE2-NEXT:    movdqa 32(%rdi), %xmm5
4793; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
4794; SSE2-NEXT:    movq %xmm6, %rax
4795; SSE2-NEXT:    testq %rax, %rax
4796; SSE2-NEXT:    js .LBB87_16
4797; SSE2-NEXT:  # %bb.17:
4798; SSE2-NEXT:    xorps %xmm6, %xmm6
4799; SSE2-NEXT:    cvtsi2ss %rax, %xmm6
4800; SSE2-NEXT:    jmp .LBB87_18
4801; SSE2-NEXT:  .LBB87_16:
4802; SSE2-NEXT:    movq %rax, %rcx
4803; SSE2-NEXT:    shrq %rcx
4804; SSE2-NEXT:    andl $1, %eax
4805; SSE2-NEXT:    orq %rcx, %rax
4806; SSE2-NEXT:    xorps %xmm6, %xmm6
4807; SSE2-NEXT:    cvtsi2ss %rax, %xmm6
4808; SSE2-NEXT:    addss %xmm6, %xmm6
4809; SSE2-NEXT:  .LBB87_18:
4810; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4811; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
4812; SSE2-NEXT:    movq %xmm5, %rax
4813; SSE2-NEXT:    testq %rax, %rax
4814; SSE2-NEXT:    js .LBB87_19
4815; SSE2-NEXT:  # %bb.20:
4816; SSE2-NEXT:    xorps %xmm1, %xmm1
4817; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
4818; SSE2-NEXT:    jmp .LBB87_21
4819; SSE2-NEXT:  .LBB87_19:
4820; SSE2-NEXT:    movq %rax, %rcx
4821; SSE2-NEXT:    shrq %rcx
4822; SSE2-NEXT:    andl $1, %eax
4823; SSE2-NEXT:    orq %rcx, %rax
4824; SSE2-NEXT:    xorps %xmm1, %xmm1
4825; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
4826; SSE2-NEXT:    addss %xmm1, %xmm1
4827; SSE2-NEXT:  .LBB87_21:
4828; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
4829; SSE2-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
4830; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
4831; SSE2-NEXT:    movq %xmm2, %rax
4832; SSE2-NEXT:    testq %rax, %rax
4833; SSE2-NEXT:    js .LBB87_22
4834; SSE2-NEXT:  # %bb.23:
4835; SSE2-NEXT:    xorps %xmm2, %xmm2
4836; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
4837; SSE2-NEXT:    jmp .LBB87_24
4838; SSE2-NEXT:  .LBB87_22:
4839; SSE2-NEXT:    movq %rax, %rcx
4840; SSE2-NEXT:    shrq %rcx
4841; SSE2-NEXT:    andl $1, %eax
4842; SSE2-NEXT:    orq %rcx, %rax
4843; SSE2-NEXT:    xorps %xmm2, %xmm2
4844; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
4845; SSE2-NEXT:    addss %xmm2, %xmm2
4846; SSE2-NEXT:  .LBB87_24:
4847; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4848; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
4849; SSE2-NEXT:    retq
4850;
4851; SSE41-LABEL: uitofp_load_8i64_to_8f32:
4852; SSE41:       # %bb.0:
4853; SSE41-NEXT:    movdqa (%rdi), %xmm4
4854; SSE41-NEXT:    movdqa 16(%rdi), %xmm5
4855; SSE41-NEXT:    movdqa 32(%rdi), %xmm6
4856; SSE41-NEXT:    movdqa 48(%rdi), %xmm2
4857; SSE41-NEXT:    movdqa {{.*#+}} xmm7 = [1,1]
4858; SSE41-NEXT:    movdqa %xmm4, %xmm0
4859; SSE41-NEXT:    pand %xmm7, %xmm0
4860; SSE41-NEXT:    movdqa %xmm4, %xmm1
4861; SSE41-NEXT:    psrlq $1, %xmm1
4862; SSE41-NEXT:    por %xmm0, %xmm1
4863; SSE41-NEXT:    movdqa %xmm4, %xmm3
4864; SSE41-NEXT:    movdqa %xmm4, %xmm0
4865; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
4866; SSE41-NEXT:    pextrq $1, %xmm3, %rax
4867; SSE41-NEXT:    xorps %xmm0, %xmm0
4868; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
4869; SSE41-NEXT:    movq %xmm3, %rax
4870; SSE41-NEXT:    xorps %xmm3, %xmm3
4871; SSE41-NEXT:    cvtsi2ss %rax, %xmm3
4872; SSE41-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[2,3]
4873; SSE41-NEXT:    movdqa %xmm5, %xmm0
4874; SSE41-NEXT:    pand %xmm7, %xmm0
4875; SSE41-NEXT:    movdqa %xmm5, %xmm1
4876; SSE41-NEXT:    psrlq $1, %xmm1
4877; SSE41-NEXT:    por %xmm0, %xmm1
4878; SSE41-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,3],xmm5[1,3]
4879; SSE41-NEXT:    movaps %xmm5, %xmm0
4880; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
4881; SSE41-NEXT:    movq %xmm5, %rax
4882; SSE41-NEXT:    xorps %xmm0, %xmm0
4883; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
4884; SSE41-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0],xmm3[3]
4885; SSE41-NEXT:    pextrq $1, %xmm5, %rax
4886; SSE41-NEXT:    xorps %xmm0, %xmm0
4887; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
4888; SSE41-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm0[0]
4889; SSE41-NEXT:    movaps %xmm3, %xmm1
4890; SSE41-NEXT:    addps %xmm3, %xmm1
4891; SSE41-NEXT:    movaps %xmm4, %xmm0
4892; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm3
4893; SSE41-NEXT:    movdqa %xmm6, %xmm0
4894; SSE41-NEXT:    pand %xmm7, %xmm0
4895; SSE41-NEXT:    movdqa %xmm6, %xmm1
4896; SSE41-NEXT:    psrlq $1, %xmm1
4897; SSE41-NEXT:    por %xmm0, %xmm1
4898; SSE41-NEXT:    movdqa %xmm6, %xmm4
4899; SSE41-NEXT:    movdqa %xmm6, %xmm0
4900; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm4
4901; SSE41-NEXT:    pextrq $1, %xmm4, %rax
4902; SSE41-NEXT:    xorps %xmm0, %xmm0
4903; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
4904; SSE41-NEXT:    movq %xmm4, %rax
4905; SSE41-NEXT:    xorps %xmm1, %xmm1
4906; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
4907; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3]
4908; SSE41-NEXT:    pand %xmm2, %xmm7
4909; SSE41-NEXT:    movdqa %xmm2, %xmm4
4910; SSE41-NEXT:    psrlq $1, %xmm4
4911; SSE41-NEXT:    por %xmm7, %xmm4
4912; SSE41-NEXT:    shufps {{.*#+}} xmm6 = xmm6[1,3],xmm2[1,3]
4913; SSE41-NEXT:    movaps %xmm2, %xmm0
4914; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
4915; SSE41-NEXT:    movq %xmm2, %rax
4916; SSE41-NEXT:    xorps %xmm0, %xmm0
4917; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
4918; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3]
4919; SSE41-NEXT:    pextrq $1, %xmm2, %rax
4920; SSE41-NEXT:    xorps %xmm0, %xmm0
4921; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
4922; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
4923; SSE41-NEXT:    movaps %xmm1, %xmm2
4924; SSE41-NEXT:    addps %xmm1, %xmm2
4925; SSE41-NEXT:    movaps %xmm6, %xmm0
4926; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm1
4927; SSE41-NEXT:    movaps %xmm3, %xmm0
4928; SSE41-NEXT:    retq
4929;
4930; AVX1-LABEL: uitofp_load_8i64_to_8f32:
4931; AVX1:       # %bb.0:
4932; AVX1-NEXT:    vmovapd (%rdi), %ymm2
4933; AVX1-NEXT:    vmovapd 32(%rdi), %ymm3
4934; AVX1-NEXT:    vmovapd {{.*#+}} ymm8 = [1,1,1,1]
4935; AVX1-NEXT:    vandpd %ymm3, %ymm8, %ymm5
4936; AVX1-NEXT:    vmovdqa (%rdi), %xmm9
4937; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
4938; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm6
4939; AVX1-NEXT:    vpsrlq $1, %xmm6, %xmm7
4940; AVX1-NEXT:    vmovdqa 48(%rdi), %xmm4
4941; AVX1-NEXT:    vpsrlq $1, %xmm4, %xmm0
4942; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm7, %ymm0
4943; AVX1-NEXT:    vorpd %ymm5, %ymm0, %ymm0
4944; AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm3, %ymm0
4945; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
4946; AVX1-NEXT:    vcvtsi2ss %rax, %xmm10, %xmm3
4947; AVX1-NEXT:    vmovq %xmm0, %rax
4948; AVX1-NEXT:    vcvtsi2ss %rax, %xmm10, %xmm5
4949; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[2,3]
4950; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
4951; AVX1-NEXT:    vmovq %xmm0, %rax
4952; AVX1-NEXT:    vcvtsi2ss %rax, %xmm10, %xmm5
4953; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0],xmm3[3]
4954; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
4955; AVX1-NEXT:    vcvtsi2ss %rax, %xmm10, %xmm0
4956; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0]
4957; AVX1-NEXT:    vaddps %xmm0, %xmm0, %xmm3
4958; AVX1-NEXT:    vpackssdw %xmm4, %xmm6, %xmm4
4959; AVX1-NEXT:    vblendvps %xmm4, %xmm3, %xmm0, %xmm0
4960; AVX1-NEXT:    vandpd %ymm2, %ymm8, %ymm3
4961; AVX1-NEXT:    vpsrlq $1, %xmm9, %xmm4
4962; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm5
4963; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
4964; AVX1-NEXT:    vorpd %ymm3, %ymm4, %ymm3
4965; AVX1-NEXT:    vblendvpd %ymm2, %ymm3, %ymm2, %ymm2
4966; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
4967; AVX1-NEXT:    vcvtsi2ss %rax, %xmm10, %xmm3
4968; AVX1-NEXT:    vmovq %xmm2, %rax
4969; AVX1-NEXT:    vcvtsi2ss %rax, %xmm10, %xmm4
4970; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
4971; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
4972; AVX1-NEXT:    vmovq %xmm2, %rax
4973; AVX1-NEXT:    vcvtsi2ss %rax, %xmm10, %xmm4
4974; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
4975; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
4976; AVX1-NEXT:    vcvtsi2ss %rax, %xmm10, %xmm2
4977; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0]
4978; AVX1-NEXT:    vaddps %xmm2, %xmm2, %xmm3
4979; AVX1-NEXT:    vpackssdw %xmm1, %xmm9, %xmm1
4980; AVX1-NEXT:    vblendvps %xmm1, %xmm3, %xmm2, %xmm1
4981; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
4982; AVX1-NEXT:    retq
4983;
4984; AVX2-LABEL: uitofp_load_8i64_to_8f32:
4985; AVX2:       # %bb.0:
4986; AVX2-NEXT:    vmovaps (%rdi), %ymm0
4987; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
4988; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
4989; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
4990; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm4
4991; AVX2-NEXT:    vpor %ymm3, %ymm4, %ymm3
4992; AVX2-NEXT:    vblendvpd %ymm1, %ymm3, %ymm1, %ymm1
4993; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
4994; AVX2-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
4995; AVX2-NEXT:    vmovq %xmm1, %rax
4996; AVX2-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
4997; AVX2-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
4998; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
4999; AVX2-NEXT:    vmovq %xmm1, %rax
5000; AVX2-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
5001; AVX2-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
5002; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
5003; AVX2-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm1
5004; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
5005; AVX2-NEXT:    vaddps %xmm1, %xmm1, %xmm3
5006; AVX2-NEXT:    vmovdqa (%rdi), %xmm4
5007; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm5
5008; AVX2-NEXT:    vpackssdw 48(%rdi), %xmm5, %xmm5
5009; AVX2-NEXT:    vblendvps %xmm5, %xmm3, %xmm1, %xmm1
5010; AVX2-NEXT:    vandps %ymm2, %ymm0, %ymm2
5011; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm3
5012; AVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
5013; AVX2-NEXT:    vblendvpd %ymm0, %ymm2, %ymm0, %ymm0
5014; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
5015; AVX2-NEXT:    vcvtsi2ss %rax, %xmm6, %xmm2
5016; AVX2-NEXT:    vmovq %xmm0, %rax
5017; AVX2-NEXT:    vcvtsi2ss %rax, %xmm6, %xmm3
5018; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
5019; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
5020; AVX2-NEXT:    vmovq %xmm0, %rax
5021; AVX2-NEXT:    vcvtsi2ss %rax, %xmm6, %xmm3
5022; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
5023; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
5024; AVX2-NEXT:    vcvtsi2ss %rax, %xmm6, %xmm0
5025; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
5026; AVX2-NEXT:    vaddps %xmm0, %xmm0, %xmm2
5027; AVX2-NEXT:    vpackssdw 16(%rdi), %xmm4, %xmm3
5028; AVX2-NEXT:    vblendvps %xmm3, %xmm2, %xmm0, %xmm0
5029; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
5030; AVX2-NEXT:    retq
5031;
5032; AVX512F-LABEL: uitofp_load_8i64_to_8f32:
5033; AVX512F:       # %bb.0:
5034; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
5035; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
5036; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
5037; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
5038; AVX512F-NEXT:    vpextrq $1, %xmm2, %rax
5039; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm4, %xmm4
5040; AVX512F-NEXT:    vmovq %xmm2, %rax
5041; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm2
5042; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
5043; AVX512F-NEXT:    vmovq %xmm3, %rax
5044; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm4
5045; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
5046; AVX512F-NEXT:    vpextrq $1, %xmm3, %rax
5047; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
5048; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
5049; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
5050; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
5051; AVX512F-NEXT:    vmovq %xmm0, %rax
5052; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm0
5053; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
5054; AVX512F-NEXT:    vmovq %xmm1, %rax
5055; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
5056; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
5057; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
5058; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm1
5059; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
5060; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
5061; AVX512F-NEXT:    retq
5062;
5063; AVX512VL-LABEL: uitofp_load_8i64_to_8f32:
5064; AVX512VL:       # %bb.0:
5065; AVX512VL-NEXT:    vmovaps (%rdi), %xmm0
5066; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
5067; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
5068; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
5069; AVX512VL-NEXT:    vpextrq $1, %xmm2, %rax
5070; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm4, %xmm4
5071; AVX512VL-NEXT:    vmovq %xmm2, %rax
5072; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm2
5073; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
5074; AVX512VL-NEXT:    vmovq %xmm3, %rax
5075; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm4
5076; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
5077; AVX512VL-NEXT:    vpextrq $1, %xmm3, %rax
5078; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
5079; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
5080; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
5081; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
5082; AVX512VL-NEXT:    vmovq %xmm0, %rax
5083; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm0
5084; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
5085; AVX512VL-NEXT:    vmovq %xmm1, %rax
5086; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
5087; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
5088; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
5089; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm1
5090; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
5091; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
5092; AVX512VL-NEXT:    retq
5093;
5094; AVX512DQ-LABEL: uitofp_load_8i64_to_8f32:
5095; AVX512DQ:       # %bb.0:
5096; AVX512DQ-NEXT:    vcvtuqq2ps (%rdi), %ymm0
5097; AVX512DQ-NEXT:    retq
5098;
5099; AVX512VLDQ-LABEL: uitofp_load_8i64_to_8f32:
5100; AVX512VLDQ:       # %bb.0:
5101; AVX512VLDQ-NEXT:    vcvtuqq2ps (%rdi), %ymm0
5102; AVX512VLDQ-NEXT:    retq
5103  %ld = load <8 x i64>, <8 x i64> *%a
5104  %cvt = uitofp <8 x i64> %ld to <8 x float>
5105  ret <8 x float> %cvt
5106}
5107
5108define <8 x float> @uitofp_load_8i32_to_8f32(<8 x i32> *%a) {
5109; SSE2-LABEL: uitofp_load_8i32_to_8f32:
5110; SSE2:       # %bb.0:
5111; SSE2-NEXT:    movdqa (%rdi), %xmm0
5112; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
5113; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
5114; SSE2-NEXT:    movdqa %xmm0, %xmm3
5115; SSE2-NEXT:    pand %xmm2, %xmm3
5116; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200]
5117; SSE2-NEXT:    por %xmm4, %xmm3
5118; SSE2-NEXT:    psrld $16, %xmm0
5119; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928]
5120; SSE2-NEXT:    por %xmm5, %xmm0
5121; SSE2-NEXT:    movaps {{.*#+}} xmm6 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
5122; SSE2-NEXT:    subps %xmm6, %xmm0
5123; SSE2-NEXT:    addps %xmm3, %xmm0
5124; SSE2-NEXT:    pand %xmm1, %xmm2
5125; SSE2-NEXT:    por %xmm4, %xmm2
5126; SSE2-NEXT:    psrld $16, %xmm1
5127; SSE2-NEXT:    por %xmm5, %xmm1
5128; SSE2-NEXT:    subps %xmm6, %xmm1
5129; SSE2-NEXT:    addps %xmm2, %xmm1
5130; SSE2-NEXT:    retq
5131;
5132; SSE41-LABEL: uitofp_load_8i32_to_8f32:
5133; SSE41:       # %bb.0:
5134; SSE41-NEXT:    movdqa (%rdi), %xmm0
5135; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
5136; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200]
5137; SSE41-NEXT:    movdqa %xmm0, %xmm3
5138; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
5139; SSE41-NEXT:    psrld $16, %xmm0
5140; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [1392508928,1392508928,1392508928,1392508928]
5141; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
5142; SSE41-NEXT:    movaps {{.*#+}} xmm5 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
5143; SSE41-NEXT:    subps %xmm5, %xmm0
5144; SSE41-NEXT:    addps %xmm3, %xmm0
5145; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
5146; SSE41-NEXT:    psrld $16, %xmm1
5147; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
5148; SSE41-NEXT:    subps %xmm5, %xmm1
5149; SSE41-NEXT:    addps %xmm2, %xmm1
5150; SSE41-NEXT:    retq
5151;
5152; AVX1-LABEL: uitofp_load_8i32_to_8f32:
5153; AVX1:       # %bb.0:
5154; AVX1-NEXT:    vmovaps (%rdi), %ymm0
5155; AVX1-NEXT:    vmovdqa (%rdi), %xmm1
5156; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm2
5157; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
5158; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
5159; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
5160; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
5161; AVX1-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
5162; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
5163; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
5164; AVX1-NEXT:    vaddps %ymm0, %ymm1, %ymm0
5165; AVX1-NEXT:    retq
5166;
5167; AVX2-LABEL: uitofp_load_8i32_to_8f32:
5168; AVX2:       # %bb.0:
5169; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
5170; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
5171; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
5172; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
5173; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
5174; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
5175; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
5176; AVX2-NEXT:    vsubps %ymm2, %ymm0, %ymm0
5177; AVX2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
5178; AVX2-NEXT:    retq
5179;
5180; AVX512F-LABEL: uitofp_load_8i32_to_8f32:
5181; AVX512F:       # %bb.0:
5182; AVX512F-NEXT:    vmovaps (%rdi), %ymm0
5183; AVX512F-NEXT:    vcvtudq2ps %zmm0, %zmm0
5184; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
5185; AVX512F-NEXT:    retq
5186;
5187; AVX512VL-LABEL: uitofp_load_8i32_to_8f32:
5188; AVX512VL:       # %bb.0:
5189; AVX512VL-NEXT:    vcvtudq2ps (%rdi), %ymm0
5190; AVX512VL-NEXT:    retq
5191;
5192; AVX512DQ-LABEL: uitofp_load_8i32_to_8f32:
5193; AVX512DQ:       # %bb.0:
5194; AVX512DQ-NEXT:    vmovaps (%rdi), %ymm0
5195; AVX512DQ-NEXT:    vcvtudq2ps %zmm0, %zmm0
5196; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
5197; AVX512DQ-NEXT:    retq
5198;
5199; AVX512VLDQ-LABEL: uitofp_load_8i32_to_8f32:
5200; AVX512VLDQ:       # %bb.0:
5201; AVX512VLDQ-NEXT:    vcvtudq2ps (%rdi), %ymm0
5202; AVX512VLDQ-NEXT:    retq
5203  %ld = load <8 x i32>, <8 x i32> *%a
5204  %cvt = uitofp <8 x i32> %ld to <8 x float>
5205  ret <8 x float> %cvt
5206}
5207
5208define <8 x float> @uitofp_load_8i16_to_8f32(<8 x i16> *%a) {
5209; SSE2-LABEL: uitofp_load_8i16_to_8f32:
5210; SSE2:       # %bb.0:
5211; SSE2-NEXT:    movdqa (%rdi), %xmm1
5212; SSE2-NEXT:    pxor %xmm2, %xmm2
5213; SSE2-NEXT:    movdqa %xmm1, %xmm0
5214; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
5215; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
5216; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
5217; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm1
5218; SSE2-NEXT:    retq
5219;
5220; SSE41-LABEL: uitofp_load_8i16_to_8f32:
5221; SSE41:       # %bb.0:
5222; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
5223; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
5224; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
5225; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm1
5226; SSE41-NEXT:    retq
5227;
5228; AVX1-LABEL: uitofp_load_8i16_to_8f32:
5229; AVX1:       # %bb.0:
5230; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
5231; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
5232; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
5233; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
5234; AVX1-NEXT:    retq
5235;
5236; AVX2-LABEL: uitofp_load_8i16_to_8f32:
5237; AVX2:       # %bb.0:
5238; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
5239; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
5240; AVX2-NEXT:    retq
5241;
5242; AVX512-LABEL: uitofp_load_8i16_to_8f32:
5243; AVX512:       # %bb.0:
5244; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
5245; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
5246; AVX512-NEXT:    retq
5247  %ld = load <8 x i16>, <8 x i16> *%a
5248  %cvt = uitofp <8 x i16> %ld to <8 x float>
5249  ret <8 x float> %cvt
5250}
5251
5252define <8 x float> @uitofp_load_8i8_to_8f32(<8 x i8> *%a) {
5253; SSE2-LABEL: uitofp_load_8i8_to_8f32:
5254; SSE2:       # %bb.0:
5255; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
5256; SSE2-NEXT:    pxor %xmm2, %xmm2
5257; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
5258; SSE2-NEXT:    movdqa %xmm1, %xmm0
5259; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
5260; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
5261; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
5262; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm1
5263; SSE2-NEXT:    retq
5264;
5265; SSE41-LABEL: uitofp_load_8i8_to_8f32:
5266; SSE41:       # %bb.0:
5267; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
5268; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
5269; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
5270; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm1
5271; SSE41-NEXT:    retq
5272;
5273; AVX1-LABEL: uitofp_load_8i8_to_8f32:
5274; AVX1:       # %bb.0:
5275; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
5276; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
5277; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
5278; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
5279; AVX1-NEXT:    retq
5280;
5281; AVX2-LABEL: uitofp_load_8i8_to_8f32:
5282; AVX2:       # %bb.0:
5283; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
5284; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
5285; AVX2-NEXT:    retq
5286;
5287; AVX512-LABEL: uitofp_load_8i8_to_8f32:
5288; AVX512:       # %bb.0:
5289; AVX512-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
5290; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
5291; AVX512-NEXT:    retq
5292  %ld = load <8 x i8>, <8 x i8> *%a
5293  %cvt = uitofp <8 x i8> %ld to <8 x float>
5294  ret <8 x float> %cvt
5295}
5296
5297;
5298; Aggregates
5299;
5300
5301%Arguments = type <{ <8 x i8>, <8 x i16>, <8 x float>* }>
5302define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
5303; SSE2-LABEL: aggregate_sitofp_8i16_to_8f32:
5304; SSE2:       # %bb.0:
5305; SSE2-NEXT:    movq 24(%rdi), %rax
5306; SSE2-NEXT:    movdqu 8(%rdi), %xmm0
5307; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
5308; SSE2-NEXT:    psrad $16, %xmm1
5309; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm1
5310; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
5311; SSE2-NEXT:    psrad $16, %xmm0
5312; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
5313; SSE2-NEXT:    movaps %xmm0, 16(%rax)
5314; SSE2-NEXT:    movaps %xmm1, (%rax)
5315; SSE2-NEXT:    retq
5316;
5317; SSE41-LABEL: aggregate_sitofp_8i16_to_8f32:
5318; SSE41:       # %bb.0:
5319; SSE41-NEXT:    movq 24(%rdi), %rax
5320; SSE41-NEXT:    pmovsxwd 16(%rdi), %xmm0
5321; SSE41-NEXT:    pmovsxwd 8(%rdi), %xmm1
5322; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm1
5323; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
5324; SSE41-NEXT:    movaps %xmm0, 16(%rax)
5325; SSE41-NEXT:    movaps %xmm1, (%rax)
5326; SSE41-NEXT:    retq
5327;
5328; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32:
5329; AVX1:       # %bb.0:
5330; AVX1-NEXT:    movq 24(%rdi), %rax
5331; AVX1-NEXT:    vpmovsxwd 16(%rdi), %xmm0
5332; AVX1-NEXT:    vpmovsxwd 8(%rdi), %xmm1
5333; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
5334; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
5335; AVX1-NEXT:    vmovaps %ymm0, (%rax)
5336; AVX1-NEXT:    vzeroupper
5337; AVX1-NEXT:    retq
5338;
5339; AVX2-LABEL: aggregate_sitofp_8i16_to_8f32:
5340; AVX2:       # %bb.0:
5341; AVX2-NEXT:    movq 24(%rdi), %rax
5342; AVX2-NEXT:    vpmovsxwd 8(%rdi), %ymm0
5343; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
5344; AVX2-NEXT:    vmovaps %ymm0, (%rax)
5345; AVX2-NEXT:    vzeroupper
5346; AVX2-NEXT:    retq
5347;
5348; AVX512-LABEL: aggregate_sitofp_8i16_to_8f32:
5349; AVX512:       # %bb.0:
5350; AVX512-NEXT:    movq 24(%rdi), %rax
5351; AVX512-NEXT:    vpmovsxwd 8(%rdi), %ymm0
5352; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
5353; AVX512-NEXT:    vmovaps %ymm0, (%rax)
5354; AVX512-NEXT:    vzeroupper
5355; AVX512-NEXT:    retq
5356 %1 = load %Arguments, %Arguments* %a0, align 1
5357 %2 = extractvalue %Arguments %1, 1
5358 %3 = extractvalue %Arguments %1, 2
5359 %4 = sitofp <8 x i16> %2 to <8 x float>
5360 store <8 x float> %4, <8 x float>* %3, align 32
5361 ret void
5362}
5363
5364define <2 x double> @sitofp_i32_to_2f64(<2 x double> %a0, i32 %a1) nounwind {
5365; SSE-LABEL: sitofp_i32_to_2f64:
5366; SSE:       # %bb.0:
5367; SSE-NEXT:    cvtsi2sd %edi, %xmm0
5368; SSE-NEXT:    retq
5369;
5370; AVX-LABEL: sitofp_i32_to_2f64:
5371; AVX:       # %bb.0:
5372; AVX-NEXT:    vcvtsi2sd %edi, %xmm0, %xmm0
5373; AVX-NEXT:    retq
5374  %cvt = sitofp i32 %a1 to double
5375  %res = insertelement <2 x double> %a0, double %cvt, i32 0
5376  ret <2 x double> %res
5377}
5378
5379define <4 x float> @sitofp_i32_to_4f32(<4 x float> %a0, i32 %a1) nounwind {
5380; SSE-LABEL: sitofp_i32_to_4f32:
5381; SSE:       # %bb.0:
5382; SSE-NEXT:    cvtsi2ss %edi, %xmm0
5383; SSE-NEXT:    retq
5384;
5385; AVX-LABEL: sitofp_i32_to_4f32:
5386; AVX:       # %bb.0:
5387; AVX-NEXT:    vcvtsi2ss %edi, %xmm0, %xmm0
5388; AVX-NEXT:    retq
5389  %cvt = sitofp i32 %a1 to float
5390  %res = insertelement <4 x float> %a0, float %cvt, i32 0
5391  ret <4 x float> %res
5392}
5393
5394define <2 x double> @sitofp_i64_to_2f64(<2 x double> %a0, i64 %a1) nounwind {
5395; SSE-LABEL: sitofp_i64_to_2f64:
5396; SSE:       # %bb.0:
5397; SSE-NEXT:    cvtsi2sd %rdi, %xmm0
5398; SSE-NEXT:    retq
5399;
5400; AVX-LABEL: sitofp_i64_to_2f64:
5401; AVX:       # %bb.0:
5402; AVX-NEXT:    vcvtsi2sd %rdi, %xmm0, %xmm0
5403; AVX-NEXT:    retq
5404  %cvt = sitofp i64 %a1 to double
5405  %res = insertelement <2 x double> %a0, double %cvt, i32 0
5406  ret <2 x double> %res
5407}
5408
5409define <4 x float> @sitofp_i64_to_4f32(<4 x float> %a0, i64 %a1) nounwind {
5410; SSE-LABEL: sitofp_i64_to_4f32:
5411; SSE:       # %bb.0:
5412; SSE-NEXT:    cvtsi2ss %rdi, %xmm0
5413; SSE-NEXT:    retq
5414;
5415; AVX-LABEL: sitofp_i64_to_4f32:
5416; AVX:       # %bb.0:
5417; AVX-NEXT:    vcvtsi2ss %rdi, %xmm0, %xmm0
5418; AVX-NEXT:    retq
5419  %cvt = sitofp i64 %a1 to float
5420  %res = insertelement <4 x float> %a0, float %cvt, i32 0
5421  ret <4 x float> %res
5422}
5423
5424; Extract from int vector and convert to FP.
5425
5426define float @extract0_sitofp_v4i32_f32(<4 x i32> %x) nounwind {
5427; SSE-LABEL: extract0_sitofp_v4i32_f32:
5428; SSE:       # %bb.0:
5429; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
5430; SSE-NEXT:    retq
5431;
5432; AVX-LABEL: extract0_sitofp_v4i32_f32:
5433; AVX:       # %bb.0:
5434; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
5435; AVX-NEXT:    retq
5436  %e = extractelement <4 x i32> %x, i32 0
5437  %r = sitofp i32 %e to float
5438  ret float %r
5439}
5440
5441define float @extract0_sitofp_v4i32_f32i_multiuse1(<4 x i32> %x) nounwind {
5442; SSE-LABEL: extract0_sitofp_v4i32_f32i_multiuse1:
5443; SSE:       # %bb.0:
5444; SSE-NEXT:    movd %xmm0, %eax
5445; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
5446; SSE-NEXT:    incl %eax
5447; SSE-NEXT:    cvtsi2ss %eax, %xmm1
5448; SSE-NEXT:    divss %xmm1, %xmm0
5449; SSE-NEXT:    retq
5450;
5451; AVX-LABEL: extract0_sitofp_v4i32_f32i_multiuse1:
5452; AVX:       # %bb.0:
5453; AVX-NEXT:    vmovd %xmm0, %eax
5454; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
5455; AVX-NEXT:    incl %eax
5456; AVX-NEXT:    vcvtsi2ss %eax, %xmm1, %xmm1
5457; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
5458; AVX-NEXT:    retq
5459  %e = extractelement <4 x i32> %x, i32 0
5460  %f = sitofp i32 %e to float
5461  %e1 = add i32 %e, 1
5462  %f1 = sitofp i32 %e1 to float
5463  %r = fdiv float %f, %f1
5464  ret float %r
5465}
5466
5467define float @extract0_sitofp_v4i32_f32_multiuse2(<4 x i32> %x, i32* %p) nounwind {
5468; SSE-LABEL: extract0_sitofp_v4i32_f32_multiuse2:
5469; SSE:       # %bb.0:
5470; SSE-NEXT:    cvtdq2ps %xmm0, %xmm1
5471; SSE-NEXT:    movss %xmm0, (%rdi)
5472; SSE-NEXT:    movaps %xmm1, %xmm0
5473; SSE-NEXT:    retq
5474;
5475; AVX-LABEL: extract0_sitofp_v4i32_f32_multiuse2:
5476; AVX:       # %bb.0:
5477; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm1
5478; AVX-NEXT:    vmovss %xmm0, (%rdi)
5479; AVX-NEXT:    vmovaps %xmm1, %xmm0
5480; AVX-NEXT:    retq
5481  %e = extractelement <4 x i32> %x, i32 0
5482  %r = sitofp i32 %e to float
5483  store i32 %e, i32* %p
5484  ret float %r
5485}
5486
5487define double @extract0_sitofp_v4i32_f64(<4 x i32> %x) nounwind {
5488; SSE-LABEL: extract0_sitofp_v4i32_f64:
5489; SSE:       # %bb.0:
5490; SSE-NEXT:    movd %xmm0, %eax
5491; SSE-NEXT:    xorps %xmm0, %xmm0
5492; SSE-NEXT:    cvtsi2sd %eax, %xmm0
5493; SSE-NEXT:    retq
5494;
5495; AVX-LABEL: extract0_sitofp_v4i32_f64:
5496; AVX:       # %bb.0:
5497; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
5498; AVX-NEXT:    retq
5499  %e = extractelement <4 x i32> %x, i32 0
5500  %r = sitofp i32 %e to double
5501  ret double %r
5502}
5503
5504define float @extract0_uitofp_v4i32_f32(<4 x i32> %x) nounwind {
5505; SSE-LABEL: extract0_uitofp_v4i32_f32:
5506; SSE:       # %bb.0:
5507; SSE-NEXT:    movd %xmm0, %eax
5508; SSE-NEXT:    xorps %xmm0, %xmm0
5509; SSE-NEXT:    cvtsi2ss %rax, %xmm0
5510; SSE-NEXT:    retq
5511;
5512; VEX-LABEL: extract0_uitofp_v4i32_f32:
5513; VEX:       # %bb.0:
5514; VEX-NEXT:    vmovd %xmm0, %eax
5515; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm0
5516; VEX-NEXT:    retq
5517;
5518; AVX512F-LABEL: extract0_uitofp_v4i32_f32:
5519; AVX512F:       # %bb.0:
5520; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
5521; AVX512F-NEXT:    vcvtudq2ps %zmm0, %zmm0
5522; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
5523; AVX512F-NEXT:    vzeroupper
5524; AVX512F-NEXT:    retq
5525;
5526; AVX512VL-LABEL: extract0_uitofp_v4i32_f32:
5527; AVX512VL:       # %bb.0:
5528; AVX512VL-NEXT:    vcvtudq2ps %xmm0, %xmm0
5529; AVX512VL-NEXT:    retq
5530;
5531; AVX512DQ-LABEL: extract0_uitofp_v4i32_f32:
5532; AVX512DQ:       # %bb.0:
5533; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
5534; AVX512DQ-NEXT:    vcvtudq2ps %zmm0, %zmm0
5535; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
5536; AVX512DQ-NEXT:    vzeroupper
5537; AVX512DQ-NEXT:    retq
5538;
5539; AVX512VLDQ-LABEL: extract0_uitofp_v4i32_f32:
5540; AVX512VLDQ:       # %bb.0:
5541; AVX512VLDQ-NEXT:    vcvtudq2ps %xmm0, %xmm0
5542; AVX512VLDQ-NEXT:    retq
5543  %e = extractelement <4 x i32> %x, i32 0
5544  %r = uitofp i32 %e to float
5545  ret float %r
5546}
5547
5548define double @extract0_uitofp_v4i32_f64(<4 x i32> %x) nounwind {
5549; SSE-LABEL: extract0_uitofp_v4i32_f64:
5550; SSE:       # %bb.0:
5551; SSE-NEXT:    movd %xmm0, %eax
5552; SSE-NEXT:    xorps %xmm0, %xmm0
5553; SSE-NEXT:    cvtsi2sd %rax, %xmm0
5554; SSE-NEXT:    retq
5555;
5556; VEX-LABEL: extract0_uitofp_v4i32_f64:
5557; VEX:       # %bb.0:
5558; VEX-NEXT:    vmovd %xmm0, %eax
5559; VEX-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm0
5560; VEX-NEXT:    retq
5561;
5562; AVX512F-LABEL: extract0_uitofp_v4i32_f64:
5563; AVX512F:       # %bb.0:
5564; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
5565; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
5566; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
5567; AVX512F-NEXT:    vzeroupper
5568; AVX512F-NEXT:    retq
5569;
5570; AVX512VL-LABEL: extract0_uitofp_v4i32_f64:
5571; AVX512VL:       # %bb.0:
5572; AVX512VL-NEXT:    vcvtudq2pd %xmm0, %xmm0
5573; AVX512VL-NEXT:    retq
5574;
5575; AVX512DQ-LABEL: extract0_uitofp_v4i32_f64:
5576; AVX512DQ:       # %bb.0:
5577; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
5578; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
5579; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
5580; AVX512DQ-NEXT:    vzeroupper
5581; AVX512DQ-NEXT:    retq
5582;
5583; AVX512VLDQ-LABEL: extract0_uitofp_v4i32_f64:
5584; AVX512VLDQ:       # %bb.0:
5585; AVX512VLDQ-NEXT:    vcvtudq2pd %xmm0, %xmm0
5586; AVX512VLDQ-NEXT:    retq
5587  %e = extractelement <4 x i32> %x, i32 0
5588  %r = uitofp i32 %e to double
5589  ret double %r
5590}
5591
5592; Extract non-zero element from int vector and convert to FP.
5593
5594define float @extract3_sitofp_v4i32_f32(<4 x i32> %x) nounwind {
5595; SSE-LABEL: extract3_sitofp_v4i32_f32:
5596; SSE:       # %bb.0:
5597; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
5598; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
5599; SSE-NEXT:    retq
5600;
5601; AVX-LABEL: extract3_sitofp_v4i32_f32:
5602; AVX:       # %bb.0:
5603; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
5604; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
5605; AVX-NEXT:    retq
5606  %e = extractelement <4 x i32> %x, i32 3
5607  %r = sitofp i32 %e to float
5608  ret float %r
5609}
5610
5611define double @extract3_sitofp_v4i32_f64(<4 x i32> %x) nounwind {
5612; SSE2-LABEL: extract3_sitofp_v4i32_f64:
5613; SSE2:       # %bb.0:
5614; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
5615; SSE2-NEXT:    movd %xmm0, %eax
5616; SSE2-NEXT:    xorps %xmm0, %xmm0
5617; SSE2-NEXT:    cvtsi2sd %eax, %xmm0
5618; SSE2-NEXT:    retq
5619;
5620; SSE41-LABEL: extract3_sitofp_v4i32_f64:
5621; SSE41:       # %bb.0:
5622; SSE41-NEXT:    extractps $3, %xmm0, %eax
5623; SSE41-NEXT:    xorps %xmm0, %xmm0
5624; SSE41-NEXT:    cvtsi2sd %eax, %xmm0
5625; SSE41-NEXT:    retq
5626;
5627; AVX-LABEL: extract3_sitofp_v4i32_f64:
5628; AVX:       # %bb.0:
5629; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
5630; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
5631; AVX-NEXT:    retq
5632  %e = extractelement <4 x i32> %x, i32 3
5633  %r = sitofp i32 %e to double
5634  ret double %r
5635}
5636
5637define float @extract3_uitofp_v4i32_f32(<4 x i32> %x) nounwind {
5638; SSE2-LABEL: extract3_uitofp_v4i32_f32:
5639; SSE2:       # %bb.0:
5640; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
5641; SSE2-NEXT:    movd %xmm0, %eax
5642; SSE2-NEXT:    xorps %xmm0, %xmm0
5643; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
5644; SSE2-NEXT:    retq
5645;
5646; SSE41-LABEL: extract3_uitofp_v4i32_f32:
5647; SSE41:       # %bb.0:
5648; SSE41-NEXT:    extractps $3, %xmm0, %eax
5649; SSE41-NEXT:    xorps %xmm0, %xmm0
5650; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
5651; SSE41-NEXT:    retq
5652;
5653; VEX-LABEL: extract3_uitofp_v4i32_f32:
5654; VEX:       # %bb.0:
5655; VEX-NEXT:    vextractps $3, %xmm0, %eax
5656; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm0
5657; VEX-NEXT:    retq
5658;
5659; AVX512F-LABEL: extract3_uitofp_v4i32_f32:
5660; AVX512F:       # %bb.0:
5661; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
5662; AVX512F-NEXT:    vcvtudq2ps %zmm0, %zmm0
5663; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
5664; AVX512F-NEXT:    vzeroupper
5665; AVX512F-NEXT:    retq
5666;
5667; AVX512VL-LABEL: extract3_uitofp_v4i32_f32:
5668; AVX512VL:       # %bb.0:
5669; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
5670; AVX512VL-NEXT:    vcvtudq2ps %xmm0, %xmm0
5671; AVX512VL-NEXT:    retq
5672;
5673; AVX512DQ-LABEL: extract3_uitofp_v4i32_f32:
5674; AVX512DQ:       # %bb.0:
5675; AVX512DQ-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
5676; AVX512DQ-NEXT:    vcvtudq2ps %zmm0, %zmm0
5677; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
5678; AVX512DQ-NEXT:    vzeroupper
5679; AVX512DQ-NEXT:    retq
5680;
5681; AVX512VLDQ-LABEL: extract3_uitofp_v4i32_f32:
5682; AVX512VLDQ:       # %bb.0:
5683; AVX512VLDQ-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
5684; AVX512VLDQ-NEXT:    vcvtudq2ps %xmm0, %xmm0
5685; AVX512VLDQ-NEXT:    retq
5686  %e = extractelement <4 x i32> %x, i32 3
5687  %r = uitofp i32 %e to float
5688  ret float %r
5689}
5690
5691define double @extract3_uitofp_v4i32_f64(<4 x i32> %x) nounwind {
5692; SSE2-LABEL: extract3_uitofp_v4i32_f64:
5693; SSE2:       # %bb.0:
5694; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
5695; SSE2-NEXT:    movd %xmm0, %eax
5696; SSE2-NEXT:    xorps %xmm0, %xmm0
5697; SSE2-NEXT:    cvtsi2sd %rax, %xmm0
5698; SSE2-NEXT:    retq
5699;
5700; SSE41-LABEL: extract3_uitofp_v4i32_f64:
5701; SSE41:       # %bb.0:
5702; SSE41-NEXT:    extractps $3, %xmm0, %eax
5703; SSE41-NEXT:    xorps %xmm0, %xmm0
5704; SSE41-NEXT:    cvtsi2sd %rax, %xmm0
5705; SSE41-NEXT:    retq
5706;
5707; VEX-LABEL: extract3_uitofp_v4i32_f64:
5708; VEX:       # %bb.0:
5709; VEX-NEXT:    vextractps $3, %xmm0, %eax
5710; VEX-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm0
5711; VEX-NEXT:    retq
5712;
5713; AVX512F-LABEL: extract3_uitofp_v4i32_f64:
5714; AVX512F:       # %bb.0:
5715; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
5716; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
5717; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
5718; AVX512F-NEXT:    vzeroupper
5719; AVX512F-NEXT:    retq
5720;
5721; AVX512VL-LABEL: extract3_uitofp_v4i32_f64:
5722; AVX512VL:       # %bb.0:
5723; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
5724; AVX512VL-NEXT:    vcvtudq2pd %xmm0, %xmm0
5725; AVX512VL-NEXT:    retq
5726;
5727; AVX512DQ-LABEL: extract3_uitofp_v4i32_f64:
5728; AVX512DQ:       # %bb.0:
5729; AVX512DQ-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
5730; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
5731; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
5732; AVX512DQ-NEXT:    vzeroupper
5733; AVX512DQ-NEXT:    retq
5734;
5735; AVX512VLDQ-LABEL: extract3_uitofp_v4i32_f64:
5736; AVX512VLDQ:       # %bb.0:
5737; AVX512VLDQ-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
5738; AVX512VLDQ-NEXT:    vcvtudq2pd %xmm0, %xmm0
5739; AVX512VLDQ-NEXT:    retq
5740  %e = extractelement <4 x i32> %x, i32 3
5741  %r = uitofp i32 %e to double
5742  ret double %r
5743}
5744
5745define void @PR43609(double* nocapture %x, <2 x i64> %y) #0 {
5746; SSE2-LABEL: PR43609:
5747; SSE2:       # %bb.0:
5748; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2,2]
5749; SSE2-NEXT:    paddq %xmm0, %xmm1
5750; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
5751; SSE2-NEXT:    movdqa %xmm0, %xmm3
5752; SSE2-NEXT:    pand %xmm2, %xmm3
5753; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
5754; SSE2-NEXT:    por %xmm4, %xmm3
5755; SSE2-NEXT:    psrlq $32, %xmm0
5756; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
5757; SSE2-NEXT:    por %xmm5, %xmm0
5758; SSE2-NEXT:    movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
5759; SSE2-NEXT:    subpd %xmm6, %xmm0
5760; SSE2-NEXT:    addpd %xmm3, %xmm0
5761; SSE2-NEXT:    pand %xmm1, %xmm2
5762; SSE2-NEXT:    por %xmm4, %xmm2
5763; SSE2-NEXT:    psrlq $32, %xmm1
5764; SSE2-NEXT:    por %xmm5, %xmm1
5765; SSE2-NEXT:    subpd %xmm6, %xmm1
5766; SSE2-NEXT:    addpd %xmm2, %xmm1
5767; SSE2-NEXT:    movapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
5768; SSE2-NEXT:    addpd %xmm2, %xmm0
5769; SSE2-NEXT:    addpd %xmm2, %xmm1
5770; SSE2-NEXT:    movupd %xmm0, (%rdi)
5771; SSE2-NEXT:    movupd %xmm1, 16(%rdi)
5772; SSE2-NEXT:    retq
5773;
5774; SSE41-LABEL: PR43609:
5775; SSE41:       # %bb.0:
5776; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2,2]
5777; SSE41-NEXT:    paddq %xmm0, %xmm1
5778; SSE41-NEXT:    pxor %xmm2, %xmm2
5779; SSE41-NEXT:    movdqa %xmm0, %xmm3
5780; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
5781; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
5782; SSE41-NEXT:    por %xmm4, %xmm3
5783; SSE41-NEXT:    psrlq $32, %xmm0
5784; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
5785; SSE41-NEXT:    por %xmm5, %xmm0
5786; SSE41-NEXT:    movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
5787; SSE41-NEXT:    subpd %xmm6, %xmm0
5788; SSE41-NEXT:    addpd %xmm3, %xmm0
5789; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
5790; SSE41-NEXT:    por %xmm4, %xmm2
5791; SSE41-NEXT:    psrlq $32, %xmm1
5792; SSE41-NEXT:    por %xmm5, %xmm1
5793; SSE41-NEXT:    subpd %xmm6, %xmm1
5794; SSE41-NEXT:    addpd %xmm2, %xmm1
5795; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
5796; SSE41-NEXT:    addpd %xmm2, %xmm0
5797; SSE41-NEXT:    addpd %xmm2, %xmm1
5798; SSE41-NEXT:    movupd %xmm0, (%rdi)
5799; SSE41-NEXT:    movupd %xmm1, 16(%rdi)
5800; SSE41-NEXT:    retq
5801;
5802; AVX1-LABEL: PR43609:
5803; AVX1:       # %bb.0:
5804; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm1
5805; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
5806; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
5807; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
5808; AVX1-NEXT:    vpor %xmm4, %xmm3, %xmm3
5809; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
5810; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
5811; AVX1-NEXT:    vpor %xmm5, %xmm0, %xmm0
5812; AVX1-NEXT:    vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
5813; AVX1-NEXT:    vsubpd %xmm6, %xmm0, %xmm0
5814; AVX1-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
5815; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
5816; AVX1-NEXT:    vpor %xmm4, %xmm2, %xmm2
5817; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm1
5818; AVX1-NEXT:    vpor %xmm5, %xmm1, %xmm1
5819; AVX1-NEXT:    vsubpd %xmm6, %xmm1, %xmm1
5820; AVX1-NEXT:    vaddpd %xmm1, %xmm2, %xmm1
5821; AVX1-NEXT:    vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
5822; AVX1-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
5823; AVX1-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
5824; AVX1-NEXT:    vmovupd %xmm0, (%rdi)
5825; AVX1-NEXT:    vmovupd %xmm1, 16(%rdi)
5826; AVX1-NEXT:    retq
5827;
5828; AVX2-LABEL: PR43609:
5829; AVX2:       # %bb.0:
5830; AVX2-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm1
5831; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
5832; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
5833; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
5834; AVX2-NEXT:    vpor %xmm4, %xmm3, %xmm3
5835; AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
5836; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
5837; AVX2-NEXT:    vpor %xmm5, %xmm0, %xmm0
5838; AVX2-NEXT:    vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
5839; AVX2-NEXT:    vsubpd %xmm6, %xmm0, %xmm0
5840; AVX2-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
5841; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
5842; AVX2-NEXT:    vpor %xmm4, %xmm2, %xmm2
5843; AVX2-NEXT:    vpsrlq $32, %xmm1, %xmm1
5844; AVX2-NEXT:    vpor %xmm5, %xmm1, %xmm1
5845; AVX2-NEXT:    vsubpd %xmm6, %xmm1, %xmm1
5846; AVX2-NEXT:    vaddpd %xmm1, %xmm2, %xmm1
5847; AVX2-NEXT:    vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
5848; AVX2-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
5849; AVX2-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
5850; AVX2-NEXT:    vmovupd %xmm0, (%rdi)
5851; AVX2-NEXT:    vmovupd %xmm1, 16(%rdi)
5852; AVX2-NEXT:    retq
5853;
5854; AVX512F-LABEL: PR43609:
5855; AVX512F:       # %bb.0:
5856; AVX512F-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm1
5857; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
5858; AVX512F-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
5859; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
5860; AVX512F-NEXT:    vpor %xmm4, %xmm3, %xmm3
5861; AVX512F-NEXT:    vpsrlq $32, %xmm0, %xmm0
5862; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
5863; AVX512F-NEXT:    vpor %xmm5, %xmm0, %xmm0
5864; AVX512F-NEXT:    vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
5865; AVX512F-NEXT:    vsubpd %xmm6, %xmm0, %xmm0
5866; AVX512F-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
5867; AVX512F-NEXT:    vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
5868; AVX512F-NEXT:    vpor %xmm4, %xmm2, %xmm2
5869; AVX512F-NEXT:    vpsrlq $32, %xmm1, %xmm1
5870; AVX512F-NEXT:    vpor %xmm5, %xmm1, %xmm1
5871; AVX512F-NEXT:    vsubpd %xmm6, %xmm1, %xmm1
5872; AVX512F-NEXT:    vaddpd %xmm1, %xmm2, %xmm1
5873; AVX512F-NEXT:    vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
5874; AVX512F-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
5875; AVX512F-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
5876; AVX512F-NEXT:    vmovupd %xmm0, (%rdi)
5877; AVX512F-NEXT:    vmovupd %xmm1, 16(%rdi)
5878; AVX512F-NEXT:    retq
5879;
5880; AVX512VL-LABEL: PR43609:
5881; AVX512VL:       # %bb.0:
5882; AVX512VL-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm1
5883; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
5884; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
5885; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
5886; AVX512VL-NEXT:    vpor %xmm4, %xmm3, %xmm3
5887; AVX512VL-NEXT:    vpsrlq $32, %xmm0, %xmm0
5888; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
5889; AVX512VL-NEXT:    vpor %xmm5, %xmm0, %xmm0
5890; AVX512VL-NEXT:    vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
5891; AVX512VL-NEXT:    vsubpd %xmm6, %xmm0, %xmm0
5892; AVX512VL-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
5893; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
5894; AVX512VL-NEXT:    vpor %xmm4, %xmm2, %xmm2
5895; AVX512VL-NEXT:    vpsrlq $32, %xmm1, %xmm1
5896; AVX512VL-NEXT:    vpor %xmm5, %xmm1, %xmm1
5897; AVX512VL-NEXT:    vsubpd %xmm6, %xmm1, %xmm1
5898; AVX512VL-NEXT:    vaddpd %xmm1, %xmm2, %xmm1
5899; AVX512VL-NEXT:    vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
5900; AVX512VL-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
5901; AVX512VL-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
5902; AVX512VL-NEXT:    vmovupd %xmm0, (%rdi)
5903; AVX512VL-NEXT:    vmovupd %xmm1, 16(%rdi)
5904; AVX512VL-NEXT:    retq
5905;
5906; AVX512DQ-LABEL: PR43609:
5907; AVX512DQ:       # %bb.0:
5908; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
5909; AVX512DQ-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm1
5910; AVX512DQ-NEXT:    vcvtuqq2pd %zmm0, %zmm0
5911; AVX512DQ-NEXT:    vcvtuqq2pd %zmm1, %zmm1
5912; AVX512DQ-NEXT:    vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
5913; AVX512DQ-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
5914; AVX512DQ-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
5915; AVX512DQ-NEXT:    vmovupd %xmm0, (%rdi)
5916; AVX512DQ-NEXT:    vmovupd %xmm1, 16(%rdi)
5917; AVX512DQ-NEXT:    vzeroupper
5918; AVX512DQ-NEXT:    retq
5919;
5920; AVX512VLDQ-LABEL: PR43609:
5921; AVX512VLDQ:       # %bb.0:
5922; AVX512VLDQ-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm1
5923; AVX512VLDQ-NEXT:    vcvtuqq2pd %xmm0, %xmm0
5924; AVX512VLDQ-NEXT:    vcvtuqq2pd %xmm1, %xmm1
5925; AVX512VLDQ-NEXT:    vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
5926; AVX512VLDQ-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
5927; AVX512VLDQ-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
5928; AVX512VLDQ-NEXT:    vmovupd %xmm0, (%rdi)
5929; AVX512VLDQ-NEXT:    vmovupd %xmm1, 16(%rdi)
5930; AVX512VLDQ-NEXT:    retq
5931  %step.add.epil = add <2 x i64> %y, <i64 2, i64 2>
5932  %t20 = uitofp <2 x i64> %y to <2 x double>
5933  %t21 = uitofp <2 x i64> %step.add.epil to <2 x double>
5934  %t22 = fadd fast <2 x double> %t20, <double 5.0e-01, double 5.0e-01>
5935  %t23 = fadd fast <2 x double> %t21, <double 5.0e-01, double 5.0e-01>
5936  %t24 = getelementptr inbounds double, double* %x, i64 0
5937  %t25 = bitcast double* %t24 to <2 x double>*
5938  store <2 x double> %t22, <2 x double>* %t25, align 8
5939  %t26 = getelementptr inbounds double, double* %t24, i64 2
5940  %t27 = bitcast double* %t26 to <2 x double>*
5941  store <2 x double> %t23, <2 x double>* %t27, align 8
5942  ret void
5943}
5944
5945attributes #0 = { "unsafe-fp-math"="true" }
5946
5947