1; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math | FileCheck %s --check-prefix=CST --check-prefix=SSE2
2; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+sse4.1 | FileCheck %s --check-prefix=CST --check-prefix=SSE41
3; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx | FileCheck %s --check-prefix=CST --check-prefix=AVX
4; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
5; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
6; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VL
7
8; Check that the constants used in the vectors are the right ones.
9; SSE2: [[MASKCSTADDR:.LCPI[0-9_]+]]:
10; SSE2-NEXT: .long 65535 # 0xffff
11; SSE2-NEXT: .long 65535 # 0xffff
12; SSE2-NEXT: .long 65535 # 0xffff
13; SSE2-NEXT: .long 65535 # 0xffff
14
15; CST: [[LOWCSTADDR:.LCPI[0-9_]+]]:
16; CST-NEXT: .long 1258291200 # 0x4b000000
17; CST-NEXT: .long 1258291200 # 0x4b000000
18; CST-NEXT: .long 1258291200 # 0x4b000000
19; CST-NEXT: .long 1258291200 # 0x4b000000
20
21; CST: [[HIGHCSTADDR:.LCPI[0-9_]+]]:
22; CST-NEXT: .long 1392508928 # 0x53000000
23; CST-NEXT: .long 1392508928 # 0x53000000
24; CST-NEXT: .long 1392508928 # 0x53000000
25; CST-NEXT: .long 1392508928 # 0x53000000
26
27; CST: [[MAGICCSTADDR:.LCPI[0-9_]+]]:
28; CST-NEXT: .long 0x53000080 # float 5.49764202E+11
29; CST-NEXT: .long 0x53000080 # float 5.49764202E+11
30; CST-NEXT: .long 0x53000080 # float 5.49764202E+11
31; CST-NEXT: .long 0x53000080 # float 5.49764202E+11
32
33; AVX2: [[LOWCSTADDR:.LCPI[0-9_]+]]:
34; AVX2-NEXT: .long 1258291200 # 0x4b000000
35
36; AVX2: [[HIGHCSTADDR:.LCPI[0-9_]+]]:
37; AVX2-NEXT: .long 1392508928 # 0x53000000
38
39; AVX2: [[MAGICCSTADDR:.LCPI[0-9_]+]]:
40; AVX2-NEXT: .long 0x53000080 # float 5.49764202E+11
41
; Convert <4 x i32> to <4 x float> with uitofp and check the instructions
; selected for each run configuration above.
; Without AVX-512 the expected sequence handles each 32-bit lane as two 16-bit
; halves. The low half is combined with 0x4b000000 (2^23 as a float) and the
; shifted high half with 0x53000000 (2^39 as a float); then 0x53000080
; (2^39 + 2^23) is subtracted from the high part and the low part is added.
; With avx512f or avx512vl a single vcvtudq2ps is expected instead.
42define <4 x float> @test_uitofp_v4i32_to_v4f32(<4 x i32> %arg) {
43; SSE2-LABEL: test_uitofp_v4i32_to_v4f32:
44; SSE2: movdqa [[MASKCSTADDR]](%rip), [[MASK:%xmm[0-9]+]]
45; SSE2-NEXT: pand %xmm0, [[MASK]]
46; After this instruction, MASK will have the value of the low parts
47; of the vector.
48; SSE2-NEXT: por [[LOWCSTADDR]](%rip), [[MASK]]
49; SSE2-NEXT: psrld $16, %xmm0
50; SSE2-NEXT: por [[HIGHCSTADDR]](%rip), %xmm0
51; SSE2-NEXT: subps [[MAGICCSTADDR]](%rip), %xmm0
52; SSE2-NEXT: addps [[MASK]], %xmm0
53; SSE2-NEXT: retq
54;
55; Currently we commute the arguments of the first blend, but this could be
56; improved to match the lowering of the second blend.
; Blend immediates used below: $85 (0x55) selects the even 16-bit words (the
; low half of every lane) and $170 (0xAA) the odd words (the high half); the
; selected words are taken from the operand that follows the immediate.
57; SSE41-LABEL: test_uitofp_v4i32_to_v4f32:
58; SSE41: movdqa [[LOWCSTADDR]](%rip), [[LOWVEC:%xmm[0-9]+]]
59; SSE41-NEXT: pblendw $85, %xmm0, [[LOWVEC]]
60; SSE41-NEXT: psrld $16, %xmm0
61; SSE41-NEXT: pblendw $170, [[HIGHCSTADDR]](%rip), %xmm0
62; SSE41-NEXT: subps [[MAGICCSTADDR]](%rip), %xmm0
63; SSE41-NEXT: addps [[LOWVEC]], %xmm0
64; SSE41-NEXT: retq
65;
66; AVX-LABEL: test_uitofp_v4i32_to_v4f32:
67; AVX: vpblendw $170, [[LOWCSTADDR]](%rip), %xmm0, [[LOWVEC:%xmm[0-9]+]]
68; AVX-NEXT: vpsrld $16, %xmm0, [[SHIFTVEC:%xmm[0-9]+]]
69; AVX-NEXT: vpblendw $170, [[HIGHCSTADDR]](%rip), [[SHIFTVEC]], [[HIGHVEC:%xmm[0-9]+]]
70; AVX-NEXT: vsubps [[MAGICCSTADDR]](%rip), [[HIGHVEC]], [[TMP:%xmm[0-9]+]]
71; AVX-NEXT: vaddps [[TMP]], [[LOWVEC]], %xmm0
72; AVX-NEXT: retq
73;
74; The lowering for AVX2 is a bit messy, because we select broadcast
75; instructions, instead of folding the constant loads.
76; AVX2-LABEL: test_uitofp_v4i32_to_v4f32:
77; AVX2: vpbroadcastd [[LOWCSTADDR]](%rip), [[LOWCST:%xmm[0-9]+]]
78; AVX2-NEXT: vpblendw $170, [[LOWCST]], %xmm0, [[LOWVEC:%xmm[0-9]+]]
79; AVX2-NEXT: vpsrld $16, %xmm0, [[SHIFTVEC:%xmm[0-9]+]]
80; AVX2-NEXT: vpbroadcastd [[HIGHCSTADDR]](%rip), [[HIGHCST:%xmm[0-9]+]]
81; AVX2-NEXT: vpblendw $170, [[HIGHCST]], [[SHIFTVEC]], [[HIGHVEC:%xmm[0-9]+]]
82; AVX2-NEXT: vbroadcastss [[MAGICCSTADDR]](%rip), [[MAGICCST:%xmm[0-9]+]]
83; AVX2-NEXT: vsubps [[MAGICCST]], [[HIGHVEC]], [[TMP:%xmm[0-9]+]]
84; AVX2-NEXT: vaddps [[TMP]], [[LOWVEC]], %xmm0
85; AVX2-NEXT: retq
86;
; With avx512f alone the conversion is expected on the full zmm0 register
; (see the kill annotations around it), followed by vzeroupper.
87; AVX512F-LABEL: test_uitofp_v4i32_to_v4f32:
88; AVX512F:       # %bb.0:
89; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
90; AVX512F-NEXT:    vcvtudq2ps %zmm0, %zmm0
91; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
92; AVX512F-NEXT:    vzeroupper
93; AVX512F-NEXT:    retq
94;
; With avx512vl the conversion is expected directly on xmm0.
95; AVX512VL-LABEL: test_uitofp_v4i32_to_v4f32:
96; AVX512VL:       # %bb.0:
97; AVX512VL-NEXT:    vcvtudq2ps %xmm0, %xmm0
98; AVX512VL-NEXT:    retq
99  %tmp = uitofp <4 x i32> %arg to <4 x float>
100  ret <4 x float> %tmp
101}
102
103; Match the AVX2 constants used in the next function
104; AVX2: [[LOWCSTADDR:.LCPI[0-9_]+]]:
105; AVX2-NEXT: .long 1258291200 # 0x4b000000
106
107; AVX2: [[HIGHCSTADDR:.LCPI[0-9_]+]]:
108; AVX2-NEXT: .long 1392508928 # 0x53000000
109
110; AVX2: [[MAGICCSTADDR:.LCPI[0-9_]+]]:
111; AVX2-NEXT: .long 0x53000080 # float 5.49764202E+11
112
; Convert <8 x i32> to <8 x float> with uitofp and check the instructions
; selected for each run configuration above.
113define <8 x float> @test_uitofp_v8i32_to_v8f32(<8 x i32> %arg) {
114; Legalization will split this into 2 x <4 x i32> on anything prior to AVX.
115; The constants used in the vector instructions are shared between the
116; two sequences of instructions.
117;
118; SSE2-LABEL: test_uitofp_v8i32_to_v8f32:
119; SSE2: movdqa {{.*#+}} [[MASK:xmm[0-9]+]] = [65535,65535,65535,65535]
120; SSE2-NEXT: movdqa %xmm0, [[VECLOW:%xmm[0-9]+]]
121; SSE2-NEXT: pand %[[MASK]], [[VECLOW]]
122; SSE2-NEXT: movdqa {{.*#+}} [[LOWCST:xmm[0-9]+]] = [1258291200,1258291200,1258291200,1258291200]
123; SSE2-NEXT: por %[[LOWCST]], [[VECLOW]]
124; SSE2-NEXT: psrld $16, %xmm0
125; SSE2-NEXT: movdqa {{.*#+}} [[HIGHCST:xmm[0-9]+]] = [1392508928,1392508928,1392508928,1392508928]
126; SSE2-NEXT: por %[[HIGHCST]], %xmm0
127; SSE2-NEXT: movaps {{.*#+}} [[MAGICCST:xmm[0-9]+]] = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
128; SSE2-NEXT: subps %[[MAGICCST]], %xmm0
129; SSE2-NEXT: addps [[VECLOW]], %xmm0
130; MASK is the low vector of the second part after this point.
131; SSE2-NEXT: pand %xmm1, %[[MASK]]
132; SSE2-NEXT: por %[[LOWCST]], %[[MASK]]
133; SSE2-NEXT: psrld $16, %xmm1
134; SSE2-NEXT: por %[[HIGHCST]], %xmm1
135; SSE2-NEXT: subps %[[MAGICCST]], %xmm1
136; SSE2-NEXT: addps %[[MASK]], %xmm1
137; SSE2-NEXT: retq
138;
139; SSE41-LABEL: test_uitofp_v8i32_to_v8f32:
140; SSE41: movdqa {{.*#+}} [[LOWCST:xmm[0-9]+]] = [1258291200,1258291200,1258291200,1258291200]
141; SSE41-NEXT: movdqa %xmm0, [[VECLOW:%xmm[0-9]+]]
142; SSE41-NEXT: pblendw $170, %[[LOWCST]], [[VECLOW]]
143; SSE41-NEXT: psrld $16, %xmm0
144; SSE41-NEXT: movdqa {{.*#+}} [[HIGHCST:xmm[0-9]+]] = [1392508928,1392508928,1392508928,1392508928]
145; SSE41-NEXT: pblendw $170, %[[HIGHCST]], %xmm0
146; SSE41-NEXT: movaps {{.*#+}} [[MAGICCST:xmm[0-9]+]] = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
147; SSE41-NEXT: subps %[[MAGICCST]], %xmm0
148; SSE41-NEXT: addps [[VECLOW]], %xmm0
149; LOWCST is the low vector of the second part after this point.
150; The operands of the blend are inverted because we reuse xmm1
151; in the next shift.
152; SSE41-NEXT: pblendw $85, %xmm1, %[[LOWCST]]
153; SSE41-NEXT: psrld $16, %xmm1
154; SSE41-NEXT: pblendw $170, %[[HIGHCST]], %xmm1
155; SSE41-NEXT: subps %[[MAGICCST]], %xmm1
156; SSE41-NEXT: addps %[[LOWCST]], %xmm1
157; SSE41-NEXT: retq
158;
159; Test that we are not lowering uinttofp to scalars
; NOTE(review): the plain avx run has no -LABEL directive for this function,
; so the two checks below are not anchored to its output; they only require
; that no cvtsd2ss appears before the next retq. Confirm that is intended.
160; AVX-NOT: cvtsd2ss
161; AVX: retq
162;
163; AVX2-LABEL: test_uitofp_v8i32_to_v8f32:
164; AVX2: vpbroadcastd [[LOWCSTADDR]](%rip), [[LOWCST:%ymm[0-9]+]]
165; AVX2-NEXT: vpblendw $170, [[LOWCST]], %ymm0, [[LOWVEC:%ymm[0-9]+]]
166; AVX2-NEXT: vpsrld $16, %ymm0, [[SHIFTVEC:%ymm[0-9]+]]
167; AVX2-NEXT: vpbroadcastd [[HIGHCSTADDR]](%rip), [[HIGHCST:%ymm[0-9]+]]
168; AVX2-NEXT: vpblendw $170, [[HIGHCST]], [[SHIFTVEC]], [[HIGHVEC:%ymm[0-9]+]]
169; AVX2-NEXT: vbroadcastss [[MAGICCSTADDR]](%rip), [[MAGICCST:%ymm[0-9]+]]
170; AVX2-NEXT: vsubps [[MAGICCST]], [[HIGHVEC]], [[TMP:%ymm[0-9]+]]
171; AVX2-NEXT: vaddps [[TMP]], [[LOWVEC]], %ymm0
172; AVX2-NEXT: retq
173;
; With avx512f alone the conversion is expected on the full zmm0 register;
; the kill annotations around it are matched only by their "# kill" prefix.
174; AVX512F-LABEL: test_uitofp_v8i32_to_v8f32:
175; AVX512F:       # %bb.0:
176; AVX512F-NEXT:    # kill
177; AVX512F-NEXT:    vcvtudq2ps %zmm0, %zmm0
178; AVX512F-NEXT:    # kill
179; AVX512F-NEXT:    retq
180;
; With avx512vl the conversion is expected directly on ymm0.
181; AVX512VL-LABEL: test_uitofp_v8i32_to_v8f32:
182; AVX512VL:       # %bb.0:
183; AVX512VL-NEXT:    vcvtudq2ps %ymm0, %ymm0
184; AVX512VL-NEXT:    retq
185  %tmp = uitofp <8 x i32> %arg to <8 x float>
186  ret <8 x float> %tmp
187}
188