; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512

;
; Half to Float
;

; Scalar case: the i16 argument is reinterpreted as a half and extended to
; float. One expected sequence is shared by every configuration in the file
; header: sign-extend %di into %eax, move it into %xmm0, and convert it with
; vcvtph2ps.
define float @cvt_i16_to_f32(i16 %a0) {
; ALL-LABEL: cvt_i16_to_f32:
; ALL:       # BB#0:
; ALL-NEXT:    movswl %di, %eax
; ALL-NEXT:    vmovd %eax, %xmm0
; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
; ALL-NEXT:    retq
  %1 = bitcast i16 %a0 to half
  %2 = fpext half %1 to float
  ret float %2
}
; Four-lane case: <4 x i16> is reinterpreted as <4 x half> and extended to
; <4 x float>. The expected code is scalarized: a vpshufb gathers bytes 0-1,
; 4-5, 8-9 and 12-13 into the low 64 bits, the value is moved to %rax, each
; 16-bit piece is isolated with shifts and sign-extended, converted on its own
; with vcvtph2ps, and the four results are merged with vinsertps.
define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) {
; ALL-LABEL: cvt_4i16_to_4f32:
; ALL:       # BB#0:
; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; ALL-NEXT:    vmovq %xmm0, %rax
; ALL-NEXT:    movq %rax, %rcx
; ALL-NEXT:    movq %rax, %rdx
; ALL-NEXT:    movswl %ax, %esi
; ALL-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
; ALL-NEXT:    shrl $16, %eax
; ALL-NEXT:    shrq $32, %rcx
; ALL-NEXT:    shrq $48, %rdx
; ALL-NEXT:    movswl %dx, %edx
; ALL-NEXT:    vmovd %edx, %xmm0
; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
; ALL-NEXT:    movswl %cx, %ecx
; ALL-NEXT:    vmovd %ecx, %xmm1
; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
; ALL-NEXT:    cwtl
; ALL-NEXT:    vmovd %eax, %xmm2
; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
; ALL-NEXT:    vmovd %esi, %xmm3
; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; ALL-NEXT:    retq
  %1 = bitcast <4 x i16> %a0 to <4 x half>
  %2 = fpext <4 x half> %1 to <4 x float>
  ret <4 x float> %2
}
; Low-half case: a shufflevector selects elements 0-3 of an <8 x i16>
; argument, which are then reinterpreted as <4 x half> and extended to
; <4 x float>. The expected code reads the low 64 bits of %xmm0 directly with
; vmovq (no byte shuffle first), then splits, sign-extends, converts each
; 16-bit piece with vcvtph2ps and merges the results with vinsertps.
define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) {
; ALL-LABEL: cvt_8i16_to_4f32:
; ALL:       # BB#0:
; ALL-NEXT:    vmovq %xmm0, %rax
; ALL-NEXT:    movq %rax, %rcx
; ALL-NEXT:    movq %rax, %rdx
; ALL-NEXT:    movswl %ax, %esi
; ALL-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
; ALL-NEXT:    shrl $16, %eax
; ALL-NEXT:    shrq $32, %rcx
; ALL-NEXT:    shrq $48, %rdx
; ALL-NEXT:    movswl %dx, %edx
; ALL-NEXT:    vmovd %edx, %xmm0
; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
; ALL-NEXT:    movswl %cx, %ecx
; ALL-NEXT:    vmovd %ecx, %xmm1
; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
; ALL-NEXT:    cwtl
; ALL-NEXT:    vmovd %eax, %xmm2
; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
; ALL-NEXT:    vmovd %esi, %xmm3
; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; ALL-NEXT:    retq
  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = bitcast <4 x i16> %1 to <4 x half>
  %3 = fpext <4 x half> %2 to <4 x float>
  ret <4 x float> %3
}
; Eight-lane case: <8 x i16> is reinterpreted as <8 x half> and extended to
; <8 x float>. Expectations are listed per feature level. In each one the two
; 64-bit halves of %xmm0 are moved to general-purpose registers (vpextrq $1
; and vmovq), every 16-bit piece is sign-extended and converted on its own
; with vcvtph2ps, and two 4-float vectors are built with vinsertps and joined
; into %ymm0. The AVX1 and AVX2 expectations join them with vinsertf128; the
; AVX512 expectation uses vinserti128 instead.
define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) {
; AVX1-LABEL: cvt_8i16_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpextrq $1, %xmm0, %rdx
; AVX1-NEXT:    movq %rdx, %r8
; AVX1-NEXT:    movq %rdx, %r10
; AVX1-NEXT:    movswl %dx, %r9d
; AVX1-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<kill>
; AVX1-NEXT:    shrl $16, %edx
; AVX1-NEXT:    shrq $32, %r8
; AVX1-NEXT:    shrq $48, %r10
; AVX1-NEXT:    vmovq %xmm0, %rdi
; AVX1-NEXT:    movq %rdi, %rax
; AVX1-NEXT:    movq %rdi, %rsi
; AVX1-NEXT:    movswl %di, %ecx
; AVX1-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<kill>
; AVX1-NEXT:    shrl $16, %edi
; AVX1-NEXT:    shrq $32, %rax
; AVX1-NEXT:    shrq $48, %rsi
; AVX1-NEXT:    movswl %si, %esi
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm1
; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT:    movswl %di, %eax
; AVX1-NEXT:    vmovd %eax, %xmm2
; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT:    vmovd %ecx, %xmm3
; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT:    movswl %r10w, %eax
; AVX1-NEXT:    vmovd %eax, %xmm4
; AVX1-NEXT:    vcvtph2ps %xmm4, %xmm4
; AVX1-NEXT:    movswl %r8w, %eax
; AVX1-NEXT:    vmovd %eax, %xmm5
; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX1-NEXT:    movswl %dx, %eax
; AVX1-NEXT:    vmovd %eax, %xmm6
; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX1-NEXT:    vmovd %r9d, %xmm7
; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX1-NEXT:    vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: cvt_8i16_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpextrq $1, %xmm0, %rdx
; AVX2-NEXT:    movq %rdx, %r8
; AVX2-NEXT:    movq %rdx, %r10
; AVX2-NEXT:    movswl %dx, %r9d
; AVX2-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<kill>
; AVX2-NEXT:    shrl $16, %edx
; AVX2-NEXT:    shrq $32, %r8
; AVX2-NEXT:    shrq $48, %r10
; AVX2-NEXT:    vmovq %xmm0, %rdi
; AVX2-NEXT:    movq %rdi, %rax
; AVX2-NEXT:    movq %rdi, %rsi
; AVX2-NEXT:    movswl %di, %ecx
; AVX2-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<kill>
; AVX2-NEXT:    shrl $16, %edi
; AVX2-NEXT:    shrq $32, %rax
; AVX2-NEXT:    shrq $48, %rsi
; AVX2-NEXT:    movswl %si, %esi
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT:    cwtl
; AVX2-NEXT:    vmovd %eax, %xmm1
; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT:    movswl %di, %eax
; AVX2-NEXT:    vmovd %eax, %xmm2
; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT:    vmovd %ecx, %xmm3
; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT:    movswl %r10w, %eax
; AVX2-NEXT:    vmovd %eax, %xmm4
; AVX2-NEXT:    vcvtph2ps %xmm4, %xmm4
; AVX2-NEXT:    movswl %r8w, %eax
; AVX2-NEXT:    vmovd %eax, %xmm5
; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX2-NEXT:    movswl %dx, %eax
; AVX2-NEXT:    vmovd %eax, %xmm6
; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX2-NEXT:    vmovd %r9d, %xmm7
; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX2-NEXT:    vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: cvt_8i16_to_8f32:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpextrq $1, %xmm0, %rdx
; AVX512-NEXT:    movq %rdx, %r8
; AVX512-NEXT:    movq %rdx, %r10
; AVX512-NEXT:    movswl %dx, %r9d
; AVX512-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<kill>
; AVX512-NEXT:    shrl $16, %edx
; AVX512-NEXT:    shrq $32, %r8
; AVX512-NEXT:    shrq $48, %r10
; AVX512-NEXT:    vmovq %xmm0, %rdi
; AVX512-NEXT:    movq %rdi, %rax
; AVX512-NEXT:    movq %rdi, %rsi
; AVX512-NEXT:    movswl %di, %ecx
; AVX512-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<kill>
; AVX512-NEXT:    shrl $16, %edi
; AVX512-NEXT:    shrq $32, %rax
; AVX512-NEXT:    shrq $48, %rsi
; AVX512-NEXT:    movswl %si, %esi
; AVX512-NEXT:    vmovd %esi, %xmm0
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    cwtl
; AVX512-NEXT:    vmovd %eax, %xmm1
; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT:    movswl %di, %eax
; AVX512-NEXT:    vmovd %eax, %xmm2
; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT:    vmovd %ecx, %xmm3
; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512-NEXT:    movswl %r10w, %eax
; AVX512-NEXT:    vmovd %eax, %xmm4
; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm4
; AVX512-NEXT:    movswl %r8w, %eax
; AVX512-NEXT:    vmovd %eax, %xmm5
; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX512-NEXT:    movswl %dx, %eax
; AVX512-NEXT:    vmovd %eax, %xmm6
; AVX512-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX512-NEXT:    vmovd %r9d, %xmm7
; AVX512-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX512-NEXT:    vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %1 = bitcast <8 x i16> %a0 to <8 x half>
  %2 = fpext <8 x half> %1 to <8 x float>
  ret <8 x float> %2
}
; Sixteen-lane case: <16 x i16> is reinterpreted as <16 x half> and extended
; to <16 x float>. Expectations are listed per feature level. In each one the
; upper 128 bits of %ymm0 are split off first (vextractf128 for AVX1,
; vextracti128 for AVX2 and AVX512), all sixteen 16-bit pieces are moved into
; their own xmm registers after sign extension, the sixteen vcvtph2ps
; conversions are issued back to back, and the results are merged with
; vinsertps. The AVX1 and AVX2 expectations leave the result in the
; %ymm0/%ymm1 pair; the AVX512 expectation additionally joins both halves
; into %zmm0 with vinsertf64x4.
define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) {
; AVX1-LABEL: cvt_16i16_to_16f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vmovq %xmm4, %rax
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $48, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm8
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $32, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm9
; AVX1-NEXT:    movswl %ax, %ecx
; AVX1-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm10
; AVX1-NEXT:    vpextrq $1, %xmm4, %rax
; AVX1-NEXT:    vmovd %ecx, %xmm11
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $48, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm12
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $32, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm13
; AVX1-NEXT:    movswl %ax, %ecx
; AVX1-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm14
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vmovd %ecx, %xmm15
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $48, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm2
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $32, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm3
; AVX1-NEXT:    movswl %ax, %ecx
; AVX1-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm4
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
; AVX1-NEXT:    vmovd %ecx, %xmm0
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $48, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm5
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $32, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm6
; AVX1-NEXT:    movl %eax, %ecx
; AVX1-NEXT:    shrl $16, %ecx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm7
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm1
; AVX1-NEXT:    vcvtph2ps %xmm8, %xmm8
; AVX1-NEXT:    vcvtph2ps %xmm9, %xmm9
; AVX1-NEXT:    vcvtph2ps %xmm10, %xmm10
; AVX1-NEXT:    vcvtph2ps %xmm11, %xmm11
; AVX1-NEXT:    vcvtph2ps %xmm12, %xmm12
; AVX1-NEXT:    vcvtph2ps %xmm13, %xmm13
; AVX1-NEXT:    vcvtph2ps %xmm14, %xmm14
; AVX1-NEXT:    vcvtph2ps %xmm15, %xmm15
; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT:    vcvtph2ps %xmm4, %xmm4
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: cvt_16i16_to_16f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT:    vmovq %xmm4, %rax
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    shrq $48, %rcx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm8
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    shrq $32, %rcx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm9
; AVX2-NEXT:    movswl %ax, %ecx
; AVX2-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
; AVX2-NEXT:    shrl $16, %eax
; AVX2-NEXT:    cwtl
; AVX2-NEXT:    vmovd %eax, %xmm10
; AVX2-NEXT:    vpextrq $1, %xmm4, %rax
; AVX2-NEXT:    vmovd %ecx, %xmm11
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    shrq $48, %rcx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm12
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    shrq $32, %rcx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm13
; AVX2-NEXT:    movswl %ax, %ecx
; AVX2-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
; AVX2-NEXT:    shrl $16, %eax
; AVX2-NEXT:    cwtl
; AVX2-NEXT:    vmovd %eax, %xmm14
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vmovd %ecx, %xmm15
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    shrq $48, %rcx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm2
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    shrq $32, %rcx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm3
; AVX2-NEXT:    movswl %ax, %ecx
; AVX2-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
; AVX2-NEXT:    shrl $16, %eax
; AVX2-NEXT:    cwtl
; AVX2-NEXT:    vmovd %eax, %xmm4
; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; AVX2-NEXT:    vmovd %ecx, %xmm0
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    shrq $48, %rcx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm5
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    shrq $32, %rcx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm6
; AVX2-NEXT:    movl %eax, %ecx
; AVX2-NEXT:    shrl $16, %ecx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm7
; AVX2-NEXT:    cwtl
; AVX2-NEXT:    vmovd %eax, %xmm1
; AVX2-NEXT:    vcvtph2ps %xmm8, %xmm8
; AVX2-NEXT:    vcvtph2ps %xmm9, %xmm9
; AVX2-NEXT:    vcvtph2ps %xmm10, %xmm10
; AVX2-NEXT:    vcvtph2ps %xmm11, %xmm11
; AVX2-NEXT:    vcvtph2ps %xmm12, %xmm12
; AVX2-NEXT:    vcvtph2ps %xmm13, %xmm13
; AVX2-NEXT:    vcvtph2ps %xmm14, %xmm14
; AVX2-NEXT:    vcvtph2ps %xmm15, %xmm15
; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT:    vcvtph2ps %xmm4, %xmm4
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: cvt_16i16_to_16f32:
; AVX512:       # BB#0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm10
; AVX512-NEXT:    vmovq %xmm0, %rax
; AVX512-NEXT:    movq %rax, %rcx
; AVX512-NEXT:    shrq $48, %rcx
; AVX512-NEXT:    movswl %cx, %ecx
; AVX512-NEXT:    vmovd %ecx, %xmm8
; AVX512-NEXT:    movq %rax, %rcx
; AVX512-NEXT:    shrq $32, %rcx
; AVX512-NEXT:    movswl %cx, %ecx
; AVX512-NEXT:    vmovd %ecx, %xmm9
; AVX512-NEXT:    movswl %ax, %ecx
; AVX512-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
; AVX512-NEXT:    shrl $16, %eax
; AVX512-NEXT:    cwtl
; AVX512-NEXT:    vmovd %eax, %xmm11
; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
; AVX512-NEXT:    vmovd %ecx, %xmm12
; AVX512-NEXT:    movq %rax, %rcx
; AVX512-NEXT:    shrq $48, %rcx
; AVX512-NEXT:    movswl %cx, %ecx
; AVX512-NEXT:    vmovd %ecx, %xmm13
; AVX512-NEXT:    movq %rax, %rcx
; AVX512-NEXT:    shrq $32, %rcx
; AVX512-NEXT:    movswl %cx, %ecx
; AVX512-NEXT:    vmovd %ecx, %xmm14
; AVX512-NEXT:    movswl %ax, %ecx
; AVX512-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
; AVX512-NEXT:    shrl $16, %eax
; AVX512-NEXT:    cwtl
; AVX512-NEXT:    vmovd %eax, %xmm15
; AVX512-NEXT:    vmovq %xmm10, %rax
; AVX512-NEXT:    vmovd %ecx, %xmm2
; AVX512-NEXT:    movq %rax, %rcx
; AVX512-NEXT:    shrq $48, %rcx
; AVX512-NEXT:    movswl %cx, %ecx
; AVX512-NEXT:    vmovd %ecx, %xmm3
; AVX512-NEXT:    movq %rax, %rcx
; AVX512-NEXT:    shrq $32, %rcx
; AVX512-NEXT:    movswl %cx, %ecx
; AVX512-NEXT:    vmovd %ecx, %xmm1
; AVX512-NEXT:    movswl %ax, %ecx
; AVX512-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
; AVX512-NEXT:    shrl $16, %eax
; AVX512-NEXT:    cwtl
; AVX512-NEXT:    vmovd %eax, %xmm4
; AVX512-NEXT:    vpextrq $1, %xmm10, %rax
; AVX512-NEXT:    vmovd %ecx, %xmm10
; AVX512-NEXT:    movq %rax, %rcx
; AVX512-NEXT:    shrq $48, %rcx
; AVX512-NEXT:    movswl %cx, %ecx
; AVX512-NEXT:    vmovd %ecx, %xmm5
; AVX512-NEXT:    movq %rax, %rcx
; AVX512-NEXT:    shrq $32, %rcx
; AVX512-NEXT:    movswl %cx, %ecx
; AVX512-NEXT:    vmovd %ecx, %xmm6
; AVX512-NEXT:    movl %eax, %ecx
; AVX512-NEXT:    shrl $16, %ecx
; AVX512-NEXT:    movswl %cx, %ecx
; AVX512-NEXT:    vmovd %ecx, %xmm7
; AVX512-NEXT:    cwtl
; AVX512-NEXT:    vmovd %eax, %xmm0
; AVX512-NEXT:    vcvtph2ps %xmm8, %xmm8
; AVX512-NEXT:    vcvtph2ps %xmm9, %xmm9
; AVX512-NEXT:    vcvtph2ps %xmm11, %xmm11
; AVX512-NEXT:    vcvtph2ps %xmm12, %xmm12
; AVX512-NEXT:    vcvtph2ps %xmm13, %xmm13
; AVX512-NEXT:    vcvtph2ps %xmm14, %xmm14
; AVX512-NEXT:    vcvtph2ps %xmm15, %xmm15
; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm4
; AVX512-NEXT:    vcvtph2ps %xmm10, %xmm10
; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX512-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX512-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[2,3]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[0]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[2,3]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm4[0,1],xmm1[0],xmm4[3]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[2,3]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-NEXT:    retq
  %1 = bitcast <16 x i16> %a0 to <16 x half>
  %2 = fpext <16 x half> %1 to <16 x float>
  ret <16 x float> %2
}

;
; Half to Float (Load)
;

; Scalar load case: an i16 is loaded through %a0, reinterpreted as half and
; extended to float. The expected code, shared by every configuration, folds
; the load into the sign extension (movswl from (%rdi)) before the vmovd and
; vcvtph2ps.
define float @load_cvt_i16_to_f32(i16* %a0) {
; ALL-LABEL: load_cvt_i16_to_f32:
; ALL:       # BB#0:
; ALL-NEXT:    movswl (%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm0
; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
; ALL-NEXT:    retq
  %1 = load i16, i16* %a0
  %2 = bitcast i16 %1 to half
  %3 = fpext half %2 to float
  ret float %3
}
; Four-lane load case: a <4 x i16> is loaded through %a0, reinterpreted as
; <4 x half> and extended to <4 x float>. The expected code, shared by every
; configuration, performs four separate sign-extending 16-bit loads (offsets
; 0, 2, 4 and 6 from %rdi), converts each with vcvtph2ps and merges the
; results with vinsertps.
define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) {
; ALL-LABEL: load_cvt_4i16_to_4f32:
; ALL:       # BB#0:
; ALL-NEXT:    movswl 6(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm0
; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
; ALL-NEXT:    movswl 4(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm1
; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
; ALL-NEXT:    movswl (%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm2
; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
; ALL-NEXT:    movswl 2(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm3
; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; ALL-NEXT:    retq
  %1 = load <4 x i16>, <4 x i16>* %a0
  %2 = bitcast <4 x i16> %1 to <4 x half>
  %3 = fpext <4 x half> %2 to <4 x float>
  ret <4 x float> %3
}
; Low-half load case: an <8 x i16> is loaded through %a0, a shufflevector
; keeps elements 0-3, and those are reinterpreted as <4 x half> and extended
; to <4 x float>. The expected code, shared by every configuration, reads
; only 64 bits with a single movq from (%rdi), then isolates each 16-bit
; piece with shifts, sign-extends it, converts it with vcvtph2ps and merges
; the results with vinsertps.
define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) {
; ALL-LABEL: load_cvt_8i16_to_4f32:
; ALL:       # BB#0:
; ALL-NEXT:    movq (%rdi), %rax
; ALL-NEXT:    movq %rax, %rcx
; ALL-NEXT:    movq %rax, %rdx
; ALL-NEXT:    movswl %ax, %esi
; ALL-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
; ALL-NEXT:    shrl $16, %eax
; ALL-NEXT:    shrq $32, %rcx
; ALL-NEXT:    shrq $48, %rdx
; ALL-NEXT:    movswl %dx, %edx
; ALL-NEXT:    vmovd %edx, %xmm0
; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
; ALL-NEXT:    movswl %cx, %ecx
; ALL-NEXT:    vmovd %ecx, %xmm1
; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
; ALL-NEXT:    cwtl
; ALL-NEXT:    vmovd %eax, %xmm2
; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
; ALL-NEXT:    vmovd %esi, %xmm3
; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; ALL-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %a0
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = bitcast <4 x i16> %2 to <4 x half>
  %4 = fpext <4 x half> %3 to <4 x float>
  ret <4 x float> %4
}
; Eight-lane load case: an <8 x i16> is loaded through %a0, reinterpreted as
; <8 x half> and extended to <8 x float>. Expectations are listed per feature
; level. In each one there are eight separate sign-extending 16-bit loads
; (offsets 0 through 14 from %rdi), each converted with vcvtph2ps, and two
; 4-float vectors are built with vinsertps and joined into %ymm0. The AVX1
; and AVX2 expectations join them with vinsertf128; the AVX512 expectation
; uses vinserti128 instead.
define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) {
; AVX1-LABEL: load_cvt_8i16_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    movswl 6(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT:    movswl 4(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm1
; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT:    movswl (%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm2
; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT:    movswl 2(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm3
; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT:    movswl 14(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm4
; AVX1-NEXT:    vcvtph2ps %xmm4, %xmm4
; AVX1-NEXT:    movswl 12(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm5
; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX1-NEXT:    movswl 8(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm6
; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX1-NEXT:    movswl 10(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm7
; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX1-NEXT:    vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_cvt_8i16_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    movswl 6(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT:    movswl 4(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm1
; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT:    movswl (%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm2
; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT:    movswl 2(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm3
; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT:    movswl 14(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm4
; AVX2-NEXT:    vcvtph2ps %xmm4, %xmm4
; AVX2-NEXT:    movswl 12(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm5
; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX2-NEXT:    movswl 8(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm6
; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX2-NEXT:    movswl 10(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm7
; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX2-NEXT:    vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: load_cvt_8i16_to_8f32:
; AVX512:       # BB#0:
; AVX512-NEXT:    movswl 6(%rdi), %eax
; AVX512-NEXT:    vmovd %eax, %xmm0
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    movswl 4(%rdi), %eax
; AVX512-NEXT:    vmovd %eax, %xmm1
; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT:    movswl (%rdi), %eax
; AVX512-NEXT:    vmovd %eax, %xmm2
; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT:    movswl 2(%rdi), %eax
; AVX512-NEXT:    vmovd %eax, %xmm3
; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512-NEXT:    movswl 14(%rdi), %eax
; AVX512-NEXT:    vmovd %eax, %xmm4
; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm4
; AVX512-NEXT:    movswl 12(%rdi), %eax
; AVX512-NEXT:    vmovd %eax, %xmm5
; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX512-NEXT:    movswl 8(%rdi), %eax
; AVX512-NEXT:    vmovd %eax, %xmm6
; AVX512-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX512-NEXT:    movswl 10(%rdi), %eax
; AVX512-NEXT:    vmovd %eax, %xmm7
; AVX512-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX512-NEXT:    vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %a0
  %2 = bitcast <8 x i16> %1 to <8 x half>
  %3 = fpext <8 x half> %2 to <8 x float>
  ret <8 x float> %3
}
715define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) {
716; AVX1-LABEL: load_cvt_16i16_to_16f32:
717; AVX1:       # BB#0:
718; AVX1-NEXT:    movswl 22(%rdi), %eax
719; AVX1-NEXT:    vmovd %eax, %xmm0
720; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm8
721; AVX1-NEXT:    movswl 20(%rdi), %eax
722; AVX1-NEXT:    vmovd %eax, %xmm0
723; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm9
724; AVX1-NEXT:    movswl 16(%rdi), %eax
725; AVX1-NEXT:    vmovd %eax, %xmm0
726; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm10
727; AVX1-NEXT:    movswl 18(%rdi), %eax
728; AVX1-NEXT:    vmovd %eax, %xmm0
729; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm11
730; AVX1-NEXT:    movswl 30(%rdi), %eax
731; AVX1-NEXT:    vmovd %eax, %xmm0
732; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm12
733; AVX1-NEXT:    movswl 28(%rdi), %eax
734; AVX1-NEXT:    vmovd %eax, %xmm0
735; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm13
736; AVX1-NEXT:    movswl 24(%rdi), %eax
737; AVX1-NEXT:    vmovd %eax, %xmm0
738; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm14
739; AVX1-NEXT:    movswl 26(%rdi), %eax
740; AVX1-NEXT:    vmovd %eax, %xmm0
741; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm15
742; AVX1-NEXT:    movswl 6(%rdi), %eax
743; AVX1-NEXT:    vmovd %eax, %xmm0
744; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
745; AVX1-NEXT:    movswl 4(%rdi), %eax
746; AVX1-NEXT:    vmovd %eax, %xmm2
747; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
748; AVX1-NEXT:    movswl (%rdi), %eax
749; AVX1-NEXT:    vmovd %eax, %xmm3
750; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
751; AVX1-NEXT:    movswl 2(%rdi), %eax
752; AVX1-NEXT:    vmovd %eax, %xmm4
753; AVX1-NEXT:    vcvtph2ps %xmm4, %xmm4
754; AVX1-NEXT:    movswl 14(%rdi), %eax
755; AVX1-NEXT:    vmovd %eax, %xmm5
756; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
757; AVX1-NEXT:    movswl 12(%rdi), %eax
758; AVX1-NEXT:    vmovd %eax, %xmm6
759; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
760; AVX1-NEXT:    movswl 8(%rdi), %eax
761; AVX1-NEXT:    vmovd %eax, %xmm7
762; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
763; AVX1-NEXT:    movswl 10(%rdi), %eax
764; AVX1-NEXT:    vmovd %eax, %xmm1
765; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
766; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3]
767; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
768; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
769; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
770; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
771; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
772; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
773; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
774; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
775; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
776; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
777; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
778; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
779; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
780; AVX1-NEXT:    retq
781;
782; AVX2-LABEL: load_cvt_16i16_to_16f32:
783; AVX2:       # BB#0:
784; AVX2-NEXT:    movswl 22(%rdi), %eax
785; AVX2-NEXT:    vmovd %eax, %xmm0
786; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm8
787; AVX2-NEXT:    movswl 20(%rdi), %eax
788; AVX2-NEXT:    vmovd %eax, %xmm0
789; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm9
790; AVX2-NEXT:    movswl 16(%rdi), %eax
791; AVX2-NEXT:    vmovd %eax, %xmm0
792; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm10
793; AVX2-NEXT:    movswl 18(%rdi), %eax
794; AVX2-NEXT:    vmovd %eax, %xmm0
795; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm11
796; AVX2-NEXT:    movswl 30(%rdi), %eax
797; AVX2-NEXT:    vmovd %eax, %xmm0
798; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm12
799; AVX2-NEXT:    movswl 28(%rdi), %eax
800; AVX2-NEXT:    vmovd %eax, %xmm0
801; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm13
802; AVX2-NEXT:    movswl 24(%rdi), %eax
803; AVX2-NEXT:    vmovd %eax, %xmm0
804; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm14
805; AVX2-NEXT:    movswl 26(%rdi), %eax
806; AVX2-NEXT:    vmovd %eax, %xmm0
807; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm15
808; AVX2-NEXT:    movswl 6(%rdi), %eax
809; AVX2-NEXT:    vmovd %eax, %xmm0
810; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
811; AVX2-NEXT:    movswl 4(%rdi), %eax
812; AVX2-NEXT:    vmovd %eax, %xmm2
813; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
814; AVX2-NEXT:    movswl (%rdi), %eax
815; AVX2-NEXT:    vmovd %eax, %xmm3
816; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
817; AVX2-NEXT:    movswl 2(%rdi), %eax
818; AVX2-NEXT:    vmovd %eax, %xmm4
819; AVX2-NEXT:    vcvtph2ps %xmm4, %xmm4
820; AVX2-NEXT:    movswl 14(%rdi), %eax
821; AVX2-NEXT:    vmovd %eax, %xmm5
822; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
823; AVX2-NEXT:    movswl 12(%rdi), %eax
824; AVX2-NEXT:    vmovd %eax, %xmm6
825; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
826; AVX2-NEXT:    movswl 8(%rdi), %eax
827; AVX2-NEXT:    vmovd %eax, %xmm7
828; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
829; AVX2-NEXT:    movswl 10(%rdi), %eax
830; AVX2-NEXT:    vmovd %eax, %xmm1
831; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
832; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3]
833; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
834; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
835; AVX2-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
836; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
837; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
838; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
839; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
840; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
841; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
842; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
843; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
844; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
845; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
846; AVX2-NEXT:    retq
847;
848; AVX512-LABEL: load_cvt_16i16_to_16f32:
849; AVX512:       # BB#0:
850; AVX512-NEXT:    movswl 6(%rdi), %eax
851; AVX512-NEXT:    vmovd %eax, %xmm0
852; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm8
853; AVX512-NEXT:    movswl 4(%rdi), %eax
854; AVX512-NEXT:    vmovd %eax, %xmm0
855; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm9
856; AVX512-NEXT:    movswl (%rdi), %eax
857; AVX512-NEXT:    vmovd %eax, %xmm0
858; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm10
859; AVX512-NEXT:    movswl 2(%rdi), %eax
860; AVX512-NEXT:    vmovd %eax, %xmm0
861; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm11
862; AVX512-NEXT:    movswl 14(%rdi), %eax
863; AVX512-NEXT:    vmovd %eax, %xmm0
864; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm12
865; AVX512-NEXT:    movswl 12(%rdi), %eax
866; AVX512-NEXT:    vmovd %eax, %xmm0
867; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm13
868; AVX512-NEXT:    movswl 8(%rdi), %eax
869; AVX512-NEXT:    vmovd %eax, %xmm0
870; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm14
871; AVX512-NEXT:    movswl 10(%rdi), %eax
872; AVX512-NEXT:    vmovd %eax, %xmm0
873; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm15
874; AVX512-NEXT:    movswl 22(%rdi), %eax
875; AVX512-NEXT:    vmovd %eax, %xmm0
876; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
877; AVX512-NEXT:    movswl 20(%rdi), %eax
878; AVX512-NEXT:    vmovd %eax, %xmm1
879; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
880; AVX512-NEXT:    movswl 16(%rdi), %eax
881; AVX512-NEXT:    vmovd %eax, %xmm2
882; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
883; AVX512-NEXT:    movswl 18(%rdi), %eax
884; AVX512-NEXT:    vmovd %eax, %xmm3
885; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
886; AVX512-NEXT:    movswl 30(%rdi), %eax
887; AVX512-NEXT:    vmovd %eax, %xmm4
888; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm4
889; AVX512-NEXT:    movswl 28(%rdi), %eax
890; AVX512-NEXT:    vmovd %eax, %xmm5
891; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
892; AVX512-NEXT:    movswl 24(%rdi), %eax
893; AVX512-NEXT:    vmovd %eax, %xmm6
894; AVX512-NEXT:    vcvtph2ps %xmm6, %xmm6
895; AVX512-NEXT:    movswl 26(%rdi), %eax
896; AVX512-NEXT:    vmovd %eax, %xmm7
897; AVX512-NEXT:    vcvtph2ps %xmm7, %xmm7
898; AVX512-NEXT:    vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
899; AVX512-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
900; AVX512-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
901; AVX512-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
902; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
903; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
904; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm0
905; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
906; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
907; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
908; AVX512-NEXT:    vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
909; AVX512-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
910; AVX512-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
911; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
912; AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
913; AVX512-NEXT:    retq
914  %1 = load <16 x i16>, <16 x i16>* %a0
915  %2 = bitcast <16 x i16> %1 to <16 x half>
916  %3 = fpext <16 x half> %2 to <16 x float>
917  ret <16 x float> %3
918}
919
920;
921; Half to Double
922;
923
; Scalar half -> double: the i16 argument is reinterpreted as a half (bitcast)
; and widened to double with fpext. The expected output is shared by all three
; RUN configurations (ALL prefix): %di is sign-extended into %eax, moved into
; %xmm0, converted with vcvtph2ps and then widened with vcvtss2sd.
924define double @cvt_i16_to_f64(i16 %a0) {
925; ALL-LABEL: cvt_i16_to_f64:
926; ALL:       # BB#0:
927; ALL-NEXT:    movswl %di, %eax
928; ALL-NEXT:    vmovd %eax, %xmm0
929; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
930; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
931; ALL-NEXT:    retq
932  %1 = bitcast i16 %a0 to half
933  %2 = fpext half %1 to double
934  ret double %2
935}
936
; <2 x half> -> <2 x double>, with the source passed as a <2 x i16> vector
; argument. The expected output (shared ALL prefix) is scalarized: the two
; shuffles bring the low word of each 64-bit element into the bottom 32 bits,
; vmovd copies them to %eax, each 16-bit value is sign-extended and converted
; on its own with vcvtph2ps + vcvtss2sd, and vunpcklpd rebuilds the vector.
937define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) {
938; ALL-LABEL: cvt_2i16_to_2f64:
939; ALL:       # BB#0:
940; ALL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
941; ALL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
942; ALL-NEXT:    vmovd %xmm0, %eax
943; ALL-NEXT:    movswl %ax, %ecx
944; ALL-NEXT:    shrl $16, %eax
945; ALL-NEXT:    cwtl
946; ALL-NEXT:    vmovd %eax, %xmm0
947; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
948; ALL-NEXT:    vmovd %ecx, %xmm1
949; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
950; ALL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
951; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
952; ALL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
953; ALL-NEXT:    retq
954  %1 = bitcast <2 x i16> %a0 to <2 x half>
955  %2 = fpext <2 x half> %1 to <2 x double>
956  ret <2 x double> %2
957}
958
; <4 x half> -> <4 x double>, with the source passed as a <4 x i16> vector
; argument. The expected output (shared ALL prefix) is scalarized: vpshufb
; gathers the low word of each 32-bit element into the low 64 bits, vmovq
; copies them to %rax, and the four 16-bit values are peeled off with shifts
; and sign-extensions. Each value goes through vcvtph2ps + vcvtss2sd; the
; results are paired with vunpcklpd and joined into %ymm0 with vinsertf128.
959define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) {
960; ALL-LABEL: cvt_4i16_to_4f64:
961; ALL:       # BB#0:
962; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
963; ALL-NEXT:    vmovq %xmm0, %rax
964; ALL-NEXT:    movq %rax, %rcx
965; ALL-NEXT:    movl %eax, %edx
966; ALL-NEXT:    movswl %ax, %esi
967; ALL-NEXT:    shrq $48, %rax
968; ALL-NEXT:    shrq $32, %rcx
969; ALL-NEXT:    shrl $16, %edx
970; ALL-NEXT:    movswl %dx, %edx
971; ALL-NEXT:    vmovd %edx, %xmm0
972; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
973; ALL-NEXT:    vmovd %esi, %xmm1
974; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
975; ALL-NEXT:    movswl %cx, %ecx
976; ALL-NEXT:    vmovd %ecx, %xmm2
977; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
978; ALL-NEXT:    cwtl
979; ALL-NEXT:    vmovd %eax, %xmm3
980; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
981; ALL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
982; ALL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
983; ALL-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
984; ALL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
985; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
986; ALL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
987; ALL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
988; ALL-NEXT:    retq
989  %1 = bitcast <4 x i16> %a0 to <4 x half>
990  %2 = fpext <4 x half> %1 to <4 x double>
991  ret <4 x double> %2
992}
993
; Converts only elements 0 and 1 of an <8 x i16> argument: a shufflevector
; extracts them, they are bitcast to <2 x half> and extended to <2 x double>.
; In the expected output (shared ALL prefix) no vector shuffle is emitted for
; the extraction: vmovd reads the low 32 bits of %xmm0 directly, and the two
; 16-bit values are then converted one at a time and merged with vunpcklpd.
994define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) {
995; ALL-LABEL: cvt_8i16_to_2f64:
996; ALL:       # BB#0:
997; ALL-NEXT:    vmovd %xmm0, %eax
998; ALL-NEXT:    movswl %ax, %ecx
999; ALL-NEXT:    shrl $16, %eax
1000; ALL-NEXT:    cwtl
1001; ALL-NEXT:    vmovd %eax, %xmm0
1002; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
1003; ALL-NEXT:    vmovd %ecx, %xmm1
1004; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
1005; ALL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1006; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1007; ALL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1008; ALL-NEXT:    retq
1009  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
1010  %2 = bitcast <2 x i16> %1 to <2 x half>
1011  %3 = fpext <2 x half> %2 to <2 x double>
1012  ret <2 x double> %3
1013}
1014
; Converts only elements 0-3 of an <8 x i16> argument: a shufflevector
; extracts them, they are bitcast to <4 x half> and extended to <4 x double>.
; In the expected output (shared ALL prefix) vmovq reads the low 64 bits of
; %xmm0 directly, the four 16-bit values are separated with shifts and
; sign-extensions, converted one at a time (vcvtph2ps + vcvtss2sd), and
; reassembled into %ymm0 with vunpcklpd and vinsertf128.
1015define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) {
1016; ALL-LABEL: cvt_8i16_to_4f64:
1017; ALL:       # BB#0:
1018; ALL-NEXT:    vmovq %xmm0, %rax
1019; ALL-NEXT:    movq %rax, %rcx
1020; ALL-NEXT:    movl %eax, %edx
1021; ALL-NEXT:    movswl %ax, %esi
1022; ALL-NEXT:    shrq $48, %rax
1023; ALL-NEXT:    shrq $32, %rcx
1024; ALL-NEXT:    shrl $16, %edx
1025; ALL-NEXT:    movswl %dx, %edx
1026; ALL-NEXT:    vmovd %edx, %xmm0
1027; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
1028; ALL-NEXT:    vmovd %esi, %xmm1
1029; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
1030; ALL-NEXT:    movswl %cx, %ecx
1031; ALL-NEXT:    vmovd %ecx, %xmm2
1032; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
1033; ALL-NEXT:    cwtl
1034; ALL-NEXT:    vmovd %eax, %xmm3
1035; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
1036; ALL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1037; ALL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1038; ALL-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1039; ALL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1040; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1041; ALL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1042; ALL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1043; ALL-NEXT:    retq
1044  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1045  %2 = bitcast <4 x i16> %1 to <4 x half>
1046  %3 = fpext <4 x half> %2 to <4 x double>
1047  ret <4 x double> %3
1048}
1049
; <8 x half> -> <8 x double>, with the source passed as an <8 x i16> vector
; argument. The expectations are split per configuration:
;  - AVX1 and AVX2 check the same instruction sequence as each other. Both
;    64-bit halves of %xmm0 are moved to GPRs (vmovq / vpextrq), the eight
;    16-bit values are separated with shifts, each is converted with
;    vcvtph2ps + vcvtss2sd, and the result is left in %ymm0 and %ymm1.
;  - AVX512 uses the same scalarized conversion with a different register
;    assignment and finishes by joining the two 256-bit halves into %zmm0
;    with vinsertf64x4.
1050define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) {
1051; AVX1-LABEL: cvt_8i16_to_8f64:
1052; AVX1:       # BB#0:
1053; AVX1-NEXT:    vmovq %xmm0, %rdx
1054; AVX1-NEXT:    movq %rdx, %r9
1055; AVX1-NEXT:    movl %edx, %r10d
1056; AVX1-NEXT:    movswl %dx, %r8d
1057; AVX1-NEXT:    shrq $48, %rdx
1058; AVX1-NEXT:    shrq $32, %r9
1059; AVX1-NEXT:    shrl $16, %r10d
1060; AVX1-NEXT:    vpextrq $1, %xmm0, %rdi
1061; AVX1-NEXT:    movq %rdi, %rsi
1062; AVX1-NEXT:    movl %edi, %eax
1063; AVX1-NEXT:    movswl %di, %ecx
1064; AVX1-NEXT:    shrq $48, %rdi
1065; AVX1-NEXT:    shrq $32, %rsi
1066; AVX1-NEXT:    shrl $16, %eax
1067; AVX1-NEXT:    cwtl
1068; AVX1-NEXT:    vmovd %eax, %xmm0
1069; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm1
1070; AVX1-NEXT:    vmovd %ecx, %xmm0
1071; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm2
1072; AVX1-NEXT:    movswl %si, %eax
1073; AVX1-NEXT:    vmovd %eax, %xmm0
1074; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm3
1075; AVX1-NEXT:    movswl %di, %eax
1076; AVX1-NEXT:    vmovd %eax, %xmm0
1077; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm4
1078; AVX1-NEXT:    movswl %r10w, %eax
1079; AVX1-NEXT:    vmovd %eax, %xmm0
1080; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
1081; AVX1-NEXT:    vmovd %r8d, %xmm5
1082; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
1083; AVX1-NEXT:    movswl %r9w, %eax
1084; AVX1-NEXT:    vmovd %eax, %xmm6
1085; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
1086; AVX1-NEXT:    movswl %dx, %eax
1087; AVX1-NEXT:    vmovd %eax, %xmm7
1088; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
1089; AVX1-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
1090; AVX1-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
1091; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1092; AVX1-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
1093; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1094; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm5[0],xmm0[0]
1095; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
1096; AVX1-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
1097; AVX1-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1098; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
1099; AVX1-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1100; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1101; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
1102; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
1103; AVX1-NEXT:    retq
1104;
1105; AVX2-LABEL: cvt_8i16_to_8f64:
1106; AVX2:       # BB#0:
1107; AVX2-NEXT:    vmovq %xmm0, %rdx
1108; AVX2-NEXT:    movq %rdx, %r9
1109; AVX2-NEXT:    movl %edx, %r10d
1110; AVX2-NEXT:    movswl %dx, %r8d
1111; AVX2-NEXT:    shrq $48, %rdx
1112; AVX2-NEXT:    shrq $32, %r9
1113; AVX2-NEXT:    shrl $16, %r10d
1114; AVX2-NEXT:    vpextrq $1, %xmm0, %rdi
1115; AVX2-NEXT:    movq %rdi, %rsi
1116; AVX2-NEXT:    movl %edi, %eax
1117; AVX2-NEXT:    movswl %di, %ecx
1118; AVX2-NEXT:    shrq $48, %rdi
1119; AVX2-NEXT:    shrq $32, %rsi
1120; AVX2-NEXT:    shrl $16, %eax
1121; AVX2-NEXT:    cwtl
1122; AVX2-NEXT:    vmovd %eax, %xmm0
1123; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm1
1124; AVX2-NEXT:    vmovd %ecx, %xmm0
1125; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm2
1126; AVX2-NEXT:    movswl %si, %eax
1127; AVX2-NEXT:    vmovd %eax, %xmm0
1128; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm3
1129; AVX2-NEXT:    movswl %di, %eax
1130; AVX2-NEXT:    vmovd %eax, %xmm0
1131; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm4
1132; AVX2-NEXT:    movswl %r10w, %eax
1133; AVX2-NEXT:    vmovd %eax, %xmm0
1134; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
1135; AVX2-NEXT:    vmovd %r8d, %xmm5
1136; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
1137; AVX2-NEXT:    movswl %r9w, %eax
1138; AVX2-NEXT:    vmovd %eax, %xmm6
1139; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
1140; AVX2-NEXT:    movswl %dx, %eax
1141; AVX2-NEXT:    vmovd %eax, %xmm7
1142; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
1143; AVX2-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
1144; AVX2-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
1145; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1146; AVX2-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
1147; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1148; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm5[0],xmm0[0]
1149; AVX2-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
1150; AVX2-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
1151; AVX2-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1152; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
1153; AVX2-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1154; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1155; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
1156; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
1157; AVX2-NEXT:    retq
1158;
1159; AVX512-LABEL: cvt_8i16_to_8f64:
1160; AVX512:       # BB#0:
1161; AVX512-NEXT:    vpextrq $1, %xmm0, %rdx
1162; AVX512-NEXT:    movq %rdx, %r8
1163; AVX512-NEXT:    movl %edx, %r10d
1164; AVX512-NEXT:    movswl %dx, %r9d
1165; AVX512-NEXT:    shrq $48, %rdx
1166; AVX512-NEXT:    shrq $32, %r8
1167; AVX512-NEXT:    shrl $16, %r10d
1168; AVX512-NEXT:    vmovq %xmm0, %rdi
1169; AVX512-NEXT:    movq %rdi, %rax
1170; AVX512-NEXT:    movl %edi, %esi
1171; AVX512-NEXT:    movswl %di, %ecx
1172; AVX512-NEXT:    shrq $48, %rdi
1173; AVX512-NEXT:    shrq $32, %rax
1174; AVX512-NEXT:    shrl $16, %esi
1175; AVX512-NEXT:    movswl %si, %esi
1176; AVX512-NEXT:    vmovd %esi, %xmm0
1177; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
1178; AVX512-NEXT:    vmovd %ecx, %xmm1
1179; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
1180; AVX512-NEXT:    cwtl
1181; AVX512-NEXT:    vmovd %eax, %xmm2
1182; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
1183; AVX512-NEXT:    movswl %di, %eax
1184; AVX512-NEXT:    vmovd %eax, %xmm3
1185; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
1186; AVX512-NEXT:    movswl %r10w, %eax
1187; AVX512-NEXT:    vmovd %eax, %xmm4
1188; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm4
1189; AVX512-NEXT:    vmovd %r9d, %xmm5
1190; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
1191; AVX512-NEXT:    movswl %r8w, %eax
1192; AVX512-NEXT:    vmovd %eax, %xmm6
1193; AVX512-NEXT:    vcvtph2ps %xmm6, %xmm6
1194; AVX512-NEXT:    movswl %dx, %eax
1195; AVX512-NEXT:    vmovd %eax, %xmm7
1196; AVX512-NEXT:    vcvtph2ps %xmm7, %xmm7
1197; AVX512-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
1198; AVX512-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
1199; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1200; AVX512-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
1201; AVX512-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
1202; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm4 = xmm5[0],xmm4[0]
1203; AVX512-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm4
1204; AVX512-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1205; AVX512-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1206; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1207; AVX512-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1208; AVX512-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1209; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1210; AVX512-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1211; AVX512-NEXT:    vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
1212; AVX512-NEXT:    retq
1213  %1 = bitcast <8 x i16> %a0 to <8 x half>
1214  %2 = fpext <8 x half> %1 to <8 x double>
1215  ret <8 x double> %2
1216}
1217
1218;
1219; Half to Double (Load)
1220;
1221
; Scalar half -> double where the half is loaded from memory as an i16.
; The expected output (shared ALL prefix) folds the load into the
; sign-extending movswl from (%rdi), then converts with vcvtph2ps followed by
; vcvtss2sd.
1222define double @load_cvt_i16_to_f64(i16* %a0) {
1223; ALL-LABEL: load_cvt_i16_to_f64:
1224; ALL:       # BB#0:
1225; ALL-NEXT:    movswl (%rdi), %eax
1226; ALL-NEXT:    vmovd %eax, %xmm0
1227; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
1228; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1229; ALL-NEXT:    retq
1230  %1 = load i16, i16* %a0
1231  %2 = bitcast i16 %1 to half
1232  %3 = fpext half %2 to double
1233  ret double %3
1234}
1235
; <2 x half> -> <2 x double> where the source is loaded as <2 x i16>.
; The expected output (shared ALL prefix) performs no vector load: each
; element is read separately with movswl at byte offsets 0 and 2, converted
; with vcvtph2ps + vcvtss2sd, and the two doubles are merged with vunpcklpd.
1236define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) {
1237; ALL-LABEL: load_cvt_2i16_to_2f64:
1238; ALL:       # BB#0:
1239; ALL-NEXT:    movswl (%rdi), %eax
1240; ALL-NEXT:    vmovd %eax, %xmm0
1241; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
1242; ALL-NEXT:    movswl 2(%rdi), %eax
1243; ALL-NEXT:    vmovd %eax, %xmm1
1244; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
1245; ALL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1246; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1247; ALL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1248; ALL-NEXT:    retq
1249  %1 = load <2 x i16>, <2 x i16>* %a0
1250  %2 = bitcast <2 x i16> %1 to <2 x half>
1251  %3 = fpext <2 x half> %2 to <2 x double>
1252  ret <2 x double> %3
1253}
1254
; <4 x half> -> <4 x double> where the source is loaded as <4 x i16>.
; The expected output (shared ALL prefix) reads each element separately with
; movswl at byte offsets 0, 2, 4 and 6, converts each with vcvtph2ps +
; vcvtss2sd, pairs the doubles with vunpcklpd and joins the two 128-bit
; halves into %ymm0 with vinsertf128.
1255define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) {
1256; ALL-LABEL: load_cvt_4i16_to_4f64:
1257; ALL:       # BB#0:
1258; ALL-NEXT:    movswl (%rdi), %eax
1259; ALL-NEXT:    vmovd %eax, %xmm0
1260; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
1261; ALL-NEXT:    movswl 2(%rdi), %eax
1262; ALL-NEXT:    vmovd %eax, %xmm1
1263; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
1264; ALL-NEXT:    movswl 4(%rdi), %eax
1265; ALL-NEXT:    vmovd %eax, %xmm2
1266; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
1267; ALL-NEXT:    movswl 6(%rdi), %eax
1268; ALL-NEXT:    vmovd %eax, %xmm3
1269; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
1270; ALL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1271; ALL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1272; ALL-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1273; ALL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1274; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1275; ALL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1276; ALL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1277; ALL-NEXT:    retq
1278  %1 = load <4 x i16>, <4 x i16>* %a0
1279  %2 = bitcast <4 x i16> %1 to <4 x half>
1280  %3 = fpext <4 x half> %2 to <4 x double>
1281  ret <4 x double> %3
1282}
1283
; Loads an <8 x i16> but converts only elements 0-3 (selected by the
; shufflevector) from half to double. In the expected output (shared ALL
; prefix) only the low 64 bits are read, with a single movq from (%rdi); the
; four 16-bit values are then separated in GPRs with shifts and
; sign-extensions, converted one at a time (vcvtph2ps + vcvtss2sd), and
; reassembled into %ymm0 with vunpcklpd and vinsertf128.
1284define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) {
1285; ALL-LABEL: load_cvt_8i16_to_4f64:
1286; ALL:       # BB#0:
1287; ALL-NEXT:    movq (%rdi), %rax
1288; ALL-NEXT:    movq %rax, %rcx
1289; ALL-NEXT:    movl %eax, %edx
1290; ALL-NEXT:    movswl %ax, %esi
1291; ALL-NEXT:    shrq $48, %rax
1292; ALL-NEXT:    shrq $32, %rcx
1293; ALL-NEXT:    shrl $16, %edx
1294; ALL-NEXT:    movswl %dx, %edx
1295; ALL-NEXT:    vmovd %edx, %xmm0
1296; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
1297; ALL-NEXT:    vmovd %esi, %xmm1
1298; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
1299; ALL-NEXT:    movswl %cx, %ecx
1300; ALL-NEXT:    vmovd %ecx, %xmm2
1301; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
1302; ALL-NEXT:    cwtl
1303; ALL-NEXT:    vmovd %eax, %xmm3
1304; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
1305; ALL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1306; ALL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1307; ALL-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1308; ALL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1309; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1310; ALL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1311; ALL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1312; ALL-NEXT:    retq
1313  %1 = load <8 x i16>, <8 x i16>* %a0
1314  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1315  %3 = bitcast <4 x i16> %2 to <4 x half>
1316  %4 = fpext <4 x half> %3 to <4 x double>
1317  ret <4 x double> %4
1318}
1319
; <8 x half> -> <8 x double> where the source is loaded as <8 x i16>.
; Every configuration reads the eight elements one at a time with movswl at
; byte offsets 0 through 14 and converts each with vcvtph2ps + vcvtss2sd.
; The expectations are split per configuration:
;  - AVX1 and AVX2 check the same instruction sequence as each other and
;    leave the result in %ymm0 and %ymm1.
;  - AVX512 loads the elements in ascending offset order and finishes by
;    joining the two 256-bit halves into %zmm0 with vinsertf64x4.
1320define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) {
1321; AVX1-LABEL: load_cvt_8i16_to_8f64:
1322; AVX1:       # BB#0:
1323; AVX1-NEXT:    movswl 8(%rdi), %eax
1324; AVX1-NEXT:    vmovd %eax, %xmm0
1325; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm1
1326; AVX1-NEXT:    movswl 10(%rdi), %eax
1327; AVX1-NEXT:    vmovd %eax, %xmm0
1328; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm2
1329; AVX1-NEXT:    movswl 12(%rdi), %eax
1330; AVX1-NEXT:    vmovd %eax, %xmm0
1331; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm3
1332; AVX1-NEXT:    movswl 14(%rdi), %eax
1333; AVX1-NEXT:    vmovd %eax, %xmm0
1334; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm4
1335; AVX1-NEXT:    movswl (%rdi), %eax
1336; AVX1-NEXT:    vmovd %eax, %xmm0
1337; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
1338; AVX1-NEXT:    movswl 2(%rdi), %eax
1339; AVX1-NEXT:    vmovd %eax, %xmm5
1340; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
1341; AVX1-NEXT:    movswl 4(%rdi), %eax
1342; AVX1-NEXT:    vmovd %eax, %xmm6
1343; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
1344; AVX1-NEXT:    movswl 6(%rdi), %eax
1345; AVX1-NEXT:    vmovd %eax, %xmm7
1346; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
1347; AVX1-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
1348; AVX1-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
1349; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1350; AVX1-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
1351; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1352; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
1353; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
1354; AVX1-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
1355; AVX1-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1356; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
1357; AVX1-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1358; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1359; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1360; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
1361; AVX1-NEXT:    retq
1362;
1363; AVX2-LABEL: load_cvt_8i16_to_8f64:
1364; AVX2:       # BB#0:
1365; AVX2-NEXT:    movswl 8(%rdi), %eax
1366; AVX2-NEXT:    vmovd %eax, %xmm0
1367; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm1
1368; AVX2-NEXT:    movswl 10(%rdi), %eax
1369; AVX2-NEXT:    vmovd %eax, %xmm0
1370; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm2
1371; AVX2-NEXT:    movswl 12(%rdi), %eax
1372; AVX2-NEXT:    vmovd %eax, %xmm0
1373; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm3
1374; AVX2-NEXT:    movswl 14(%rdi), %eax
1375; AVX2-NEXT:    vmovd %eax, %xmm0
1376; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm4
1377; AVX2-NEXT:    movswl (%rdi), %eax
1378; AVX2-NEXT:    vmovd %eax, %xmm0
1379; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
1380; AVX2-NEXT:    movswl 2(%rdi), %eax
1381; AVX2-NEXT:    vmovd %eax, %xmm5
1382; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
1383; AVX2-NEXT:    movswl 4(%rdi), %eax
1384; AVX2-NEXT:    vmovd %eax, %xmm6
1385; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
1386; AVX2-NEXT:    movswl 6(%rdi), %eax
1387; AVX2-NEXT:    vmovd %eax, %xmm7
1388; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
1389; AVX2-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
1390; AVX2-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
1391; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1392; AVX2-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
1393; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1394; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
1395; AVX2-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
1396; AVX2-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
1397; AVX2-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1398; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
1399; AVX2-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1400; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1401; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1402; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
1403; AVX2-NEXT:    retq
1404;
1405; AVX512-LABEL: load_cvt_8i16_to_8f64:
1406; AVX512:       # BB#0:
1407; AVX512-NEXT:    movswl (%rdi), %eax
1408; AVX512-NEXT:    vmovd %eax, %xmm0
1409; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
1410; AVX512-NEXT:    movswl 2(%rdi), %eax
1411; AVX512-NEXT:    vmovd %eax, %xmm1
1412; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
1413; AVX512-NEXT:    movswl 4(%rdi), %eax
1414; AVX512-NEXT:    vmovd %eax, %xmm2
1415; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
1416; AVX512-NEXT:    movswl 6(%rdi), %eax
1417; AVX512-NEXT:    vmovd %eax, %xmm3
1418; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
1419; AVX512-NEXT:    movswl 8(%rdi), %eax
1420; AVX512-NEXT:    vmovd %eax, %xmm4
1421; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm4
1422; AVX512-NEXT:    movswl 10(%rdi), %eax
1423; AVX512-NEXT:    vmovd %eax, %xmm5
1424; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
1425; AVX512-NEXT:    movswl 12(%rdi), %eax
1426; AVX512-NEXT:    vmovd %eax, %xmm6
1427; AVX512-NEXT:    vcvtph2ps %xmm6, %xmm6
1428; AVX512-NEXT:    movswl 14(%rdi), %eax
1429; AVX512-NEXT:    vmovd %eax, %xmm7
1430; AVX512-NEXT:    vcvtph2ps %xmm7, %xmm7
1431; AVX512-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
1432; AVX512-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
1433; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1434; AVX512-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
1435; AVX512-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
1436; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm5[0]
1437; AVX512-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm4
1438; AVX512-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1439; AVX512-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1440; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1441; AVX512-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1442; AVX512-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1443; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1444; AVX512-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1445; AVX512-NEXT:    vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
1446; AVX512-NEXT:    retq
1447  %1 = load <8 x i16>, <8 x i16>* %a0
1448  %2 = bitcast <8 x i16> %1 to <8 x half>
1449  %3 = fpext <8 x half> %2 to <8 x double>
1450  ret <8 x double> %3
1451}
1452
1453;
1454; Float to Half
1455;
1456
; Scalar float -> half: the float argument is truncated with fptrunc and the
; half result is returned as its i16 bit pattern (bitcast). The expected
; output (shared ALL prefix) is a single vcvtps2ph followed by a vmovd into
; %eax. The checks also pin llc's "# kill" register annotation comment, so
; this block should be regenerated with utils/update_llc_test_checks.py
; rather than edited by hand.
; NOTE(review): the $4 immediate presumably selects the MXCSR rounding mode
; per the Intel SDM - confirm before relying on it.
1457define i16 @cvt_f32_to_i16(float %a0) {
1458; ALL-LABEL: cvt_f32_to_i16:
1459; ALL:       # BB#0:
1460; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1461; ALL-NEXT:    vmovd %xmm0, %eax
1462; ALL-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
1463; ALL-NEXT:    retq
1464  %1 = fptrunc float %a0 to half
1465  %2 = bitcast half %1 to i16
1466  ret i16 %2
1467}
1468
; <4 x float> -> <4 x half>, returned as <4 x i16> bit patterns.
; The expected output (shared ALL prefix) is scalarized: each of the four
; lanes is brought to element 0 (vmovshdup / vpermilps / vpermilpd, or used
; in place for lane 0), converted with its own vcvtps2ph and moved to a GPR.
; The four 16-bit results are merged with shifts and ORs into %rdx, which is
; moved back into %xmm0 with vmovq.
1469define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) {
1470; ALL-LABEL: cvt_4f32_to_4i16:
1471; ALL:       # BB#0:
1472; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1473; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1474; ALL-NEXT:    vmovd %xmm1, %eax
1475; ALL-NEXT:    shll $16, %eax
1476; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
1477; ALL-NEXT:    vmovd %xmm1, %ecx
1478; ALL-NEXT:    movzwl %cx, %ecx
1479; ALL-NEXT:    orl %eax, %ecx
1480; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1481; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1482; ALL-NEXT:    vmovd %xmm1, %eax
1483; ALL-NEXT:    shll $16, %eax
1484; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1485; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1486; ALL-NEXT:    vmovd %xmm0, %edx
1487; ALL-NEXT:    movzwl %dx, %edx
1488; ALL-NEXT:    orl %eax, %edx
1489; ALL-NEXT:    shlq $32, %rdx
1490; ALL-NEXT:    orq %rcx, %rdx
1491; ALL-NEXT:    vmovq %rdx, %xmm0
1492; ALL-NEXT:    retq
1493  %1 = fptrunc <4 x float> %a0 to <4 x half>
1494  %2 = bitcast <4 x half> %1 to <4 x i16>
1495  ret <4 x i16> %2
1496}
1497
; <4 x float> -> <4 x half>, then widened to <8 x i16> by a shufflevector
; whose indices 4-7 select from an undef second operand, so the upper four
; result elements are unspecified. The expected output (shared ALL prefix)
; uses the same per-lane vcvtps2ph / shift / OR sequence that builds a 64-bit
; value in %rdx, moves it into %xmm0 with vmovq, and then applies one
; vpshufb.
1498define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) {
1499; ALL-LABEL: cvt_4f32_to_8i16_undef:
1500; ALL:       # BB#0:
1501; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1502; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1503; ALL-NEXT:    vmovd %xmm1, %eax
1504; ALL-NEXT:    shll $16, %eax
1505; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
1506; ALL-NEXT:    vmovd %xmm1, %ecx
1507; ALL-NEXT:    movzwl %cx, %ecx
1508; ALL-NEXT:    orl %eax, %ecx
1509; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1510; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1511; ALL-NEXT:    vmovd %xmm1, %eax
1512; ALL-NEXT:    shll $16, %eax
1513; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1514; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1515; ALL-NEXT:    vmovd %xmm0, %edx
1516; ALL-NEXT:    movzwl %dx, %edx
1517; ALL-NEXT:    orl %eax, %edx
1518; ALL-NEXT:    shlq $32, %rdx
1519; ALL-NEXT:    orq %rcx, %rdx
1520; ALL-NEXT:    vmovq %rdx, %xmm0
1521; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1522; ALL-NEXT:    retq
1523  %1 = fptrunc <4 x float> %a0 to <4 x half>
1524  %2 = bitcast <4 x half> %1 to <4 x i16>
1525  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1526  ret <8 x i16> %3
1527}
1528
; <4 x float> -> <4 x half>, then widened to <8 x i16> by a shufflevector
; whose indices 4-7 select from a zeroinitializer second operand, so the
; upper four result elements must be zero. The expected output (shared ALL
; prefix) uses the same per-lane vcvtps2ph / shift / OR sequence that builds
; a 64-bit value in %rdx, moves it into %xmm0 with vmovq, and then applies a
; vpshufb whose mask writes zero into the upper eight bytes.
1529define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) {
1530; ALL-LABEL: cvt_4f32_to_8i16_zero:
1531; ALL:       # BB#0:
1532; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1533; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1534; ALL-NEXT:    vmovd %xmm1, %eax
1535; ALL-NEXT:    shll $16, %eax
1536; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
1537; ALL-NEXT:    vmovd %xmm1, %ecx
1538; ALL-NEXT:    movzwl %cx, %ecx
1539; ALL-NEXT:    orl %eax, %ecx
1540; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1541; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1542; ALL-NEXT:    vmovd %xmm1, %eax
1543; ALL-NEXT:    shll $16, %eax
1544; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1545; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1546; ALL-NEXT:    vmovd %xmm0, %edx
1547; ALL-NEXT:    movzwl %dx, %edx
1548; ALL-NEXT:    orl %eax, %edx
1549; ALL-NEXT:    shlq $32, %rdx
1550; ALL-NEXT:    orq %rcx, %rdx
1551; ALL-NEXT:    vmovq %rdx, %xmm0
1552; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
1553; ALL-NEXT:    retq
1554  %1 = fptrunc <4 x float> %a0 to <4 x half>
1555  %2 = bitcast <4 x half> %1 to <4 x i16>
1556  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1557  ret <8 x i16> %3
1558}
1559
; Truncates <8 x float> to <8 x half> and returns the bits as <8 x i16>.
; Expectations are kept separately for the AVX1, AVX2 and AVX512 prefixes. In
; each, every lane is converted by its own vcvtps2ph, the 16-bit results are
; merged in general purpose registers into two 64-bit halves, and vpunpcklqdq
; joins the halves. The AVX1 and AVX2 sequences end with vzeroupper before
; retq; the AVX512 sequence does not.
1560define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) {
1561; AVX1-LABEL: cvt_8f32_to_8i16:
1562; AVX1:       # BB#0:
1563; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1564; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1565; AVX1-NEXT:    vmovd %xmm1, %eax
1566; AVX1-NEXT:    shll $16, %eax
1567; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
1568; AVX1-NEXT:    vmovd %xmm1, %ecx
1569; AVX1-NEXT:    movzwl %cx, %ecx
1570; AVX1-NEXT:    orl %eax, %ecx
1571; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1572; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1573; AVX1-NEXT:    vmovd %xmm1, %edx
1574; AVX1-NEXT:    shll $16, %edx
1575; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1576; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1577; AVX1-NEXT:    vmovd %xmm1, %eax
1578; AVX1-NEXT:    movzwl %ax, %eax
1579; AVX1-NEXT:    orl %edx, %eax
1580; AVX1-NEXT:    shlq $32, %rax
1581; AVX1-NEXT:    orq %rcx, %rax
1582; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1583; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1584; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1585; AVX1-NEXT:    vmovd %xmm1, %ecx
1586; AVX1-NEXT:    shll $16, %ecx
1587; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
1588; AVX1-NEXT:    vmovd %xmm1, %edx
1589; AVX1-NEXT:    movzwl %dx, %edx
1590; AVX1-NEXT:    orl %ecx, %edx
1591; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1592; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1593; AVX1-NEXT:    vmovd %xmm1, %ecx
1594; AVX1-NEXT:    shll $16, %ecx
1595; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1596; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1597; AVX1-NEXT:    vmovd %xmm0, %esi
1598; AVX1-NEXT:    movzwl %si, %esi
1599; AVX1-NEXT:    orl %ecx, %esi
1600; AVX1-NEXT:    shlq $32, %rsi
1601; AVX1-NEXT:    orq %rdx, %rsi
1602; AVX1-NEXT:    vmovq %rsi, %xmm0
1603; AVX1-NEXT:    vmovq %rax, %xmm1
1604; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1605; AVX1-NEXT:    vzeroupper
1606; AVX1-NEXT:    retq
1607;
1608; AVX2-LABEL: cvt_8f32_to_8i16:
1609; AVX2:       # BB#0:
1610; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1611; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1612; AVX2-NEXT:    vmovd %xmm1, %eax
1613; AVX2-NEXT:    shll $16, %eax
1614; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
1615; AVX2-NEXT:    vmovd %xmm1, %ecx
1616; AVX2-NEXT:    movzwl %cx, %ecx
1617; AVX2-NEXT:    orl %eax, %ecx
1618; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1619; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1620; AVX2-NEXT:    vmovd %xmm1, %edx
1621; AVX2-NEXT:    shll $16, %edx
1622; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1623; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1624; AVX2-NEXT:    vmovd %xmm1, %eax
1625; AVX2-NEXT:    movzwl %ax, %eax
1626; AVX2-NEXT:    orl %edx, %eax
1627; AVX2-NEXT:    shlq $32, %rax
1628; AVX2-NEXT:    orq %rcx, %rax
1629; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
1630; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1631; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1632; AVX2-NEXT:    vmovd %xmm1, %ecx
1633; AVX2-NEXT:    shll $16, %ecx
1634; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
1635; AVX2-NEXT:    vmovd %xmm1, %edx
1636; AVX2-NEXT:    movzwl %dx, %edx
1637; AVX2-NEXT:    orl %ecx, %edx
1638; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1639; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1640; AVX2-NEXT:    vmovd %xmm1, %ecx
1641; AVX2-NEXT:    shll $16, %ecx
1642; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1643; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1644; AVX2-NEXT:    vmovd %xmm0, %esi
1645; AVX2-NEXT:    movzwl %si, %esi
1646; AVX2-NEXT:    orl %ecx, %esi
1647; AVX2-NEXT:    shlq $32, %rsi
1648; AVX2-NEXT:    orq %rdx, %rsi
1649; AVX2-NEXT:    vmovq %rsi, %xmm0
1650; AVX2-NEXT:    vmovq %rax, %xmm1
1651; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1652; AVX2-NEXT:    vzeroupper
1653; AVX2-NEXT:    retq
1654;
1655; AVX512-LABEL: cvt_8f32_to_8i16:
1656; AVX512:       # BB#0:
1657; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1658; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1659; AVX512-NEXT:    vmovd %xmm1, %eax
1660; AVX512-NEXT:    shll $16, %eax
1661; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
1662; AVX512-NEXT:    vmovd %xmm1, %ecx
1663; AVX512-NEXT:    movzwl %cx, %ecx
1664; AVX512-NEXT:    orl %eax, %ecx
1665; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1666; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1667; AVX512-NEXT:    vmovd %xmm1, %edx
1668; AVX512-NEXT:    shll $16, %edx
1669; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1670; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1671; AVX512-NEXT:    vmovd %xmm1, %eax
1672; AVX512-NEXT:    movzwl %ax, %eax
1673; AVX512-NEXT:    orl %edx, %eax
1674; AVX512-NEXT:    shlq $32, %rax
1675; AVX512-NEXT:    orq %rcx, %rax
1676; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
1677; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1678; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1679; AVX512-NEXT:    vmovd %xmm1, %ecx
1680; AVX512-NEXT:    shll $16, %ecx
1681; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
1682; AVX512-NEXT:    vmovd %xmm1, %edx
1683; AVX512-NEXT:    movzwl %dx, %edx
1684; AVX512-NEXT:    orl %ecx, %edx
1685; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1686; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1687; AVX512-NEXT:    vmovd %xmm1, %ecx
1688; AVX512-NEXT:    shll $16, %ecx
1689; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1690; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1691; AVX512-NEXT:    vmovd %xmm0, %esi
1692; AVX512-NEXT:    movzwl %si, %esi
1693; AVX512-NEXT:    orl %ecx, %esi
1694; AVX512-NEXT:    shlq $32, %rsi
1695; AVX512-NEXT:    orq %rdx, %rsi
1696; AVX512-NEXT:    vmovq %rsi, %xmm0
1697; AVX512-NEXT:    vmovq %rax, %xmm1
1698; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1699; AVX512-NEXT:    retq
1700  %1 = fptrunc <8 x float> %a0 to <8 x half>
1701  %2 = bitcast <8 x half> %1 to <8 x i16>
1702  ret <8 x i16> %2
1703}
1704
; Truncates <16 x float> to <16 x half> and returns the bits as <16 x i16>.
; In the AVX1 and AVX2 expectations the input is read from ymm0 and ymm1; in
; the AVX512 expectation it is read from zmm0, whose upper half is first
; extracted with vextractf64x4. Each lane is converted by its own vcvtps2ph and
; placed with vpinsrw, and the two 128-bit halves are joined by vinsertf128
; (AVX1) or vinserti128 (AVX2 and AVX512).
1705define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) {
1706; AVX1-LABEL: cvt_16f32_to_16i16:
1707; AVX1:       # BB#0:
1708; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm2
1709; AVX1-NEXT:    vmovd %xmm2, %eax
1710; AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
1711; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
1712; AVX1-NEXT:    vmovd %eax, %xmm3
1713; AVX1-NEXT:    vmovd %xmm2, %eax
1714; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1715; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
1716; AVX1-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
1717; AVX1-NEXT:    vmovd %xmm2, %eax
1718; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1719; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
1720; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1721; AVX1-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
1722; AVX1-NEXT:    vmovd %xmm1, %eax
1723; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm1
1724; AVX1-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
1725; AVX1-NEXT:    vmovd %xmm1, %eax
1726; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
1727; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1728; AVX1-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
1729; AVX1-NEXT:    vmovd %xmm1, %eax
1730; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
1731; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1732; AVX1-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
1733; AVX1-NEXT:    vmovd %xmm1, %eax
1734; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
1735; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
1736; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
1737; AVX1-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm3
1738; AVX1-NEXT:    vmovd %xmm2, %eax
1739; AVX1-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm2
1740; AVX1-NEXT:    vmovd %xmm1, %eax
1741; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1742; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1743; AVX1-NEXT:    vmovd %eax, %xmm3
1744; AVX1-NEXT:    vmovd %xmm1, %eax
1745; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1746; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1747; AVX1-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
1748; AVX1-NEXT:    vmovd %xmm1, %eax
1749; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1750; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
1751; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1752; AVX1-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
1753; AVX1-NEXT:    vmovd %xmm0, %eax
1754; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
1755; AVX1-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
1756; AVX1-NEXT:    vmovd %xmm0, %eax
1757; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
1758; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1759; AVX1-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
1760; AVX1-NEXT:    vmovd %xmm0, %eax
1761; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
1762; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1763; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1764; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1765; AVX1-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
1766; AVX1-NEXT:    vmovd %xmm1, %eax
1767; AVX1-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm1
1768; AVX1-NEXT:    vmovd %xmm0, %eax
1769; AVX1-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
1770; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1771; AVX1-NEXT:    retq
1772;
1773; AVX2-LABEL: cvt_16f32_to_16i16:
1774; AVX2:       # BB#0:
1775; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm2
1776; AVX2-NEXT:    vmovd %xmm2, %eax
1777; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
1778; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
1779; AVX2-NEXT:    vmovd %eax, %xmm3
1780; AVX2-NEXT:    vmovd %xmm2, %eax
1781; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1782; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
1783; AVX2-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
1784; AVX2-NEXT:    vmovd %xmm2, %eax
1785; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
1786; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
1787; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1788; AVX2-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
1789; AVX2-NEXT:    vmovd %xmm1, %eax
1790; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm1
1791; AVX2-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
1792; AVX2-NEXT:    vmovd %xmm1, %eax
1793; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
1794; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1795; AVX2-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
1796; AVX2-NEXT:    vmovd %xmm1, %eax
1797; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
1798; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1799; AVX2-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
1800; AVX2-NEXT:    vmovd %xmm1, %eax
1801; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
1802; AVX2-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
1803; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
1804; AVX2-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm3
1805; AVX2-NEXT:    vmovd %xmm2, %eax
1806; AVX2-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm2
1807; AVX2-NEXT:    vmovd %xmm1, %eax
1808; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1809; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1810; AVX2-NEXT:    vmovd %eax, %xmm3
1811; AVX2-NEXT:    vmovd %xmm1, %eax
1812; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1813; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1814; AVX2-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
1815; AVX2-NEXT:    vmovd %xmm1, %eax
1816; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
1817; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
1818; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1819; AVX2-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
1820; AVX2-NEXT:    vmovd %xmm0, %eax
1821; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
1822; AVX2-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
1823; AVX2-NEXT:    vmovd %xmm0, %eax
1824; AVX2-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
1825; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1826; AVX2-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
1827; AVX2-NEXT:    vmovd %xmm0, %eax
1828; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
1829; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1830; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1831; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1832; AVX2-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
1833; AVX2-NEXT:    vmovd %xmm1, %eax
1834; AVX2-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm1
1835; AVX2-NEXT:    vmovd %xmm0, %eax
1836; AVX2-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
1837; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
1838; AVX2-NEXT:    retq
1839;
1840; AVX512-LABEL: cvt_16f32_to_16i16:
1841; AVX512:       # BB#0:
1842; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
1843; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm2
1844; AVX512-NEXT:    vmovd %xmm2, %eax
1845; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
1846; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
1847; AVX512-NEXT:    vmovd %eax, %xmm3
1848; AVX512-NEXT:    vmovd %xmm2, %eax
1849; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1850; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
1851; AVX512-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
1852; AVX512-NEXT:    vmovd %xmm2, %eax
1853; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
1854; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
1855; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1856; AVX512-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
1857; AVX512-NEXT:    vmovd %xmm1, %eax
1858; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm1
1859; AVX512-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
1860; AVX512-NEXT:    vmovd %xmm1, %eax
1861; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
1862; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1863; AVX512-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
1864; AVX512-NEXT:    vmovd %xmm1, %eax
1865; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
1866; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1867; AVX512-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
1868; AVX512-NEXT:    vmovd %xmm1, %eax
1869; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
1870; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
1871; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
1872; AVX512-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm3
1873; AVX512-NEXT:    vmovd %xmm2, %eax
1874; AVX512-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm2
1875; AVX512-NEXT:    vmovd %xmm1, %eax
1876; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1877; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1878; AVX512-NEXT:    vmovd %eax, %xmm3
1879; AVX512-NEXT:    vmovd %xmm1, %eax
1880; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1881; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1882; AVX512-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
1883; AVX512-NEXT:    vmovd %xmm1, %eax
1884; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
1885; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
1886; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1887; AVX512-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
1888; AVX512-NEXT:    vmovd %xmm0, %eax
1889; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
1890; AVX512-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
1891; AVX512-NEXT:    vmovd %xmm0, %eax
1892; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
1893; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1894; AVX512-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
1895; AVX512-NEXT:    vmovd %xmm0, %eax
1896; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
1897; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1898; AVX512-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
1899; AVX512-NEXT:    vmovd %xmm0, %eax
1900; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
1901; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1902; AVX512-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm1
1903; AVX512-NEXT:    vmovd %xmm0, %eax
1904; AVX512-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
1905; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
1906; AVX512-NEXT:    retq
1907  %1 = fptrunc <16 x float> %a0 to <16 x half>
1908  %2 = bitcast <16 x half> %1 to <16 x i16>
1909  ret <16 x i16> %2
1910}
1911
1912;
1913; Float to Half (Store)
1914;
1915
; Scalar store case: truncates a float to half, reinterprets it as i16, and
; stores it through %a1. All RUN lines share one expectation: a single
; vcvtps2ph, a vmovd into eax, and a 16-bit movw to (%rdi).
1916define void @store_cvt_f32_to_i16(float %a0, i16* %a1) {
1917; ALL-LABEL: store_cvt_f32_to_i16:
1918; ALL:       # BB#0:
1919; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1920; ALL-NEXT:    vmovd %xmm0, %eax
1921; ALL-NEXT:    movw %ax, (%rdi)
1922; ALL-NEXT:    retq
1923  %1 = fptrunc float %a0 to half
1924  %2 = bitcast half %1 to i16
1925  store i16 %2, i16* %a1
1926  ret void
1927}
1928
; Truncates <4 x float> to <4 x half>, reinterprets it as <4 x i16>, and stores
; the vector through %a1. The shared expectation converts each lane with its
; own vcvtps2ph and writes the results with four 16-bit movw stores at byte
; offsets 0, 2, 4 and 6 from %rdi (no vector store is expected).
1929define void @store_cvt_4f32_to_4i16(<4 x float> %a0, <4 x i16>* %a1) {
1930; ALL-LABEL: store_cvt_4f32_to_4i16:
1931; ALL:       # BB#0:
1932; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1933; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1934; ALL-NEXT:    vmovd %xmm1, %eax
1935; ALL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1936; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1937; ALL-NEXT:    vmovd %xmm1, %ecx
1938; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1939; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1940; ALL-NEXT:    vmovd %xmm1, %edx
1941; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1942; ALL-NEXT:    vmovd %xmm0, %esi
1943; ALL-NEXT:    movw %si, (%rdi)
1944; ALL-NEXT:    movw %dx, 6(%rdi)
1945; ALL-NEXT:    movw %cx, 4(%rdi)
1946; ALL-NEXT:    movw %ax, 2(%rdi)
1947; ALL-NEXT:    retq
1948  %1 = fptrunc <4 x float> %a0 to <4 x half>
1949  %2 = bitcast <4 x half> %1 to <4 x i16>
1950  store <4 x i16> %2, <4 x i16>* %a1
1951  ret void
1952}
1953
; Truncates <4 x float> to <4 x half>, reinterprets it as <4 x i16>, widens it
; to <8 x i16> with a shufflevector whose second operand is undef (result lanes
; 4-7 are therefore undefined), and stores the vector through %a1. The shared
; expectation merges the four converted lanes in a general purpose register,
; moves them to xmm0 with vmovq, applies a vpshufb, and stores with vmovdqa.
1954define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) {
1955; ALL-LABEL: store_cvt_4f32_to_8i16_undef:
1956; ALL:       # BB#0:
1957; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1958; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1959; ALL-NEXT:    vmovd %xmm1, %eax
1960; ALL-NEXT:    shll $16, %eax
1961; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
1962; ALL-NEXT:    vmovd %xmm1, %ecx
1963; ALL-NEXT:    movzwl %cx, %ecx
1964; ALL-NEXT:    orl %eax, %ecx
1965; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1966; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1967; ALL-NEXT:    vmovd %xmm1, %eax
1968; ALL-NEXT:    shll $16, %eax
1969; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1970; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1971; ALL-NEXT:    vmovd %xmm0, %edx
1972; ALL-NEXT:    movzwl %dx, %edx
1973; ALL-NEXT:    orl %eax, %edx
1974; ALL-NEXT:    shlq $32, %rdx
1975; ALL-NEXT:    orq %rcx, %rdx
1976; ALL-NEXT:    vmovq %rdx, %xmm0
1977; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1978; ALL-NEXT:    vmovdqa %xmm0, (%rdi)
1979; ALL-NEXT:    retq
1980  %1 = fptrunc <4 x float> %a0 to <4 x half>
1981  %2 = bitcast <4 x half> %1 to <4 x i16>
1982  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1983  store <8 x i16> %3, <8 x i16>* %a1
1984  ret void
1985}
1986
; Truncates <4 x float> to <4 x half>, reinterprets it as <4 x i16>, widens it
; to <8 x i16> with a shufflevector whose second operand is zeroinitializer
; (result lanes 4-7 are zero), and stores the vector through %a1. The shared
; expectation merges the four converted lanes in a general purpose register,
; uses a vpshufb that zeroes the upper eight bytes, and stores with vmovdqa.
1987define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) {
1988; ALL-LABEL: store_cvt_4f32_to_8i16_zero:
1989; ALL:       # BB#0:
1990; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1991; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
1992; ALL-NEXT:    vmovd %xmm1, %eax
1993; ALL-NEXT:    shll $16, %eax
1994; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
1995; ALL-NEXT:    vmovd %xmm1, %ecx
1996; ALL-NEXT:    movzwl %cx, %ecx
1997; ALL-NEXT:    orl %eax, %ecx
1998; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1999; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2000; ALL-NEXT:    vmovd %xmm1, %eax
2001; ALL-NEXT:    shll $16, %eax
2002; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2003; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2004; ALL-NEXT:    vmovd %xmm0, %edx
2005; ALL-NEXT:    movzwl %dx, %edx
2006; ALL-NEXT:    orl %eax, %edx
2007; ALL-NEXT:    shlq $32, %rdx
2008; ALL-NEXT:    orq %rcx, %rdx
2009; ALL-NEXT:    vmovq %rdx, %xmm0
2010; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
2011; ALL-NEXT:    vmovdqa %xmm0, (%rdi)
2012; ALL-NEXT:    retq
2013  %1 = fptrunc <4 x float> %a0 to <4 x half>
2014  %2 = bitcast <4 x half> %1 to <4 x i16>
2015  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2016  store <8 x i16> %3, <8 x i16>* %a1
2017  ret void
2018}
2019
; Truncates <8 x float> to <8 x half>, reinterprets it as <8 x i16>, and stores
; the vector through %a1. Expectations are kept separately for the AVX1, AVX2
; and AVX512 prefixes. In each, every lane is converted by its own vcvtps2ph
; and written with a 16-bit movw, giving eight stores at byte offsets 0-14 from
; %rdi. The AVX1 and AVX2 sequences end with vzeroupper before retq; the AVX512
; sequence does not.
2020define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) {
2021; AVX1-LABEL: store_cvt_8f32_to_8i16:
2022; AVX1:       # BB#0:
2023; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2024; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2025; AVX1-NEXT:    vmovd %xmm1, %r8d
2026; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2027; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2028; AVX1-NEXT:    vmovd %xmm1, %r9d
2029; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2030; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2031; AVX1-NEXT:    vmovd %xmm1, %r10d
2032; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2033; AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
2034; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2035; AVX1-NEXT:    vmovd %xmm2, %r11d
2036; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
2037; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2038; AVX1-NEXT:    vmovd %xmm2, %eax
2039; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
2040; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2041; AVX1-NEXT:    vmovd %xmm2, %ecx
2042; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2043; AVX1-NEXT:    vmovd %xmm0, %edx
2044; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
2045; AVX1-NEXT:    vmovd %xmm0, %esi
2046; AVX1-NEXT:    movw %si, 8(%rdi)
2047; AVX1-NEXT:    movw %dx, (%rdi)
2048; AVX1-NEXT:    movw %cx, 14(%rdi)
2049; AVX1-NEXT:    movw %ax, 12(%rdi)
2050; AVX1-NEXT:    movw %r11w, 10(%rdi)
2051; AVX1-NEXT:    movw %r10w, 6(%rdi)
2052; AVX1-NEXT:    movw %r9w, 4(%rdi)
2053; AVX1-NEXT:    movw %r8w, 2(%rdi)
2054; AVX1-NEXT:    vzeroupper
2055; AVX1-NEXT:    retq
2056;
2057; AVX2-LABEL: store_cvt_8f32_to_8i16:
2058; AVX2:       # BB#0:
2059; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2060; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2061; AVX2-NEXT:    vmovd %xmm1, %r8d
2062; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2063; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2064; AVX2-NEXT:    vmovd %xmm1, %r9d
2065; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2066; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2067; AVX2-NEXT:    vmovd %xmm1, %r10d
2068; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
2069; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
2070; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2071; AVX2-NEXT:    vmovd %xmm2, %r11d
2072; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
2073; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2074; AVX2-NEXT:    vmovd %xmm2, %eax
2075; AVX2-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
2076; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2077; AVX2-NEXT:    vmovd %xmm2, %ecx
2078; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2079; AVX2-NEXT:    vmovd %xmm0, %edx
2080; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
2081; AVX2-NEXT:    vmovd %xmm0, %esi
2082; AVX2-NEXT:    movw %si, 8(%rdi)
2083; AVX2-NEXT:    movw %dx, (%rdi)
2084; AVX2-NEXT:    movw %cx, 14(%rdi)
2085; AVX2-NEXT:    movw %ax, 12(%rdi)
2086; AVX2-NEXT:    movw %r11w, 10(%rdi)
2087; AVX2-NEXT:    movw %r10w, 6(%rdi)
2088; AVX2-NEXT:    movw %r9w, 4(%rdi)
2089; AVX2-NEXT:    movw %r8w, 2(%rdi)
2090; AVX2-NEXT:    vzeroupper
2091; AVX2-NEXT:    retq
2092;
2093; AVX512-LABEL: store_cvt_8f32_to_8i16:
2094; AVX512:       # BB#0:
2095; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2096; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2097; AVX512-NEXT:    vmovd %xmm1, %r8d
2098; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2099; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2100; AVX512-NEXT:    vmovd %xmm1, %r9d
2101; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2102; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2103; AVX512-NEXT:    vmovd %xmm1, %r10d
2104; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
2105; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
2106; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2107; AVX512-NEXT:    vmovd %xmm2, %r11d
2108; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
2109; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2110; AVX512-NEXT:    vmovd %xmm2, %eax
2111; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
2112; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2113; AVX512-NEXT:    vmovd %xmm2, %ecx
2114; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2115; AVX512-NEXT:    vmovd %xmm0, %edx
2116; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
2117; AVX512-NEXT:    vmovd %xmm0, %esi
2118; AVX512-NEXT:    movw %si, 8(%rdi)
2119; AVX512-NEXT:    movw %dx, (%rdi)
2120; AVX512-NEXT:    movw %cx, 14(%rdi)
2121; AVX512-NEXT:    movw %ax, 12(%rdi)
2122; AVX512-NEXT:    movw %r11w, 10(%rdi)
2123; AVX512-NEXT:    movw %r10w, 6(%rdi)
2124; AVX512-NEXT:    movw %r9w, 4(%rdi)
2125; AVX512-NEXT:    movw %r8w, 2(%rdi)
2126; AVX512-NEXT:    retq
2127  %1 = fptrunc <8 x float> %a0 to <8 x half>
2128  %2 = bitcast <8 x half> %1 to <8 x i16>
2129  store <8 x i16> %2, <8 x i16>* %a1
2130  ret void
2131}
2132
2133define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) {
2134; AVX1-LABEL: store_cvt_16f32_to_16i16:
2135; AVX1:       # BB#0:
2136; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2137; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
2138; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm4
2139; AVX1-NEXT:    vmovd %xmm4, %eax
2140; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm4
2141; AVX1-NEXT:    movw %ax, 24(%rdi)
2142; AVX1-NEXT:    vmovd %xmm4, %eax
2143; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm4
2144; AVX1-NEXT:    movw %ax, 16(%rdi)
2145; AVX1-NEXT:    vmovd %xmm4, %eax
2146; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm4
2147; AVX1-NEXT:    movw %ax, 8(%rdi)
2148; AVX1-NEXT:    vmovd %xmm4, %eax
2149; AVX1-NEXT:    vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
2150; AVX1-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
2151; AVX1-NEXT:    movw %ax, (%rdi)
2152; AVX1-NEXT:    vmovd %xmm4, %eax
2153; AVX1-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
2154; AVX1-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
2155; AVX1-NEXT:    movw %ax, 30(%rdi)
2156; AVX1-NEXT:    vmovd %xmm4, %eax
2157; AVX1-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
2158; AVX1-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
2159; AVX1-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
2160; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
2161; AVX1-NEXT:    movw %ax, 28(%rdi)
2162; AVX1-NEXT:    vmovd %xmm3, %eax
2163; AVX1-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
2164; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
2165; AVX1-NEXT:    movw %ax, 26(%rdi)
2166; AVX1-NEXT:    vmovd %xmm3, %eax
2167; AVX1-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
2168; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
2169; AVX1-NEXT:    movw %ax, 22(%rdi)
2170; AVX1-NEXT:    vmovd %xmm3, %eax
2171; AVX1-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
2172; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
2173; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
2174; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2175; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
2176; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2177; AVX1-NEXT:    movw %ax, 20(%rdi)
2178; AVX1-NEXT:    vmovd %xmm1, %eax
2179; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3]
2180; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2181; AVX1-NEXT:    movw %ax, 18(%rdi)
2182; AVX1-NEXT:    vmovd %xmm1, %eax
2183; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
2184; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2185; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
2186; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2187; AVX1-NEXT:    movw %ax, 14(%rdi)
2188; AVX1-NEXT:    vmovd %xmm2, %eax
2189; AVX1-NEXT:    movw %ax, 12(%rdi)
2190; AVX1-NEXT:    vmovd %xmm1, %eax
2191; AVX1-NEXT:    movw %ax, 10(%rdi)
2192; AVX1-NEXT:    vmovd %xmm0, %eax
2193; AVX1-NEXT:    movw %ax, 6(%rdi)
2194; AVX1-NEXT:    vmovd %xmm3, %eax
2195; AVX1-NEXT:    movw %ax, 4(%rdi)
2196; AVX1-NEXT:    vmovd %xmm4, %eax
2197; AVX1-NEXT:    movw %ax, 2(%rdi)
2198; AVX1-NEXT:    vzeroupper
2199; AVX1-NEXT:    retq
2200;
2201; AVX2-LABEL: store_cvt_16f32_to_16i16:
2202; AVX2:       # BB#0:
2203; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm2
2204; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm3
2205; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm4
2206; AVX2-NEXT:    vmovd %xmm4, %eax
2207; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm4
2208; AVX2-NEXT:    movw %ax, 24(%rdi)
2209; AVX2-NEXT:    vmovd %xmm4, %eax
2210; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm4
2211; AVX2-NEXT:    movw %ax, 16(%rdi)
2212; AVX2-NEXT:    vmovd %xmm4, %eax
2213; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm4
2214; AVX2-NEXT:    movw %ax, 8(%rdi)
2215; AVX2-NEXT:    vmovd %xmm4, %eax
2216; AVX2-NEXT:    vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
2217; AVX2-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
2218; AVX2-NEXT:    movw %ax, (%rdi)
2219; AVX2-NEXT:    vmovd %xmm4, %eax
2220; AVX2-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
2221; AVX2-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
2222; AVX2-NEXT:    movw %ax, 30(%rdi)
2223; AVX2-NEXT:    vmovd %xmm4, %eax
2224; AVX2-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
2225; AVX2-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
2226; AVX2-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
2227; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
2228; AVX2-NEXT:    movw %ax, 28(%rdi)
2229; AVX2-NEXT:    vmovd %xmm3, %eax
2230; AVX2-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
2231; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
2232; AVX2-NEXT:    movw %ax, 26(%rdi)
2233; AVX2-NEXT:    vmovd %xmm3, %eax
2234; AVX2-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
2235; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
2236; AVX2-NEXT:    movw %ax, 22(%rdi)
2237; AVX2-NEXT:    vmovd %xmm3, %eax
2238; AVX2-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
2239; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
2240; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
2241; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2242; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
2243; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2244; AVX2-NEXT:    movw %ax, 20(%rdi)
2245; AVX2-NEXT:    vmovd %xmm1, %eax
2246; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3]
2247; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2248; AVX2-NEXT:    movw %ax, 18(%rdi)
2249; AVX2-NEXT:    vmovd %xmm1, %eax
2250; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
2251; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2252; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
2253; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2254; AVX2-NEXT:    movw %ax, 14(%rdi)
2255; AVX2-NEXT:    vmovd %xmm2, %eax
2256; AVX2-NEXT:    movw %ax, 12(%rdi)
2257; AVX2-NEXT:    vmovd %xmm1, %eax
2258; AVX2-NEXT:    movw %ax, 10(%rdi)
2259; AVX2-NEXT:    vmovd %xmm0, %eax
2260; AVX2-NEXT:    movw %ax, 6(%rdi)
2261; AVX2-NEXT:    vmovd %xmm3, %eax
2262; AVX2-NEXT:    movw %ax, 4(%rdi)
2263; AVX2-NEXT:    vmovd %xmm4, %eax
2264; AVX2-NEXT:    movw %ax, 2(%rdi)
2265; AVX2-NEXT:    vzeroupper
2266; AVX2-NEXT:    retq
2267;
2268; AVX512-LABEL: store_cvt_16f32_to_16i16:
2269; AVX512:       # BB#0:
2270; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
2271; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
2272; AVX512-NEXT:    vextractf128 $1, %ymm2, %xmm3
2273; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm4
2274; AVX512-NEXT:    vmovd %xmm4, %eax
2275; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm4
2276; AVX512-NEXT:    movw %ax, 24(%rdi)
2277; AVX512-NEXT:    vmovd %xmm4, %eax
2278; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm4
2279; AVX512-NEXT:    movw %ax, 16(%rdi)
2280; AVX512-NEXT:    vmovd %xmm4, %eax
2281; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm4
2282; AVX512-NEXT:    movw %ax, 8(%rdi)
2283; AVX512-NEXT:    vmovd %xmm4, %eax
2284; AVX512-NEXT:    vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
2285; AVX512-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
2286; AVX512-NEXT:    movw %ax, (%rdi)
2287; AVX512-NEXT:    vmovd %xmm4, %eax
2288; AVX512-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
2289; AVX512-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
2290; AVX512-NEXT:    movw %ax, 30(%rdi)
2291; AVX512-NEXT:    vmovd %xmm4, %eax
2292; AVX512-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
2293; AVX512-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
2294; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
2295; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
2296; AVX512-NEXT:    movw %ax, 28(%rdi)
2297; AVX512-NEXT:    vmovd %xmm3, %eax
2298; AVX512-NEXT:    vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
2299; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
2300; AVX512-NEXT:    movw %ax, 26(%rdi)
2301; AVX512-NEXT:    vmovd %xmm3, %eax
2302; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
2303; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
2304; AVX512-NEXT:    movw %ax, 22(%rdi)
2305; AVX512-NEXT:    vmovd %xmm3, %eax
2306; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
2307; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
2308; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
2309; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2310; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
2311; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2312; AVX512-NEXT:    movw %ax, 20(%rdi)
2313; AVX512-NEXT:    vmovd %xmm2, %eax
2314; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
2315; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2316; AVX512-NEXT:    movw %ax, 18(%rdi)
2317; AVX512-NEXT:    vmovd %xmm2, %eax
2318; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
2319; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2320; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
2321; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2322; AVX512-NEXT:    movw %ax, 14(%rdi)
2323; AVX512-NEXT:    vmovd %xmm1, %eax
2324; AVX512-NEXT:    movw %ax, 12(%rdi)
2325; AVX512-NEXT:    vmovd %xmm2, %eax
2326; AVX512-NEXT:    movw %ax, 10(%rdi)
2327; AVX512-NEXT:    vmovd %xmm0, %eax
2328; AVX512-NEXT:    movw %ax, 6(%rdi)
2329; AVX512-NEXT:    vmovd %xmm3, %eax
2330; AVX512-NEXT:    movw %ax, 4(%rdi)
2331; AVX512-NEXT:    vmovd %xmm4, %eax
2332; AVX512-NEXT:    movw %ax, 2(%rdi)
2333; AVX512-NEXT:    retq
2334  %1 = fptrunc <16 x float> %a0 to <16 x half>
2335  %2 = bitcast <16 x half> %1 to <16 x i16>
2336  store <16 x i16> %2, <16 x i16>* %a1
2337  ret void
2338}
2339
2340;
2341; Double to Half
2342;
2343
; Scalar double -> half truncation, returned as the i16 bit pattern of the half.
; All three run configurations share one expectation: the conversion is a
; direct tail call (jmp) to __truncdfhf2 with no other instructions.
2344define i16 @cvt_f64_to_i16(double %a0) {
2345; ALL-LABEL: cvt_f64_to_i16:
2346; ALL:       # BB#0:
2347; ALL-NEXT:    jmp __truncdfhf2 # TAILCALL
2348  %1 = fptrunc double %a0 to half
2349  %2 = bitcast half %1 to i16
2350  ret i16 %2
2351}
2352
; <2 x double> -> <2 x half> truncation, returned as <2 x i16>.
; The expected code calls __truncdfhf2 once per element: element 1 first
; (moved into lane 0 by vpermilpd), then element 0 reloaded from the spill slot.
; The two 16-bit results are merged in a GPR (element 1 shifted left by 16 and
; OR'ed with the zero-extended element 0) and moved into xmm0 with vmovd.
2353define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) {
2354; ALL-LABEL: cvt_2f64_to_2i16:
2355; ALL:       # BB#0:
2356; ALL-NEXT:    pushq %rbx
2357; ALL-NEXT:  .Ltmp0:
2358; ALL-NEXT:    .cfi_def_cfa_offset 16
2359; ALL-NEXT:    subq $16, %rsp
2360; ALL-NEXT:  .Ltmp1:
2361; ALL-NEXT:    .cfi_def_cfa_offset 32
2362; ALL-NEXT:  .Ltmp2:
2363; ALL-NEXT:    .cfi_offset %rbx, -16
2364; ALL-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
2365; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2366; ALL-NEXT:    callq __truncdfhf2
2367; ALL-NEXT:    movw %ax, %bx
2368; ALL-NEXT:    shll $16, %ebx
2369; ALL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
2370; ALL-NEXT:    callq __truncdfhf2
2371; ALL-NEXT:    movzwl %ax, %eax
2372; ALL-NEXT:    orl %ebx, %eax
2373; ALL-NEXT:    vmovd %eax, %xmm0
2374; ALL-NEXT:    addq $16, %rsp
2375; ALL-NEXT:    popq %rbx
2376; ALL-NEXT:    retq
2377  %1 = fptrunc <2 x double> %a0 to <2 x half>
2378  %2 = bitcast <2 x half> %1 to <2 x i16>
2379  ret <2 x i16> %2
2380}
2381
; <4 x double> -> <4 x half> truncation, returned as <4 x i16>.
; The expected code makes four __truncdfhf2 calls (elements 1, 0, 3, 2 in that
; order), keeping the input spilled on the stack between calls. Each pair of
; results is packed into 32 bits; the two pairs are then joined into one
; 64-bit GPR (shlq $32 / orq) and moved into xmm0 with vmovq.
; The AVX1 and AVX2 expectations are identical and contain vzeroupper before
; the first three calls; the AVX512 expectations contain no vzeroupper.
2382define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) {
2383; AVX1-LABEL: cvt_4f64_to_4i16:
2384; AVX1:       # BB#0:
2385; AVX1-NEXT:    pushq %r14
2386; AVX1-NEXT:  .Ltmp3:
2387; AVX1-NEXT:    .cfi_def_cfa_offset 16
2388; AVX1-NEXT:    pushq %rbx
2389; AVX1-NEXT:  .Ltmp4:
2390; AVX1-NEXT:    .cfi_def_cfa_offset 24
2391; AVX1-NEXT:    subq $40, %rsp
2392; AVX1-NEXT:  .Ltmp5:
2393; AVX1-NEXT:    .cfi_def_cfa_offset 64
2394; AVX1-NEXT:  .Ltmp6:
2395; AVX1-NEXT:    .cfi_offset %rbx, -24
2396; AVX1-NEXT:  .Ltmp7:
2397; AVX1-NEXT:    .cfi_offset %r14, -16
2398; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
2399; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2400; AVX1-NEXT:    vzeroupper
2401; AVX1-NEXT:    callq __truncdfhf2
2402; AVX1-NEXT:    movw %ax, %bx
2403; AVX1-NEXT:    shll $16, %ebx
2404; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
2405; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2406; AVX1-NEXT:    vzeroupper
2407; AVX1-NEXT:    callq __truncdfhf2
2408; AVX1-NEXT:    movzwl %ax, %r14d
2409; AVX1-NEXT:    orl %ebx, %r14d
2410; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
2411; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2412; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
2413; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2414; AVX1-NEXT:    vzeroupper
2415; AVX1-NEXT:    callq __truncdfhf2
2416; AVX1-NEXT:    movw %ax, %bx
2417; AVX1-NEXT:    shll $16, %ebx
2418; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
2419; AVX1-NEXT:    callq __truncdfhf2
2420; AVX1-NEXT:    movzwl %ax, %eax
2421; AVX1-NEXT:    orl %ebx, %eax
2422; AVX1-NEXT:    shlq $32, %rax
2423; AVX1-NEXT:    orq %r14, %rax
2424; AVX1-NEXT:    vmovq %rax, %xmm0
2425; AVX1-NEXT:    addq $40, %rsp
2426; AVX1-NEXT:    popq %rbx
2427; AVX1-NEXT:    popq %r14
2428; AVX1-NEXT:    retq
2429;
2430; AVX2-LABEL: cvt_4f64_to_4i16:
2431; AVX2:       # BB#0:
2432; AVX2-NEXT:    pushq %r14
2433; AVX2-NEXT:  .Ltmp3:
2434; AVX2-NEXT:    .cfi_def_cfa_offset 16
2435; AVX2-NEXT:    pushq %rbx
2436; AVX2-NEXT:  .Ltmp4:
2437; AVX2-NEXT:    .cfi_def_cfa_offset 24
2438; AVX2-NEXT:    subq $40, %rsp
2439; AVX2-NEXT:  .Ltmp5:
2440; AVX2-NEXT:    .cfi_def_cfa_offset 64
2441; AVX2-NEXT:  .Ltmp6:
2442; AVX2-NEXT:    .cfi_offset %rbx, -24
2443; AVX2-NEXT:  .Ltmp7:
2444; AVX2-NEXT:    .cfi_offset %r14, -16
2445; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
2446; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2447; AVX2-NEXT:    vzeroupper
2448; AVX2-NEXT:    callq __truncdfhf2
2449; AVX2-NEXT:    movw %ax, %bx
2450; AVX2-NEXT:    shll $16, %ebx
2451; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
2452; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2453; AVX2-NEXT:    vzeroupper
2454; AVX2-NEXT:    callq __truncdfhf2
2455; AVX2-NEXT:    movzwl %ax, %r14d
2456; AVX2-NEXT:    orl %ebx, %r14d
2457; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
2458; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
2459; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
2460; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2461; AVX2-NEXT:    vzeroupper
2462; AVX2-NEXT:    callq __truncdfhf2
2463; AVX2-NEXT:    movw %ax, %bx
2464; AVX2-NEXT:    shll $16, %ebx
2465; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
2466; AVX2-NEXT:    callq __truncdfhf2
2467; AVX2-NEXT:    movzwl %ax, %eax
2468; AVX2-NEXT:    orl %ebx, %eax
2469; AVX2-NEXT:    shlq $32, %rax
2470; AVX2-NEXT:    orq %r14, %rax
2471; AVX2-NEXT:    vmovq %rax, %xmm0
2472; AVX2-NEXT:    addq $40, %rsp
2473; AVX2-NEXT:    popq %rbx
2474; AVX2-NEXT:    popq %r14
2475; AVX2-NEXT:    retq
2476;
2477; AVX512-LABEL: cvt_4f64_to_4i16:
2478; AVX512:       # BB#0:
2479; AVX512-NEXT:    pushq %r14
2480; AVX512-NEXT:  .Ltmp3:
2481; AVX512-NEXT:    .cfi_def_cfa_offset 16
2482; AVX512-NEXT:    pushq %rbx
2483; AVX512-NEXT:  .Ltmp4:
2484; AVX512-NEXT:    .cfi_def_cfa_offset 24
2485; AVX512-NEXT:    subq $40, %rsp
2486; AVX512-NEXT:  .Ltmp5:
2487; AVX512-NEXT:    .cfi_def_cfa_offset 64
2488; AVX512-NEXT:  .Ltmp6:
2489; AVX512-NEXT:    .cfi_offset %rbx, -24
2490; AVX512-NEXT:  .Ltmp7:
2491; AVX512-NEXT:    .cfi_offset %r14, -16
2492; AVX512-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
2493; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2494; AVX512-NEXT:    callq __truncdfhf2
2495; AVX512-NEXT:    movw %ax, %bx
2496; AVX512-NEXT:    shll $16, %ebx
2497; AVX512-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
2498; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2499; AVX512-NEXT:    callq __truncdfhf2
2500; AVX512-NEXT:    movzwl %ax, %r14d
2501; AVX512-NEXT:    orl %ebx, %r14d
2502; AVX512-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
2503; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
2504; AVX512-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
2505; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2506; AVX512-NEXT:    callq __truncdfhf2
2507; AVX512-NEXT:    movw %ax, %bx
2508; AVX512-NEXT:    shll $16, %ebx
2509; AVX512-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
2510; AVX512-NEXT:    callq __truncdfhf2
2511; AVX512-NEXT:    movzwl %ax, %eax
2512; AVX512-NEXT:    orl %ebx, %eax
2513; AVX512-NEXT:    shlq $32, %rax
2514; AVX512-NEXT:    orq %r14, %rax
2515; AVX512-NEXT:    vmovq %rax, %xmm0
2516; AVX512-NEXT:    addq $40, %rsp
2517; AVX512-NEXT:    popq %rbx
2518; AVX512-NEXT:    popq %r14
2519; AVX512-NEXT:    retq
2520  %1 = fptrunc <4 x double> %a0 to <4 x half>
2521  %2 = bitcast <4 x half> %1 to <4 x i16>
2522  ret <4 x i16> %2
2523}
2524
; <4 x double> -> <4 x half> truncation, widened to <8 x i16> by a shufflevector
; whose second operand is undef (result lanes 4-7 are therefore undefined).
; The expected code is the same four-call __truncdfhf2 sequence as the plain
; 4-element case, followed by one vpshufb that rearranges the packed result
; after the vmovq into xmm0.
; The AVX1 and AVX2 expectations are identical and contain vzeroupper before
; the first three calls; the AVX512 expectations contain no vzeroupper.
2525define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) {
2526; AVX1-LABEL: cvt_4f64_to_8i16_undef:
2527; AVX1:       # BB#0:
2528; AVX1-NEXT:    pushq %r14
2529; AVX1-NEXT:  .Ltmp8:
2530; AVX1-NEXT:    .cfi_def_cfa_offset 16
2531; AVX1-NEXT:    pushq %rbx
2532; AVX1-NEXT:  .Ltmp9:
2533; AVX1-NEXT:    .cfi_def_cfa_offset 24
2534; AVX1-NEXT:    subq $40, %rsp
2535; AVX1-NEXT:  .Ltmp10:
2536; AVX1-NEXT:    .cfi_def_cfa_offset 64
2537; AVX1-NEXT:  .Ltmp11:
2538; AVX1-NEXT:    .cfi_offset %rbx, -24
2539; AVX1-NEXT:  .Ltmp12:
2540; AVX1-NEXT:    .cfi_offset %r14, -16
2541; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
2542; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2543; AVX1-NEXT:    vzeroupper
2544; AVX1-NEXT:    callq __truncdfhf2
2545; AVX1-NEXT:    movw %ax, %bx
2546; AVX1-NEXT:    shll $16, %ebx
2547; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
2548; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2549; AVX1-NEXT:    vzeroupper
2550; AVX1-NEXT:    callq __truncdfhf2
2551; AVX1-NEXT:    movzwl %ax, %r14d
2552; AVX1-NEXT:    orl %ebx, %r14d
2553; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
2554; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2555; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
2556; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2557; AVX1-NEXT:    vzeroupper
2558; AVX1-NEXT:    callq __truncdfhf2
2559; AVX1-NEXT:    movw %ax, %bx
2560; AVX1-NEXT:    shll $16, %ebx
2561; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
2562; AVX1-NEXT:    callq __truncdfhf2
2563; AVX1-NEXT:    movzwl %ax, %eax
2564; AVX1-NEXT:    orl %ebx, %eax
2565; AVX1-NEXT:    shlq $32, %rax
2566; AVX1-NEXT:    orq %r14, %rax
2567; AVX1-NEXT:    vmovq %rax, %xmm0
2568; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2569; AVX1-NEXT:    addq $40, %rsp
2570; AVX1-NEXT:    popq %rbx
2571; AVX1-NEXT:    popq %r14
2572; AVX1-NEXT:    retq
2573;
2574; AVX2-LABEL: cvt_4f64_to_8i16_undef:
2575; AVX2:       # BB#0:
2576; AVX2-NEXT:    pushq %r14
2577; AVX2-NEXT:  .Ltmp8:
2578; AVX2-NEXT:    .cfi_def_cfa_offset 16
2579; AVX2-NEXT:    pushq %rbx
2580; AVX2-NEXT:  .Ltmp9:
2581; AVX2-NEXT:    .cfi_def_cfa_offset 24
2582; AVX2-NEXT:    subq $40, %rsp
2583; AVX2-NEXT:  .Ltmp10:
2584; AVX2-NEXT:    .cfi_def_cfa_offset 64
2585; AVX2-NEXT:  .Ltmp11:
2586; AVX2-NEXT:    .cfi_offset %rbx, -24
2587; AVX2-NEXT:  .Ltmp12:
2588; AVX2-NEXT:    .cfi_offset %r14, -16
2589; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
2590; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2591; AVX2-NEXT:    vzeroupper
2592; AVX2-NEXT:    callq __truncdfhf2
2593; AVX2-NEXT:    movw %ax, %bx
2594; AVX2-NEXT:    shll $16, %ebx
2595; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
2596; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2597; AVX2-NEXT:    vzeroupper
2598; AVX2-NEXT:    callq __truncdfhf2
2599; AVX2-NEXT:    movzwl %ax, %r14d
2600; AVX2-NEXT:    orl %ebx, %r14d
2601; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
2602; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
2603; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
2604; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2605; AVX2-NEXT:    vzeroupper
2606; AVX2-NEXT:    callq __truncdfhf2
2607; AVX2-NEXT:    movw %ax, %bx
2608; AVX2-NEXT:    shll $16, %ebx
2609; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
2610; AVX2-NEXT:    callq __truncdfhf2
2611; AVX2-NEXT:    movzwl %ax, %eax
2612; AVX2-NEXT:    orl %ebx, %eax
2613; AVX2-NEXT:    shlq $32, %rax
2614; AVX2-NEXT:    orq %r14, %rax
2615; AVX2-NEXT:    vmovq %rax, %xmm0
2616; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2617; AVX2-NEXT:    addq $40, %rsp
2618; AVX2-NEXT:    popq %rbx
2619; AVX2-NEXT:    popq %r14
2620; AVX2-NEXT:    retq
2621;
2622; AVX512-LABEL: cvt_4f64_to_8i16_undef:
2623; AVX512:       # BB#0:
2624; AVX512-NEXT:    pushq %r14
2625; AVX512-NEXT:  .Ltmp8:
2626; AVX512-NEXT:    .cfi_def_cfa_offset 16
2627; AVX512-NEXT:    pushq %rbx
2628; AVX512-NEXT:  .Ltmp9:
2629; AVX512-NEXT:    .cfi_def_cfa_offset 24
2630; AVX512-NEXT:    subq $40, %rsp
2631; AVX512-NEXT:  .Ltmp10:
2632; AVX512-NEXT:    .cfi_def_cfa_offset 64
2633; AVX512-NEXT:  .Ltmp11:
2634; AVX512-NEXT:    .cfi_offset %rbx, -24
2635; AVX512-NEXT:  .Ltmp12:
2636; AVX512-NEXT:    .cfi_offset %r14, -16
2637; AVX512-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
2638; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2639; AVX512-NEXT:    callq __truncdfhf2
2640; AVX512-NEXT:    movw %ax, %bx
2641; AVX512-NEXT:    shll $16, %ebx
2642; AVX512-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
2643; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2644; AVX512-NEXT:    callq __truncdfhf2
2645; AVX512-NEXT:    movzwl %ax, %r14d
2646; AVX512-NEXT:    orl %ebx, %r14d
2647; AVX512-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
2648; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
2649; AVX512-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
2650; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2651; AVX512-NEXT:    callq __truncdfhf2
2652; AVX512-NEXT:    movw %ax, %bx
2653; AVX512-NEXT:    shll $16, %ebx
2654; AVX512-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
2655; AVX512-NEXT:    callq __truncdfhf2
2656; AVX512-NEXT:    movzwl %ax, %eax
2657; AVX512-NEXT:    orl %ebx, %eax
2658; AVX512-NEXT:    shlq $32, %rax
2659; AVX512-NEXT:    orq %r14, %rax
2660; AVX512-NEXT:    vmovq %rax, %xmm0
2661; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2662; AVX512-NEXT:    addq $40, %rsp
2663; AVX512-NEXT:    popq %rbx
2664; AVX512-NEXT:    popq %r14
2665; AVX512-NEXT:    retq
2666  %1 = fptrunc <4 x double> %a0 to <4 x half>
2667  %2 = bitcast <4 x half> %1 to <4 x i16>
2668  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2669  ret <8 x i16> %3
2670}
2671
; <4 x double> -> <4 x half> truncation, widened to <8 x i16> by a shufflevector
; whose second operand is zeroinitializer (result lanes 4-7 must be zero).
; The expected code is the same four-call __truncdfhf2 sequence as the undef
; variant; the difference is the final vpshufb, whose mask writes zero into the
; upper eight bytes of xmm0.
; The AVX1 and AVX2 expectations are identical and contain vzeroupper before
; the first three calls; the AVX512 expectations contain no vzeroupper.
2672define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) {
2673; AVX1-LABEL: cvt_4f64_to_8i16_zero:
2674; AVX1:       # BB#0:
2675; AVX1-NEXT:    pushq %r14
2676; AVX1-NEXT:  .Ltmp13:
2677; AVX1-NEXT:    .cfi_def_cfa_offset 16
2678; AVX1-NEXT:    pushq %rbx
2679; AVX1-NEXT:  .Ltmp14:
2680; AVX1-NEXT:    .cfi_def_cfa_offset 24
2681; AVX1-NEXT:    subq $40, %rsp
2682; AVX1-NEXT:  .Ltmp15:
2683; AVX1-NEXT:    .cfi_def_cfa_offset 64
2684; AVX1-NEXT:  .Ltmp16:
2685; AVX1-NEXT:    .cfi_offset %rbx, -24
2686; AVX1-NEXT:  .Ltmp17:
2687; AVX1-NEXT:    .cfi_offset %r14, -16
2688; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
2689; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2690; AVX1-NEXT:    vzeroupper
2691; AVX1-NEXT:    callq __truncdfhf2
2692; AVX1-NEXT:    movw %ax, %bx
2693; AVX1-NEXT:    shll $16, %ebx
2694; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
2695; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2696; AVX1-NEXT:    vzeroupper
2697; AVX1-NEXT:    callq __truncdfhf2
2698; AVX1-NEXT:    movzwl %ax, %r14d
2699; AVX1-NEXT:    orl %ebx, %r14d
2700; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
2701; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2702; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
2703; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2704; AVX1-NEXT:    vzeroupper
2705; AVX1-NEXT:    callq __truncdfhf2
2706; AVX1-NEXT:    movw %ax, %bx
2707; AVX1-NEXT:    shll $16, %ebx
2708; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
2709; AVX1-NEXT:    callq __truncdfhf2
2710; AVX1-NEXT:    movzwl %ax, %eax
2711; AVX1-NEXT:    orl %ebx, %eax
2712; AVX1-NEXT:    shlq $32, %rax
2713; AVX1-NEXT:    orq %r14, %rax
2714; AVX1-NEXT:    vmovq %rax, %xmm0
2715; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
2716; AVX1-NEXT:    addq $40, %rsp
2717; AVX1-NEXT:    popq %rbx
2718; AVX1-NEXT:    popq %r14
2719; AVX1-NEXT:    retq
2720;
2721; AVX2-LABEL: cvt_4f64_to_8i16_zero:
2722; AVX2:       # BB#0:
2723; AVX2-NEXT:    pushq %r14
2724; AVX2-NEXT:  .Ltmp13:
2725; AVX2-NEXT:    .cfi_def_cfa_offset 16
2726; AVX2-NEXT:    pushq %rbx
2727; AVX2-NEXT:  .Ltmp14:
2728; AVX2-NEXT:    .cfi_def_cfa_offset 24
2729; AVX2-NEXT:    subq $40, %rsp
2730; AVX2-NEXT:  .Ltmp15:
2731; AVX2-NEXT:    .cfi_def_cfa_offset 64
2732; AVX2-NEXT:  .Ltmp16:
2733; AVX2-NEXT:    .cfi_offset %rbx, -24
2734; AVX2-NEXT:  .Ltmp17:
2735; AVX2-NEXT:    .cfi_offset %r14, -16
2736; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
2737; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2738; AVX2-NEXT:    vzeroupper
2739; AVX2-NEXT:    callq __truncdfhf2
2740; AVX2-NEXT:    movw %ax, %bx
2741; AVX2-NEXT:    shll $16, %ebx
2742; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
2743; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2744; AVX2-NEXT:    vzeroupper
2745; AVX2-NEXT:    callq __truncdfhf2
2746; AVX2-NEXT:    movzwl %ax, %r14d
2747; AVX2-NEXT:    orl %ebx, %r14d
2748; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
2749; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
2750; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
2751; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2752; AVX2-NEXT:    vzeroupper
2753; AVX2-NEXT:    callq __truncdfhf2
2754; AVX2-NEXT:    movw %ax, %bx
2755; AVX2-NEXT:    shll $16, %ebx
2756; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
2757; AVX2-NEXT:    callq __truncdfhf2
2758; AVX2-NEXT:    movzwl %ax, %eax
2759; AVX2-NEXT:    orl %ebx, %eax
2760; AVX2-NEXT:    shlq $32, %rax
2761; AVX2-NEXT:    orq %r14, %rax
2762; AVX2-NEXT:    vmovq %rax, %xmm0
2763; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
2764; AVX2-NEXT:    addq $40, %rsp
2765; AVX2-NEXT:    popq %rbx
2766; AVX2-NEXT:    popq %r14
2767; AVX2-NEXT:    retq
2768;
2769; AVX512-LABEL: cvt_4f64_to_8i16_zero:
2770; AVX512:       # BB#0:
2771; AVX512-NEXT:    pushq %r14
2772; AVX512-NEXT:  .Ltmp13:
2773; AVX512-NEXT:    .cfi_def_cfa_offset 16
2774; AVX512-NEXT:    pushq %rbx
2775; AVX512-NEXT:  .Ltmp14:
2776; AVX512-NEXT:    .cfi_def_cfa_offset 24
2777; AVX512-NEXT:    subq $40, %rsp
2778; AVX512-NEXT:  .Ltmp15:
2779; AVX512-NEXT:    .cfi_def_cfa_offset 64
2780; AVX512-NEXT:  .Ltmp16:
2781; AVX512-NEXT:    .cfi_offset %rbx, -24
2782; AVX512-NEXT:  .Ltmp17:
2783; AVX512-NEXT:    .cfi_offset %r14, -16
2784; AVX512-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
2785; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2786; AVX512-NEXT:    callq __truncdfhf2
2787; AVX512-NEXT:    movw %ax, %bx
2788; AVX512-NEXT:    shll $16, %ebx
2789; AVX512-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
2790; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2791; AVX512-NEXT:    callq __truncdfhf2
2792; AVX512-NEXT:    movzwl %ax, %r14d
2793; AVX512-NEXT:    orl %ebx, %r14d
2794; AVX512-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
2795; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
2796; AVX512-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
2797; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2798; AVX512-NEXT:    callq __truncdfhf2
2799; AVX512-NEXT:    movw %ax, %bx
2800; AVX512-NEXT:    shll $16, %ebx
2801; AVX512-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
2802; AVX512-NEXT:    callq __truncdfhf2
2803; AVX512-NEXT:    movzwl %ax, %eax
2804; AVX512-NEXT:    orl %ebx, %eax
2805; AVX512-NEXT:    shlq $32, %rax
2806; AVX512-NEXT:    orq %r14, %rax
2807; AVX512-NEXT:    vmovq %rax, %xmm0
2808; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
2809; AVX512-NEXT:    addq $40, %rsp
2810; AVX512-NEXT:    popq %rbx
2811; AVX512-NEXT:    popq %r14
2812; AVX512-NEXT:    retq
2813  %1 = fptrunc <4 x double> %a0 to <4 x half>
2814  %2 = bitcast <4 x half> %1 to <4 x i16>
2815  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2816  ret <8 x i16> %3
2817}
2818
; <8 x double> -> <8 x half> truncation, returned as <8 x i16>.
; The expected code makes eight __truncdfhf2 calls. Results are packed four at
; a time into two 64-bit GPRs (r14 for elements 0-3, rax for elements 4-7),
; moved into xmm1/xmm0 with vmovq and joined by vpunpcklqdq.
; The AVX1 and AVX2 expectations are identical: the input is spilled as two
; 32-byte registers (ymm0 and ymm1) into a 64-byte frame, with vzeroupper
; before calls. The AVX512 expectations spill a single 64-byte zmm0 into a
; 96-byte frame, extract the upper half with vextractf64x4, and contain no
; vzeroupper.
2819define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) {
2820; AVX1-LABEL: cvt_8f64_to_8i16:
2821; AVX1:       # BB#0:
2822; AVX1-NEXT:    pushq %r15
2823; AVX1-NEXT:  .Ltmp18:
2824; AVX1-NEXT:    .cfi_def_cfa_offset 16
2825; AVX1-NEXT:    pushq %r14
2826; AVX1-NEXT:  .Ltmp19:
2827; AVX1-NEXT:    .cfi_def_cfa_offset 24
2828; AVX1-NEXT:    pushq %rbx
2829; AVX1-NEXT:  .Ltmp20:
2830; AVX1-NEXT:    .cfi_def_cfa_offset 32
2831; AVX1-NEXT:    subq $64, %rsp
2832; AVX1-NEXT:  .Ltmp21:
2833; AVX1-NEXT:    .cfi_def_cfa_offset 96
2834; AVX1-NEXT:  .Ltmp22:
2835; AVX1-NEXT:    .cfi_offset %rbx, -32
2836; AVX1-NEXT:  .Ltmp23:
2837; AVX1-NEXT:    .cfi_offset %r14, -24
2838; AVX1-NEXT:  .Ltmp24:
2839; AVX1-NEXT:    .cfi_offset %r15, -16
2840; AVX1-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
2841; AVX1-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
2842; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2843; AVX1-NEXT:    vzeroupper
2844; AVX1-NEXT:    callq __truncdfhf2
2845; AVX1-NEXT:    movw %ax, %bx
2846; AVX1-NEXT:    shll $16, %ebx
2847; AVX1-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
2848; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2849; AVX1-NEXT:    vzeroupper
2850; AVX1-NEXT:    callq __truncdfhf2
2851; AVX1-NEXT:    movzwl %ax, %r15d
2852; AVX1-NEXT:    orl %ebx, %r15d
2853; AVX1-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
2854; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2855; AVX1-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
2856; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2857; AVX1-NEXT:    vzeroupper
2858; AVX1-NEXT:    callq __truncdfhf2
2859; AVX1-NEXT:    movw %ax, %bx
2860; AVX1-NEXT:    shll $16, %ebx
2861; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
2862; AVX1-NEXT:    callq __truncdfhf2
2863; AVX1-NEXT:    movzwl %ax, %r14d
2864; AVX1-NEXT:    orl %ebx, %r14d
2865; AVX1-NEXT:    shlq $32, %r14
2866; AVX1-NEXT:    orq %r15, %r14
2867; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
2868; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2869; AVX1-NEXT:    vzeroupper
2870; AVX1-NEXT:    callq __truncdfhf2
2871; AVX1-NEXT:    movw %ax, %bx
2872; AVX1-NEXT:    shll $16, %ebx
2873; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
2874; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2875; AVX1-NEXT:    vzeroupper
2876; AVX1-NEXT:    callq __truncdfhf2
2877; AVX1-NEXT:    movzwl %ax, %r15d
2878; AVX1-NEXT:    orl %ebx, %r15d
2879; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
2880; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2881; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
2882; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2883; AVX1-NEXT:    vzeroupper
2884; AVX1-NEXT:    callq __truncdfhf2
2885; AVX1-NEXT:    movw %ax, %bx
2886; AVX1-NEXT:    shll $16, %ebx
2887; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
2888; AVX1-NEXT:    callq __truncdfhf2
2889; AVX1-NEXT:    movzwl %ax, %eax
2890; AVX1-NEXT:    orl %ebx, %eax
2891; AVX1-NEXT:    shlq $32, %rax
2892; AVX1-NEXT:    orq %r15, %rax
2893; AVX1-NEXT:    vmovq %rax, %xmm0
2894; AVX1-NEXT:    vmovq %r14, %xmm1
2895; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2896; AVX1-NEXT:    addq $64, %rsp
2897; AVX1-NEXT:    popq %rbx
2898; AVX1-NEXT:    popq %r14
2899; AVX1-NEXT:    popq %r15
2900; AVX1-NEXT:    retq
2901;
2902; AVX2-LABEL: cvt_8f64_to_8i16:
2903; AVX2:       # BB#0:
2904; AVX2-NEXT:    pushq %r15
2905; AVX2-NEXT:  .Ltmp18:
2906; AVX2-NEXT:    .cfi_def_cfa_offset 16
2907; AVX2-NEXT:    pushq %r14
2908; AVX2-NEXT:  .Ltmp19:
2909; AVX2-NEXT:    .cfi_def_cfa_offset 24
2910; AVX2-NEXT:    pushq %rbx
2911; AVX2-NEXT:  .Ltmp20:
2912; AVX2-NEXT:    .cfi_def_cfa_offset 32
2913; AVX2-NEXT:    subq $64, %rsp
2914; AVX2-NEXT:  .Ltmp21:
2915; AVX2-NEXT:    .cfi_def_cfa_offset 96
2916; AVX2-NEXT:  .Ltmp22:
2917; AVX2-NEXT:    .cfi_offset %rbx, -32
2918; AVX2-NEXT:  .Ltmp23:
2919; AVX2-NEXT:    .cfi_offset %r14, -24
2920; AVX2-NEXT:  .Ltmp24:
2921; AVX2-NEXT:    .cfi_offset %r15, -16
2922; AVX2-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
2923; AVX2-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
2924; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2925; AVX2-NEXT:    vzeroupper
2926; AVX2-NEXT:    callq __truncdfhf2
2927; AVX2-NEXT:    movw %ax, %bx
2928; AVX2-NEXT:    shll $16, %ebx
2929; AVX2-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
2930; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2931; AVX2-NEXT:    vzeroupper
2932; AVX2-NEXT:    callq __truncdfhf2
2933; AVX2-NEXT:    movzwl %ax, %r15d
2934; AVX2-NEXT:    orl %ebx, %r15d
2935; AVX2-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
2936; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
2937; AVX2-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
2938; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2939; AVX2-NEXT:    vzeroupper
2940; AVX2-NEXT:    callq __truncdfhf2
2941; AVX2-NEXT:    movw %ax, %bx
2942; AVX2-NEXT:    shll $16, %ebx
2943; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
2944; AVX2-NEXT:    callq __truncdfhf2
2945; AVX2-NEXT:    movzwl %ax, %r14d
2946; AVX2-NEXT:    orl %ebx, %r14d
2947; AVX2-NEXT:    shlq $32, %r14
2948; AVX2-NEXT:    orq %r15, %r14
2949; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
2950; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2951; AVX2-NEXT:    vzeroupper
2952; AVX2-NEXT:    callq __truncdfhf2
2953; AVX2-NEXT:    movw %ax, %bx
2954; AVX2-NEXT:    shll $16, %ebx
2955; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
2956; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2957; AVX2-NEXT:    vzeroupper
2958; AVX2-NEXT:    callq __truncdfhf2
2959; AVX2-NEXT:    movzwl %ax, %r15d
2960; AVX2-NEXT:    orl %ebx, %r15d
2961; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
2962; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
2963; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
2964; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2965; AVX2-NEXT:    vzeroupper
2966; AVX2-NEXT:    callq __truncdfhf2
2967; AVX2-NEXT:    movw %ax, %bx
2968; AVX2-NEXT:    shll $16, %ebx
2969; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
2970; AVX2-NEXT:    callq __truncdfhf2
2971; AVX2-NEXT:    movzwl %ax, %eax
2972; AVX2-NEXT:    orl %ebx, %eax
2973; AVX2-NEXT:    shlq $32, %rax
2974; AVX2-NEXT:    orq %r15, %rax
2975; AVX2-NEXT:    vmovq %rax, %xmm0
2976; AVX2-NEXT:    vmovq %r14, %xmm1
2977; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2978; AVX2-NEXT:    addq $64, %rsp
2979; AVX2-NEXT:    popq %rbx
2980; AVX2-NEXT:    popq %r14
2981; AVX2-NEXT:    popq %r15
2982; AVX2-NEXT:    retq
2983;
2984; AVX512-LABEL: cvt_8f64_to_8i16:
2985; AVX512:       # BB#0:
2986; AVX512-NEXT:    pushq %r15
2987; AVX512-NEXT:  .Ltmp18:
2988; AVX512-NEXT:    .cfi_def_cfa_offset 16
2989; AVX512-NEXT:    pushq %r14
2990; AVX512-NEXT:  .Ltmp19:
2991; AVX512-NEXT:    .cfi_def_cfa_offset 24
2992; AVX512-NEXT:    pushq %rbx
2993; AVX512-NEXT:  .Ltmp20:
2994; AVX512-NEXT:    .cfi_def_cfa_offset 32
2995; AVX512-NEXT:    subq $96, %rsp
2996; AVX512-NEXT:  .Ltmp21:
2997; AVX512-NEXT:    .cfi_def_cfa_offset 128
2998; AVX512-NEXT:  .Ltmp22:
2999; AVX512-NEXT:    .cfi_offset %rbx, -32
3000; AVX512-NEXT:  .Ltmp23:
3001; AVX512-NEXT:    .cfi_offset %r14, -24
3002; AVX512-NEXT:  .Ltmp24:
3003; AVX512-NEXT:    .cfi_offset %r15, -16
3004; AVX512-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
3005; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3006; AVX512-NEXT:    callq __truncdfhf2
3007; AVX512-NEXT:    movw %ax, %bx
3008; AVX512-NEXT:    shll $16, %ebx
3009; AVX512-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
3010; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
3011; AVX512-NEXT:    callq __truncdfhf2
3012; AVX512-NEXT:    movzwl %ax, %r15d
3013; AVX512-NEXT:    orl %ebx, %r15d
3014; AVX512-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
3015; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
3016; AVX512-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
3017; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3018; AVX512-NEXT:    callq __truncdfhf2
3019; AVX512-NEXT:    movw %ax, %bx
3020; AVX512-NEXT:    shll $16, %ebx
3021; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
3022; AVX512-NEXT:    callq __truncdfhf2
3023; AVX512-NEXT:    movzwl %ax, %r14d
3024; AVX512-NEXT:    orl %ebx, %r14d
3025; AVX512-NEXT:    shlq $32, %r14
3026; AVX512-NEXT:    orq %r15, %r14
3027; AVX512-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
3028; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
3029; AVX512-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3030; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3031; AVX512-NEXT:    callq __truncdfhf2
3032; AVX512-NEXT:    movw %ax, %bx
3033; AVX512-NEXT:    shll $16, %ebx
3034; AVX512-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3035; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3036; AVX512-NEXT:    callq __truncdfhf2
3037; AVX512-NEXT:    movzwl %ax, %r15d
3038; AVX512-NEXT:    orl %ebx, %r15d
3039; AVX512-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
3040; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
3041; AVX512-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3042; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3043; AVX512-NEXT:    callq __truncdfhf2
3044; AVX512-NEXT:    movw %ax, %bx
3045; AVX512-NEXT:    shll $16, %ebx
3046; AVX512-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3047; AVX512-NEXT:    callq __truncdfhf2
3048; AVX512-NEXT:    movzwl %ax, %eax
3049; AVX512-NEXT:    orl %ebx, %eax
3050; AVX512-NEXT:    shlq $32, %rax
3051; AVX512-NEXT:    orq %r15, %rax
3052; AVX512-NEXT:    vmovq %rax, %xmm0
3053; AVX512-NEXT:    vmovq %r14, %xmm1
3054; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
3055; AVX512-NEXT:    addq $96, %rsp
3056; AVX512-NEXT:    popq %rbx
3057; AVX512-NEXT:    popq %r14
3058; AVX512-NEXT:    popq %r15
3059; AVX512-NEXT:    retq
3060  %1 = fptrunc <8 x double> %a0 to <8 x half>
3061  %2 = bitcast <8 x half> %1 to <8 x i16>
3062  ret <8 x i16> %2
3063}
3064
3065;
3066; Double to Half (Store)
3067;
3068
; Scalar double -> half truncation whose i16 bit pattern is stored through %a1.
; All three run configurations share one expectation: the destination pointer
; (rdi) is kept in rbx across the __truncdfhf2 call, then the 16-bit result is
; written with a single movw.
3069define void @store_cvt_f64_to_i16(double %a0, i16* %a1) {
3070; ALL-LABEL: store_cvt_f64_to_i16:
3071; ALL:       # BB#0:
3072; ALL-NEXT:    pushq %rbx
3073; ALL-NEXT:  .Ltmp25:
3074; ALL-NEXT:    .cfi_def_cfa_offset 16
3075; ALL-NEXT:  .Ltmp26:
3076; ALL-NEXT:    .cfi_offset %rbx, -16
3077; ALL-NEXT:    movq %rdi, %rbx
3078; ALL-NEXT:    callq __truncdfhf2
3079; ALL-NEXT:    movw %ax, (%rbx)
3080; ALL-NEXT:    popq %rbx
3081; ALL-NEXT:    retq
3082  %1 = fptrunc double %a0 to half
3083  %2 = bitcast half %1 to i16
3084  store i16 %2, i16* %a1
3085  ret void
3086}
3087
; <2 x double> -> <2 x half> truncation whose <2 x i16> bit pattern is stored
; through %a1.
; All three run configurations share one expectation: the destination pointer
; is kept in rbx, element 1 is converted first and its result held in ebp
; across the second __truncdfhf2 call, then the two halves are written with
; separate movw stores at offsets 0 and 2 (no packing in a GPR).
3088define void @store_cvt_2f64_to_2i16(<2 x double> %a0, <2 x i16>* %a1) {
3089; ALL-LABEL: store_cvt_2f64_to_2i16:
3090; ALL:       # BB#0:
3091; ALL-NEXT:    pushq %rbp
3092; ALL-NEXT:  .Ltmp27:
3093; ALL-NEXT:    .cfi_def_cfa_offset 16
3094; ALL-NEXT:    pushq %rbx
3095; ALL-NEXT:  .Ltmp28:
3096; ALL-NEXT:    .cfi_def_cfa_offset 24
3097; ALL-NEXT:    subq $24, %rsp
3098; ALL-NEXT:  .Ltmp29:
3099; ALL-NEXT:    .cfi_def_cfa_offset 48
3100; ALL-NEXT:  .Ltmp30:
3101; ALL-NEXT:    .cfi_offset %rbx, -24
3102; ALL-NEXT:  .Ltmp31:
3103; ALL-NEXT:    .cfi_offset %rbp, -16
3104; ALL-NEXT:    movq %rdi, %rbx
3105; ALL-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3106; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3107; ALL-NEXT:    callq __truncdfhf2
3108; ALL-NEXT:    movl %eax, %ebp
3109; ALL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3110; ALL-NEXT:    callq __truncdfhf2
3111; ALL-NEXT:    movw %ax, (%rbx)
3112; ALL-NEXT:    movw %bp, 2(%rbx)
3113; ALL-NEXT:    addq $24, %rsp
3114; ALL-NEXT:    popq %rbx
3115; ALL-NEXT:    popq %rbp
3116; ALL-NEXT:    retq
3117  %1 = fptrunc <2 x double> %a0 to <2 x half>
3118  %2 = bitcast <2 x half> %1 to <2 x i16>
3119  store <2 x i16> %2, <2 x i16>* %a1
3120  ret void
3121}
3122
; Four-element store test. Truncates <4 x double> to <4 x half>, bitcasts to
; <4 x i16> and stores the vector through %a1.
; Expectations are listed separately for the AVX1, AVX2 and AVX512 prefixes.
; Each one spills the ymm0 input, makes four __truncdfhf2 calls (one per
; element, using vpermilpd and vextractf128 to reach the other lanes), keeps
; three results in %r14d, %r15d and %ebp, and finishes with four 16-bit stores
; at offsets 0, 2, 4 and 6 from %rbx.
; The AVX1 and AVX2 expectations contain vzeroupper before three of the four
; calls. The AVX512 expectations contain no vzeroupper.
3123define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) {
3124; AVX1-LABEL: store_cvt_4f64_to_4i16:
3125; AVX1:       # BB#0:
3126; AVX1-NEXT:    pushq %rbp
3127; AVX1-NEXT:  .Ltmp32:
3128; AVX1-NEXT:    .cfi_def_cfa_offset 16
3129; AVX1-NEXT:    pushq %r15
3130; AVX1-NEXT:  .Ltmp33:
3131; AVX1-NEXT:    .cfi_def_cfa_offset 24
3132; AVX1-NEXT:    pushq %r14
3133; AVX1-NEXT:  .Ltmp34:
3134; AVX1-NEXT:    .cfi_def_cfa_offset 32
3135; AVX1-NEXT:    pushq %rbx
3136; AVX1-NEXT:  .Ltmp35:
3137; AVX1-NEXT:    .cfi_def_cfa_offset 40
3138; AVX1-NEXT:    subq $88, %rsp
3139; AVX1-NEXT:  .Ltmp36:
3140; AVX1-NEXT:    .cfi_def_cfa_offset 128
3141; AVX1-NEXT:  .Ltmp37:
3142; AVX1-NEXT:    .cfi_offset %rbx, -40
3143; AVX1-NEXT:  .Ltmp38:
3144; AVX1-NEXT:    .cfi_offset %r14, -32
3145; AVX1-NEXT:  .Ltmp39:
3146; AVX1-NEXT:    .cfi_offset %r15, -24
3147; AVX1-NEXT:  .Ltmp40:
3148; AVX1-NEXT:    .cfi_offset %rbp, -16
3149; AVX1-NEXT:    movq %rdi, %rbx
3150; AVX1-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
3151; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3152; AVX1-NEXT:    vzeroupper
3153; AVX1-NEXT:    callq __truncdfhf2
3154; AVX1-NEXT:    movl %eax, %r14d
3155; AVX1-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
3156; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
3157; AVX1-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
3158; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3159; AVX1-NEXT:    vzeroupper
3160; AVX1-NEXT:    callq __truncdfhf2
3161; AVX1-NEXT:    movl %eax, %r15d
3162; AVX1-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
3163; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3164; AVX1-NEXT:    vzeroupper
3165; AVX1-NEXT:    callq __truncdfhf2
3166; AVX1-NEXT:    movl %eax, %ebp
3167; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
3168; AVX1-NEXT:    callq __truncdfhf2
3169; AVX1-NEXT:    movw %ax, 4(%rbx)
3170; AVX1-NEXT:    movw %bp, (%rbx)
3171; AVX1-NEXT:    movw %r15w, 6(%rbx)
3172; AVX1-NEXT:    movw %r14w, 2(%rbx)
3173; AVX1-NEXT:    addq $88, %rsp
3174; AVX1-NEXT:    popq %rbx
3175; AVX1-NEXT:    popq %r14
3176; AVX1-NEXT:    popq %r15
3177; AVX1-NEXT:    popq %rbp
3178; AVX1-NEXT:    retq
3179;
3180; AVX2-LABEL: store_cvt_4f64_to_4i16:
3181; AVX2:       # BB#0:
3182; AVX2-NEXT:    pushq %rbp
3183; AVX2-NEXT:  .Ltmp32:
3184; AVX2-NEXT:    .cfi_def_cfa_offset 16
3185; AVX2-NEXT:    pushq %r15
3186; AVX2-NEXT:  .Ltmp33:
3187; AVX2-NEXT:    .cfi_def_cfa_offset 24
3188; AVX2-NEXT:    pushq %r14
3189; AVX2-NEXT:  .Ltmp34:
3190; AVX2-NEXT:    .cfi_def_cfa_offset 32
3191; AVX2-NEXT:    pushq %rbx
3192; AVX2-NEXT:  .Ltmp35:
3193; AVX2-NEXT:    .cfi_def_cfa_offset 40
3194; AVX2-NEXT:    subq $88, %rsp
3195; AVX2-NEXT:  .Ltmp36:
3196; AVX2-NEXT:    .cfi_def_cfa_offset 128
3197; AVX2-NEXT:  .Ltmp37:
3198; AVX2-NEXT:    .cfi_offset %rbx, -40
3199; AVX2-NEXT:  .Ltmp38:
3200; AVX2-NEXT:    .cfi_offset %r14, -32
3201; AVX2-NEXT:  .Ltmp39:
3202; AVX2-NEXT:    .cfi_offset %r15, -24
3203; AVX2-NEXT:  .Ltmp40:
3204; AVX2-NEXT:    .cfi_offset %rbp, -16
3205; AVX2-NEXT:    movq %rdi, %rbx
3206; AVX2-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
3207; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3208; AVX2-NEXT:    vzeroupper
3209; AVX2-NEXT:    callq __truncdfhf2
3210; AVX2-NEXT:    movl %eax, %r14d
3211; AVX2-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
3212; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
3213; AVX2-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
3214; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3215; AVX2-NEXT:    vzeroupper
3216; AVX2-NEXT:    callq __truncdfhf2
3217; AVX2-NEXT:    movl %eax, %r15d
3218; AVX2-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
3219; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3220; AVX2-NEXT:    vzeroupper
3221; AVX2-NEXT:    callq __truncdfhf2
3222; AVX2-NEXT:    movl %eax, %ebp
3223; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
3224; AVX2-NEXT:    callq __truncdfhf2
3225; AVX2-NEXT:    movw %ax, 4(%rbx)
3226; AVX2-NEXT:    movw %bp, (%rbx)
3227; AVX2-NEXT:    movw %r15w, 6(%rbx)
3228; AVX2-NEXT:    movw %r14w, 2(%rbx)
3229; AVX2-NEXT:    addq $88, %rsp
3230; AVX2-NEXT:    popq %rbx
3231; AVX2-NEXT:    popq %r14
3232; AVX2-NEXT:    popq %r15
3233; AVX2-NEXT:    popq %rbp
3234; AVX2-NEXT:    retq
3235;
3236; AVX512-LABEL: store_cvt_4f64_to_4i16:
3237; AVX512:       # BB#0:
3238; AVX512-NEXT:    pushq %rbp
3239; AVX512-NEXT:  .Ltmp32:
3240; AVX512-NEXT:    .cfi_def_cfa_offset 16
3241; AVX512-NEXT:    pushq %r15
3242; AVX512-NEXT:  .Ltmp33:
3243; AVX512-NEXT:    .cfi_def_cfa_offset 24
3244; AVX512-NEXT:    pushq %r14
3245; AVX512-NEXT:  .Ltmp34:
3246; AVX512-NEXT:    .cfi_def_cfa_offset 32
3247; AVX512-NEXT:    pushq %rbx
3248; AVX512-NEXT:  .Ltmp35:
3249; AVX512-NEXT:    .cfi_def_cfa_offset 40
3250; AVX512-NEXT:    subq $88, %rsp
3251; AVX512-NEXT:  .Ltmp36:
3252; AVX512-NEXT:    .cfi_def_cfa_offset 128
3253; AVX512-NEXT:  .Ltmp37:
3254; AVX512-NEXT:    .cfi_offset %rbx, -40
3255; AVX512-NEXT:  .Ltmp38:
3256; AVX512-NEXT:    .cfi_offset %r14, -32
3257; AVX512-NEXT:  .Ltmp39:
3258; AVX512-NEXT:    .cfi_offset %r15, -24
3259; AVX512-NEXT:  .Ltmp40:
3260; AVX512-NEXT:    .cfi_offset %rbp, -16
3261; AVX512-NEXT:    movq %rdi, %rbx
3262; AVX512-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
3263; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3264; AVX512-NEXT:    callq __truncdfhf2
3265; AVX512-NEXT:    movl %eax, %r14d
3266; AVX512-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
3267; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
3268; AVX512-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
3269; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3270; AVX512-NEXT:    callq __truncdfhf2
3271; AVX512-NEXT:    movl %eax, %r15d
3272; AVX512-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
3273; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3274; AVX512-NEXT:    callq __truncdfhf2
3275; AVX512-NEXT:    movl %eax, %ebp
3276; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
3277; AVX512-NEXT:    callq __truncdfhf2
3278; AVX512-NEXT:    movw %ax, 4(%rbx)
3279; AVX512-NEXT:    movw %bp, (%rbx)
3280; AVX512-NEXT:    movw %r15w, 6(%rbx)
3281; AVX512-NEXT:    movw %r14w, 2(%rbx)
3282; AVX512-NEXT:    addq $88, %rsp
3283; AVX512-NEXT:    popq %rbx
3284; AVX512-NEXT:    popq %r14
3285; AVX512-NEXT:    popq %r15
3286; AVX512-NEXT:    popq %rbp
3287; AVX512-NEXT:    retq
3288  %1 = fptrunc <4 x double> %a0 to <4 x half>
3289  %2 = bitcast <4 x half> %1 to <4 x i16>
3290  store <4 x i16> %2, <4 x i16>* %a1
3291  ret void
3292}
3293
; Widening store test with undef upper lanes. Truncates <4 x double> to
; <4 x half>, bitcasts to <4 x i16>, widens to <8 x i16> with a shufflevector
; whose second operand is undef, and stores the 128-bit vector through %a1.
; Expectations are listed separately for the AVX1, AVX2 and AVX512 prefixes.
; Each one makes four __truncdfhf2 calls, merges the 16-bit results into %rax
; with shll/orl and shlq/orq, moves %rax into xmm0 with vmovq, applies a
; vpshufb and writes the result with a single vmovdqa to (%r14).
; The AVX1 and AVX2 expectations contain vzeroupper before three of the four
; calls. The AVX512 expectations contain no vzeroupper.
3294define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) {
3295; AVX1-LABEL: store_cvt_4f64_to_8i16_undef:
3296; AVX1:       # BB#0:
3297; AVX1-NEXT:    pushq %rbp
3298; AVX1-NEXT:  .Ltmp41:
3299; AVX1-NEXT:    .cfi_def_cfa_offset 16
3300; AVX1-NEXT:    pushq %r14
3301; AVX1-NEXT:  .Ltmp42:
3302; AVX1-NEXT:    .cfi_def_cfa_offset 24
3303; AVX1-NEXT:    pushq %rbx
3304; AVX1-NEXT:  .Ltmp43:
3305; AVX1-NEXT:    .cfi_def_cfa_offset 32
3306; AVX1-NEXT:    subq $32, %rsp
3307; AVX1-NEXT:  .Ltmp44:
3308; AVX1-NEXT:    .cfi_def_cfa_offset 64
3309; AVX1-NEXT:  .Ltmp45:
3310; AVX1-NEXT:    .cfi_offset %rbx, -32
3311; AVX1-NEXT:  .Ltmp46:
3312; AVX1-NEXT:    .cfi_offset %r14, -24
3313; AVX1-NEXT:  .Ltmp47:
3314; AVX1-NEXT:    .cfi_offset %rbp, -16
3315; AVX1-NEXT:    movq %rdi, %r14
3316; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3317; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3318; AVX1-NEXT:    vzeroupper
3319; AVX1-NEXT:    callq __truncdfhf2
3320; AVX1-NEXT:    movw %ax, %bp
3321; AVX1-NEXT:    shll $16, %ebp
3322; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3323; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3324; AVX1-NEXT:    vzeroupper
3325; AVX1-NEXT:    callq __truncdfhf2
3326; AVX1-NEXT:    movzwl %ax, %ebx
3327; AVX1-NEXT:    orl %ebp, %ebx
3328; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
3329; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
3330; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3331; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3332; AVX1-NEXT:    vzeroupper
3333; AVX1-NEXT:    callq __truncdfhf2
3334; AVX1-NEXT:    movw %ax, %bp
3335; AVX1-NEXT:    shll $16, %ebp
3336; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3337; AVX1-NEXT:    callq __truncdfhf2
3338; AVX1-NEXT:    movzwl %ax, %eax
3339; AVX1-NEXT:    orl %ebp, %eax
3340; AVX1-NEXT:    shlq $32, %rax
3341; AVX1-NEXT:    orq %rbx, %rax
3342; AVX1-NEXT:    vmovq %rax, %xmm0
3343; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
3344; AVX1-NEXT:    vmovdqa %xmm0, (%r14)
3345; AVX1-NEXT:    addq $32, %rsp
3346; AVX1-NEXT:    popq %rbx
3347; AVX1-NEXT:    popq %r14
3348; AVX1-NEXT:    popq %rbp
3349; AVX1-NEXT:    retq
3350;
3351; AVX2-LABEL: store_cvt_4f64_to_8i16_undef:
3352; AVX2:       # BB#0:
3353; AVX2-NEXT:    pushq %rbp
3354; AVX2-NEXT:  .Ltmp41:
3355; AVX2-NEXT:    .cfi_def_cfa_offset 16
3356; AVX2-NEXT:    pushq %r14
3357; AVX2-NEXT:  .Ltmp42:
3358; AVX2-NEXT:    .cfi_def_cfa_offset 24
3359; AVX2-NEXT:    pushq %rbx
3360; AVX2-NEXT:  .Ltmp43:
3361; AVX2-NEXT:    .cfi_def_cfa_offset 32
3362; AVX2-NEXT:    subq $32, %rsp
3363; AVX2-NEXT:  .Ltmp44:
3364; AVX2-NEXT:    .cfi_def_cfa_offset 64
3365; AVX2-NEXT:  .Ltmp45:
3366; AVX2-NEXT:    .cfi_offset %rbx, -32
3367; AVX2-NEXT:  .Ltmp46:
3368; AVX2-NEXT:    .cfi_offset %r14, -24
3369; AVX2-NEXT:  .Ltmp47:
3370; AVX2-NEXT:    .cfi_offset %rbp, -16
3371; AVX2-NEXT:    movq %rdi, %r14
3372; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3373; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3374; AVX2-NEXT:    vzeroupper
3375; AVX2-NEXT:    callq __truncdfhf2
3376; AVX2-NEXT:    movw %ax, %bp
3377; AVX2-NEXT:    shll $16, %ebp
3378; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3379; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3380; AVX2-NEXT:    vzeroupper
3381; AVX2-NEXT:    callq __truncdfhf2
3382; AVX2-NEXT:    movzwl %ax, %ebx
3383; AVX2-NEXT:    orl %ebp, %ebx
3384; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
3385; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
3386; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3387; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3388; AVX2-NEXT:    vzeroupper
3389; AVX2-NEXT:    callq __truncdfhf2
3390; AVX2-NEXT:    movw %ax, %bp
3391; AVX2-NEXT:    shll $16, %ebp
3392; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3393; AVX2-NEXT:    callq __truncdfhf2
3394; AVX2-NEXT:    movzwl %ax, %eax
3395; AVX2-NEXT:    orl %ebp, %eax
3396; AVX2-NEXT:    shlq $32, %rax
3397; AVX2-NEXT:    orq %rbx, %rax
3398; AVX2-NEXT:    vmovq %rax, %xmm0
3399; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
3400; AVX2-NEXT:    vmovdqa %xmm0, (%r14)
3401; AVX2-NEXT:    addq $32, %rsp
3402; AVX2-NEXT:    popq %rbx
3403; AVX2-NEXT:    popq %r14
3404; AVX2-NEXT:    popq %rbp
3405; AVX2-NEXT:    retq
3406;
3407; AVX512-LABEL: store_cvt_4f64_to_8i16_undef:
3408; AVX512:       # BB#0:
3409; AVX512-NEXT:    pushq %rbp
3410; AVX512-NEXT:  .Ltmp41:
3411; AVX512-NEXT:    .cfi_def_cfa_offset 16
3412; AVX512-NEXT:    pushq %r14
3413; AVX512-NEXT:  .Ltmp42:
3414; AVX512-NEXT:    .cfi_def_cfa_offset 24
3415; AVX512-NEXT:    pushq %rbx
3416; AVX512-NEXT:  .Ltmp43:
3417; AVX512-NEXT:    .cfi_def_cfa_offset 32
3418; AVX512-NEXT:    subq $32, %rsp
3419; AVX512-NEXT:  .Ltmp44:
3420; AVX512-NEXT:    .cfi_def_cfa_offset 64
3421; AVX512-NEXT:  .Ltmp45:
3422; AVX512-NEXT:    .cfi_offset %rbx, -32
3423; AVX512-NEXT:  .Ltmp46:
3424; AVX512-NEXT:    .cfi_offset %r14, -24
3425; AVX512-NEXT:  .Ltmp47:
3426; AVX512-NEXT:    .cfi_offset %rbp, -16
3427; AVX512-NEXT:    movq %rdi, %r14
3428; AVX512-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3429; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3430; AVX512-NEXT:    callq __truncdfhf2
3431; AVX512-NEXT:    movw %ax, %bp
3432; AVX512-NEXT:    shll $16, %ebp
3433; AVX512-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3434; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3435; AVX512-NEXT:    callq __truncdfhf2
3436; AVX512-NEXT:    movzwl %ax, %ebx
3437; AVX512-NEXT:    orl %ebp, %ebx
3438; AVX512-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
3439; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
3440; AVX512-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3441; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3442; AVX512-NEXT:    callq __truncdfhf2
3443; AVX512-NEXT:    movw %ax, %bp
3444; AVX512-NEXT:    shll $16, %ebp
3445; AVX512-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3446; AVX512-NEXT:    callq __truncdfhf2
3447; AVX512-NEXT:    movzwl %ax, %eax
3448; AVX512-NEXT:    orl %ebp, %eax
3449; AVX512-NEXT:    shlq $32, %rax
3450; AVX512-NEXT:    orq %rbx, %rax
3451; AVX512-NEXT:    vmovq %rax, %xmm0
3452; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
3453; AVX512-NEXT:    vmovdqa %xmm0, (%r14)
3454; AVX512-NEXT:    addq $32, %rsp
3455; AVX512-NEXT:    popq %rbx
3456; AVX512-NEXT:    popq %r14
3457; AVX512-NEXT:    popq %rbp
3458; AVX512-NEXT:    retq
3459  %1 = fptrunc <4 x double> %a0 to <4 x half>
3460  %2 = bitcast <4 x half> %1 to <4 x i16>
; Mask indices 0-3 select the converted lanes and indices 4-7 select lanes of
; the second shufflevector operand, which is undef in this test.
3461  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3462  store <8 x i16> %3, <8 x i16>* %a1
3463  ret void
3464}
3465
; Widening store test with zeroed upper lanes. Truncates <4 x double> to
; <4 x half>, bitcasts to <4 x i16>, widens to <8 x i16> with a shufflevector
; whose second operand is zeroinitializer, and stores the 128-bit vector
; through %a1.
; Expectations are listed separately for the AVX1, AVX2 and AVX512 prefixes.
; Each one makes four __truncdfhf2 calls, merges the 16-bit results into %rax
; with shll/orl and shlq/orq, moves %rax into xmm0 with vmovq, applies a
; vpshufb whose upper eight result bytes are zero, and writes the result with
; a single vmovdqa to (%r14).
; The AVX1 and AVX2 expectations contain vzeroupper before three of the four
; calls. The AVX512 expectations contain no vzeroupper.
3466define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) {
3467; AVX1-LABEL: store_cvt_4f64_to_8i16_zero:
3468; AVX1:       # BB#0:
3469; AVX1-NEXT:    pushq %rbp
3470; AVX1-NEXT:  .Ltmp48:
3471; AVX1-NEXT:    .cfi_def_cfa_offset 16
3472; AVX1-NEXT:    pushq %r14
3473; AVX1-NEXT:  .Ltmp49:
3474; AVX1-NEXT:    .cfi_def_cfa_offset 24
3475; AVX1-NEXT:    pushq %rbx
3476; AVX1-NEXT:  .Ltmp50:
3477; AVX1-NEXT:    .cfi_def_cfa_offset 32
3478; AVX1-NEXT:    subq $32, %rsp
3479; AVX1-NEXT:  .Ltmp51:
3480; AVX1-NEXT:    .cfi_def_cfa_offset 64
3481; AVX1-NEXT:  .Ltmp52:
3482; AVX1-NEXT:    .cfi_offset %rbx, -32
3483; AVX1-NEXT:  .Ltmp53:
3484; AVX1-NEXT:    .cfi_offset %r14, -24
3485; AVX1-NEXT:  .Ltmp54:
3486; AVX1-NEXT:    .cfi_offset %rbp, -16
3487; AVX1-NEXT:    movq %rdi, %r14
3488; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3489; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3490; AVX1-NEXT:    vzeroupper
3491; AVX1-NEXT:    callq __truncdfhf2
3492; AVX1-NEXT:    movw %ax, %bp
3493; AVX1-NEXT:    shll $16, %ebp
3494; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3495; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3496; AVX1-NEXT:    vzeroupper
3497; AVX1-NEXT:    callq __truncdfhf2
3498; AVX1-NEXT:    movzwl %ax, %ebx
3499; AVX1-NEXT:    orl %ebp, %ebx
3500; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
3501; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
3502; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3503; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3504; AVX1-NEXT:    vzeroupper
3505; AVX1-NEXT:    callq __truncdfhf2
3506; AVX1-NEXT:    movw %ax, %bp
3507; AVX1-NEXT:    shll $16, %ebp
3508; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3509; AVX1-NEXT:    callq __truncdfhf2
3510; AVX1-NEXT:    movzwl %ax, %eax
3511; AVX1-NEXT:    orl %ebp, %eax
3512; AVX1-NEXT:    shlq $32, %rax
3513; AVX1-NEXT:    orq %rbx, %rax
3514; AVX1-NEXT:    vmovq %rax, %xmm0
3515; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
3516; AVX1-NEXT:    vmovdqa %xmm0, (%r14)
3517; AVX1-NEXT:    addq $32, %rsp
3518; AVX1-NEXT:    popq %rbx
3519; AVX1-NEXT:    popq %r14
3520; AVX1-NEXT:    popq %rbp
3521; AVX1-NEXT:    retq
3522;
3523; AVX2-LABEL: store_cvt_4f64_to_8i16_zero:
3524; AVX2:       # BB#0:
3525; AVX2-NEXT:    pushq %rbp
3526; AVX2-NEXT:  .Ltmp48:
3527; AVX2-NEXT:    .cfi_def_cfa_offset 16
3528; AVX2-NEXT:    pushq %r14
3529; AVX2-NEXT:  .Ltmp49:
3530; AVX2-NEXT:    .cfi_def_cfa_offset 24
3531; AVX2-NEXT:    pushq %rbx
3532; AVX2-NEXT:  .Ltmp50:
3533; AVX2-NEXT:    .cfi_def_cfa_offset 32
3534; AVX2-NEXT:    subq $32, %rsp
3535; AVX2-NEXT:  .Ltmp51:
3536; AVX2-NEXT:    .cfi_def_cfa_offset 64
3537; AVX2-NEXT:  .Ltmp52:
3538; AVX2-NEXT:    .cfi_offset %rbx, -32
3539; AVX2-NEXT:  .Ltmp53:
3540; AVX2-NEXT:    .cfi_offset %r14, -24
3541; AVX2-NEXT:  .Ltmp54:
3542; AVX2-NEXT:    .cfi_offset %rbp, -16
3543; AVX2-NEXT:    movq %rdi, %r14
3544; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3545; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3546; AVX2-NEXT:    vzeroupper
3547; AVX2-NEXT:    callq __truncdfhf2
3548; AVX2-NEXT:    movw %ax, %bp
3549; AVX2-NEXT:    shll $16, %ebp
3550; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3551; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3552; AVX2-NEXT:    vzeroupper
3553; AVX2-NEXT:    callq __truncdfhf2
3554; AVX2-NEXT:    movzwl %ax, %ebx
3555; AVX2-NEXT:    orl %ebp, %ebx
3556; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
3557; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
3558; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3559; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3560; AVX2-NEXT:    vzeroupper
3561; AVX2-NEXT:    callq __truncdfhf2
3562; AVX2-NEXT:    movw %ax, %bp
3563; AVX2-NEXT:    shll $16, %ebp
3564; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3565; AVX2-NEXT:    callq __truncdfhf2
3566; AVX2-NEXT:    movzwl %ax, %eax
3567; AVX2-NEXT:    orl %ebp, %eax
3568; AVX2-NEXT:    shlq $32, %rax
3569; AVX2-NEXT:    orq %rbx, %rax
3570; AVX2-NEXT:    vmovq %rax, %xmm0
3571; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
3572; AVX2-NEXT:    vmovdqa %xmm0, (%r14)
3573; AVX2-NEXT:    addq $32, %rsp
3574; AVX2-NEXT:    popq %rbx
3575; AVX2-NEXT:    popq %r14
3576; AVX2-NEXT:    popq %rbp
3577; AVX2-NEXT:    retq
3578;
3579; AVX512-LABEL: store_cvt_4f64_to_8i16_zero:
3580; AVX512:       # BB#0:
3581; AVX512-NEXT:    pushq %rbp
3582; AVX512-NEXT:  .Ltmp48:
3583; AVX512-NEXT:    .cfi_def_cfa_offset 16
3584; AVX512-NEXT:    pushq %r14
3585; AVX512-NEXT:  .Ltmp49:
3586; AVX512-NEXT:    .cfi_def_cfa_offset 24
3587; AVX512-NEXT:    pushq %rbx
3588; AVX512-NEXT:  .Ltmp50:
3589; AVX512-NEXT:    .cfi_def_cfa_offset 32
3590; AVX512-NEXT:    subq $32, %rsp
3591; AVX512-NEXT:  .Ltmp51:
3592; AVX512-NEXT:    .cfi_def_cfa_offset 64
3593; AVX512-NEXT:  .Ltmp52:
3594; AVX512-NEXT:    .cfi_offset %rbx, -32
3595; AVX512-NEXT:  .Ltmp53:
3596; AVX512-NEXT:    .cfi_offset %r14, -24
3597; AVX512-NEXT:  .Ltmp54:
3598; AVX512-NEXT:    .cfi_offset %rbp, -16
3599; AVX512-NEXT:    movq %rdi, %r14
3600; AVX512-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3601; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3602; AVX512-NEXT:    callq __truncdfhf2
3603; AVX512-NEXT:    movw %ax, %bp
3604; AVX512-NEXT:    shll $16, %ebp
3605; AVX512-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3606; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3607; AVX512-NEXT:    callq __truncdfhf2
3608; AVX512-NEXT:    movzwl %ax, %ebx
3609; AVX512-NEXT:    orl %ebp, %ebx
3610; AVX512-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
3611; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
3612; AVX512-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3613; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3614; AVX512-NEXT:    callq __truncdfhf2
3615; AVX512-NEXT:    movw %ax, %bp
3616; AVX512-NEXT:    shll $16, %ebp
3617; AVX512-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3618; AVX512-NEXT:    callq __truncdfhf2
3619; AVX512-NEXT:    movzwl %ax, %eax
3620; AVX512-NEXT:    orl %ebp, %eax
3621; AVX512-NEXT:    shlq $32, %rax
3622; AVX512-NEXT:    orq %rbx, %rax
3623; AVX512-NEXT:    vmovq %rax, %xmm0
3624; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
3625; AVX512-NEXT:    vmovdqa %xmm0, (%r14)
3626; AVX512-NEXT:    addq $32, %rsp
3627; AVX512-NEXT:    popq %rbx
3628; AVX512-NEXT:    popq %r14
3629; AVX512-NEXT:    popq %rbp
3630; AVX512-NEXT:    retq
3631  %1 = fptrunc <4 x double> %a0 to <4 x half>
3632  %2 = bitcast <4 x half> %1 to <4 x i16>
; Mask indices 0-3 select the converted lanes and indices 4-7 select lanes of
; the second shufflevector operand, which is all zeros in this test.
3633  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3634  store <8 x i16> %3, <8 x i16>* %a1
3635  ret void
3636}
3637
; Eight-element store test. Truncates <8 x double> to <8 x half>, bitcasts to
; <8 x i16> and stores the vector through %a1.
; Expectations are listed separately for the AVX1, AVX2 and AVX512 prefixes.
; Each one makes eight __truncdfhf2 calls (one per element). Two results are
; spilled to the stack as 2-byte values, five are held in %r12d, %r13d, %r14d,
; %r15d and %ebp, and the last stays in %ax. They are written with eight
; 16-bit stores at offsets 0 through 14 from %rbx.
; The AVX1 and AVX2 expectations spill the input as two 32-byte values (ymm0
; and ymm1), reserve 136 bytes of stack and contain vzeroupper before several
; of the calls. The AVX512 expectations spill a single 64-byte zmm0, reach the
; upper half with vextractf64x4, reserve 200 bytes of stack and contain no
; vzeroupper.
3638define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) {
3639; AVX1-LABEL: store_cvt_8f64_to_8i16:
3640; AVX1:       # BB#0:
3641; AVX1-NEXT:    pushq %rbp
3642; AVX1-NEXT:  .Ltmp55:
3643; AVX1-NEXT:    .cfi_def_cfa_offset 16
3644; AVX1-NEXT:    pushq %r15
3645; AVX1-NEXT:  .Ltmp56:
3646; AVX1-NEXT:    .cfi_def_cfa_offset 24
3647; AVX1-NEXT:    pushq %r14
3648; AVX1-NEXT:  .Ltmp57:
3649; AVX1-NEXT:    .cfi_def_cfa_offset 32
3650; AVX1-NEXT:    pushq %r13
3651; AVX1-NEXT:  .Ltmp58:
3652; AVX1-NEXT:    .cfi_def_cfa_offset 40
3653; AVX1-NEXT:    pushq %r12
3654; AVX1-NEXT:  .Ltmp59:
3655; AVX1-NEXT:    .cfi_def_cfa_offset 48
3656; AVX1-NEXT:    pushq %rbx
3657; AVX1-NEXT:  .Ltmp60:
3658; AVX1-NEXT:    .cfi_def_cfa_offset 56
3659; AVX1-NEXT:    subq $136, %rsp
3660; AVX1-NEXT:  .Ltmp61:
3661; AVX1-NEXT:    .cfi_def_cfa_offset 192
3662; AVX1-NEXT:  .Ltmp62:
3663; AVX1-NEXT:    .cfi_offset %rbx, -56
3664; AVX1-NEXT:  .Ltmp63:
3665; AVX1-NEXT:    .cfi_offset %r12, -48
3666; AVX1-NEXT:  .Ltmp64:
3667; AVX1-NEXT:    .cfi_offset %r13, -40
3668; AVX1-NEXT:  .Ltmp65:
3669; AVX1-NEXT:    .cfi_offset %r14, -32
3670; AVX1-NEXT:  .Ltmp66:
3671; AVX1-NEXT:    .cfi_offset %r15, -24
3672; AVX1-NEXT:  .Ltmp67:
3673; AVX1-NEXT:    .cfi_offset %rbp, -16
3674; AVX1-NEXT:    movq %rdi, %rbx
3675; AVX1-NEXT:    vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
3676; AVX1-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
3677; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3678; AVX1-NEXT:    vzeroupper
3679; AVX1-NEXT:    callq __truncdfhf2
3680; AVX1-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
3681; AVX1-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
3682; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
3683; AVX1-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
3684; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3685; AVX1-NEXT:    vzeroupper
3686; AVX1-NEXT:    callq __truncdfhf2
3687; AVX1-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
3688; AVX1-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
3689; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3690; AVX1-NEXT:    vzeroupper
3691; AVX1-NEXT:    callq __truncdfhf2
3692; AVX1-NEXT:    movl %eax, %r12d
3693; AVX1-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
3694; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
3695; AVX1-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
3696; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3697; AVX1-NEXT:    vzeroupper
3698; AVX1-NEXT:    callq __truncdfhf2
3699; AVX1-NEXT:    movl %eax, %r13d
3700; AVX1-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
3701; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3702; AVX1-NEXT:    vzeroupper
3703; AVX1-NEXT:    callq __truncdfhf2
3704; AVX1-NEXT:    movl %eax, %ebp
3705; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
3706; AVX1-NEXT:    callq __truncdfhf2
3707; AVX1-NEXT:    movl %eax, %r14d
3708; AVX1-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
3709; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3710; AVX1-NEXT:    vzeroupper
3711; AVX1-NEXT:    callq __truncdfhf2
3712; AVX1-NEXT:    movl %eax, %r15d
3713; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
3714; AVX1-NEXT:    callq __truncdfhf2
3715; AVX1-NEXT:    movw %ax, 12(%rbx)
3716; AVX1-NEXT:    movw %r15w, 8(%rbx)
3717; AVX1-NEXT:    movw %r14w, 4(%rbx)
3718; AVX1-NEXT:    movw %bp, (%rbx)
3719; AVX1-NEXT:    movw %r13w, 14(%rbx)
3720; AVX1-NEXT:    movw %r12w, 10(%rbx)
3721; AVX1-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
3722; AVX1-NEXT:    movw %ax, 6(%rbx)
3723; AVX1-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
3724; AVX1-NEXT:    movw %ax, 2(%rbx)
3725; AVX1-NEXT:    addq $136, %rsp
3726; AVX1-NEXT:    popq %rbx
3727; AVX1-NEXT:    popq %r12
3728; AVX1-NEXT:    popq %r13
3729; AVX1-NEXT:    popq %r14
3730; AVX1-NEXT:    popq %r15
3731; AVX1-NEXT:    popq %rbp
3732; AVX1-NEXT:    retq
3733;
3734; AVX2-LABEL: store_cvt_8f64_to_8i16:
3735; AVX2:       # BB#0:
3736; AVX2-NEXT:    pushq %rbp
3737; AVX2-NEXT:  .Ltmp55:
3738; AVX2-NEXT:    .cfi_def_cfa_offset 16
3739; AVX2-NEXT:    pushq %r15
3740; AVX2-NEXT:  .Ltmp56:
3741; AVX2-NEXT:    .cfi_def_cfa_offset 24
3742; AVX2-NEXT:    pushq %r14
3743; AVX2-NEXT:  .Ltmp57:
3744; AVX2-NEXT:    .cfi_def_cfa_offset 32
3745; AVX2-NEXT:    pushq %r13
3746; AVX2-NEXT:  .Ltmp58:
3747; AVX2-NEXT:    .cfi_def_cfa_offset 40
3748; AVX2-NEXT:    pushq %r12
3749; AVX2-NEXT:  .Ltmp59:
3750; AVX2-NEXT:    .cfi_def_cfa_offset 48
3751; AVX2-NEXT:    pushq %rbx
3752; AVX2-NEXT:  .Ltmp60:
3753; AVX2-NEXT:    .cfi_def_cfa_offset 56
3754; AVX2-NEXT:    subq $136, %rsp
3755; AVX2-NEXT:  .Ltmp61:
3756; AVX2-NEXT:    .cfi_def_cfa_offset 192
3757; AVX2-NEXT:  .Ltmp62:
3758; AVX2-NEXT:    .cfi_offset %rbx, -56
3759; AVX2-NEXT:  .Ltmp63:
3760; AVX2-NEXT:    .cfi_offset %r12, -48
3761; AVX2-NEXT:  .Ltmp64:
3762; AVX2-NEXT:    .cfi_offset %r13, -40
3763; AVX2-NEXT:  .Ltmp65:
3764; AVX2-NEXT:    .cfi_offset %r14, -32
3765; AVX2-NEXT:  .Ltmp66:
3766; AVX2-NEXT:    .cfi_offset %r15, -24
3767; AVX2-NEXT:  .Ltmp67:
3768; AVX2-NEXT:    .cfi_offset %rbp, -16
3769; AVX2-NEXT:    movq %rdi, %rbx
3770; AVX2-NEXT:    vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
3771; AVX2-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
3772; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3773; AVX2-NEXT:    vzeroupper
3774; AVX2-NEXT:    callq __truncdfhf2
3775; AVX2-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
3776; AVX2-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
3777; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
3778; AVX2-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
3779; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3780; AVX2-NEXT:    vzeroupper
3781; AVX2-NEXT:    callq __truncdfhf2
3782; AVX2-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
3783; AVX2-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
3784; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3785; AVX2-NEXT:    vzeroupper
3786; AVX2-NEXT:    callq __truncdfhf2
3787; AVX2-NEXT:    movl %eax, %r12d
3788; AVX2-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
3789; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
3790; AVX2-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
3791; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3792; AVX2-NEXT:    vzeroupper
3793; AVX2-NEXT:    callq __truncdfhf2
3794; AVX2-NEXT:    movl %eax, %r13d
3795; AVX2-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
3796; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3797; AVX2-NEXT:    vzeroupper
3798; AVX2-NEXT:    callq __truncdfhf2
3799; AVX2-NEXT:    movl %eax, %ebp
3800; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
3801; AVX2-NEXT:    callq __truncdfhf2
3802; AVX2-NEXT:    movl %eax, %r14d
3803; AVX2-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
3804; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3805; AVX2-NEXT:    vzeroupper
3806; AVX2-NEXT:    callq __truncdfhf2
3807; AVX2-NEXT:    movl %eax, %r15d
3808; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
3809; AVX2-NEXT:    callq __truncdfhf2
3810; AVX2-NEXT:    movw %ax, 12(%rbx)
3811; AVX2-NEXT:    movw %r15w, 8(%rbx)
3812; AVX2-NEXT:    movw %r14w, 4(%rbx)
3813; AVX2-NEXT:    movw %bp, (%rbx)
3814; AVX2-NEXT:    movw %r13w, 14(%rbx)
3815; AVX2-NEXT:    movw %r12w, 10(%rbx)
3816; AVX2-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
3817; AVX2-NEXT:    movw %ax, 6(%rbx)
3818; AVX2-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
3819; AVX2-NEXT:    movw %ax, 2(%rbx)
3820; AVX2-NEXT:    addq $136, %rsp
3821; AVX2-NEXT:    popq %rbx
3822; AVX2-NEXT:    popq %r12
3823; AVX2-NEXT:    popq %r13
3824; AVX2-NEXT:    popq %r14
3825; AVX2-NEXT:    popq %r15
3826; AVX2-NEXT:    popq %rbp
3827; AVX2-NEXT:    retq
3828;
3829; AVX512-LABEL: store_cvt_8f64_to_8i16:
3830; AVX512:       # BB#0:
3831; AVX512-NEXT:    pushq %rbp
3832; AVX512-NEXT:  .Ltmp55:
3833; AVX512-NEXT:    .cfi_def_cfa_offset 16
3834; AVX512-NEXT:    pushq %r15
3835; AVX512-NEXT:  .Ltmp56:
3836; AVX512-NEXT:    .cfi_def_cfa_offset 24
3837; AVX512-NEXT:    pushq %r14
3838; AVX512-NEXT:  .Ltmp57:
3839; AVX512-NEXT:    .cfi_def_cfa_offset 32
3840; AVX512-NEXT:    pushq %r13
3841; AVX512-NEXT:  .Ltmp58:
3842; AVX512-NEXT:    .cfi_def_cfa_offset 40
3843; AVX512-NEXT:    pushq %r12
3844; AVX512-NEXT:  .Ltmp59:
3845; AVX512-NEXT:    .cfi_def_cfa_offset 48
3846; AVX512-NEXT:    pushq %rbx
3847; AVX512-NEXT:  .Ltmp60:
3848; AVX512-NEXT:    .cfi_def_cfa_offset 56
3849; AVX512-NEXT:    subq $200, %rsp
3850; AVX512-NEXT:  .Ltmp61:
3851; AVX512-NEXT:    .cfi_def_cfa_offset 256
3852; AVX512-NEXT:  .Ltmp62:
3853; AVX512-NEXT:    .cfi_offset %rbx, -56
3854; AVX512-NEXT:  .Ltmp63:
3855; AVX512-NEXT:    .cfi_offset %r12, -48
3856; AVX512-NEXT:  .Ltmp64:
3857; AVX512-NEXT:    .cfi_offset %r13, -40
3858; AVX512-NEXT:  .Ltmp65:
3859; AVX512-NEXT:    .cfi_offset %r14, -32
3860; AVX512-NEXT:  .Ltmp66:
3861; AVX512-NEXT:    .cfi_offset %r15, -24
3862; AVX512-NEXT:  .Ltmp67:
3863; AVX512-NEXT:    .cfi_offset %rbp, -16
3864; AVX512-NEXT:    movq %rdi, %rbx
3865; AVX512-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill
3866; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3867; AVX512-NEXT:    callq __truncdfhf2
3868; AVX512-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
3869; AVX512-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
3870; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
3871; AVX512-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
3872; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3873; AVX512-NEXT:    callq __truncdfhf2
3874; AVX512-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
3875; AVX512-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
3876; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
3877; AVX512-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
3878; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3879; AVX512-NEXT:    callq __truncdfhf2
3880; AVX512-NEXT:    movl %eax, %r12d
3881; AVX512-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
3882; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
3883; AVX512-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
3884; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3885; AVX512-NEXT:    callq __truncdfhf2
3886; AVX512-NEXT:    movl %eax, %r13d
3887; AVX512-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
3888; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
3889; AVX512-NEXT:    callq __truncdfhf2
3890; AVX512-NEXT:    movl %eax, %ebp
3891; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
3892; AVX512-NEXT:    callq __truncdfhf2
3893; AVX512-NEXT:    movl %eax, %r14d
3894; AVX512-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
3895; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3896; AVX512-NEXT:    callq __truncdfhf2
3897; AVX512-NEXT:    movl %eax, %r15d
3898; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
3899; AVX512-NEXT:    callq __truncdfhf2
3900; AVX512-NEXT:    movw %ax, 12(%rbx)
3901; AVX512-NEXT:    movw %r15w, 8(%rbx)
3902; AVX512-NEXT:    movw %r14w, 4(%rbx)
3903; AVX512-NEXT:    movw %bp, (%rbx)
3904; AVX512-NEXT:    movw %r13w, 14(%rbx)
3905; AVX512-NEXT:    movw %r12w, 10(%rbx)
3906; AVX512-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
3907; AVX512-NEXT:    movw %ax, 6(%rbx)
3908; AVX512-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
3909; AVX512-NEXT:    movw %ax, 2(%rbx)
3910; AVX512-NEXT:    addq $200, %rsp
3911; AVX512-NEXT:    popq %rbx
3912; AVX512-NEXT:    popq %r12
3913; AVX512-NEXT:    popq %r13
3914; AVX512-NEXT:    popq %r14
3915; AVX512-NEXT:    popq %r15
3916; AVX512-NEXT:    popq %rbp
3917; AVX512-NEXT:    retq
3918  %1 = fptrunc <8 x double> %a0 to <8 x half>
3919  %2 = bitcast <8 x half> %1 to <8 x i16>
3920  store <8 x i16> %2, <8 x i16>* %a1
3921  ret void
3922}
3923