; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=knl | FileCheck %s -check-prefix=X32
; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=skx | FileCheck %s -check-prefix=X32
; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=knl | FileCheck %s -check-prefix=WIN32
; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=skx | FileCheck %s -check-prefix=WIN32
; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=knl | FileCheck %s -check-prefixes=WIN64,WIN64-KNL
; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=skx | FileCheck %s -check-prefixes=WIN64,WIN64-SKX
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -check-prefixes=X64,X64-KNL
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s -check-prefixes=X64,X64-SKX
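; The 32-bit runs share the X32 and WIN32 prefixes because the KNL and SKX
; outputs match there; the 64-bit runs get per-CPU prefixes since the two
; CPUs spill mask registers differently (see test_prolog_epilog below).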

declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
declare <16 x float> @func_float16(<16 x float>, <16 x float>)
declare i32 @func_int(i32, i32)
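; Note: the intel_ocl_bicc convention under test is written on call sites
; and function definitions below rather than on these declarations.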

; Test calling conventions - input parameters.
define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
; X32-LABEL: testf16_inp:
; X32:       ## %bb.0:
; X32-NEXT:    pushl %ebp
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    andl $-64, %esp
; X32-NEXT:    subl $192, %esp
; X32-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl %eax, (%esp)
; X32-NEXT:    calll _func_float16_ptr
; X32-NEXT:    vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0
; X32-NEXT:    movl %ebp, %esp
; X32-NEXT:    popl %ebp
; X32-NEXT:    retl
;
; WIN32-LABEL: testf16_inp:
; WIN32:       # %bb.0:
; WIN32-NEXT:    pushl %ebp
; WIN32-NEXT:    movl %esp, %ebp
; WIN32-NEXT:    andl $-64, %esp
; WIN32-NEXT:    subl $128, %esp
; WIN32-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; WIN32-NEXT:    movl %esp, %eax
; WIN32-NEXT:    pushl %eax
; WIN32-NEXT:    calll _func_float16_ptr
; WIN32-NEXT:    addl $4, %esp
; WIN32-NEXT:    vaddps (%esp), %zmm0, %zmm0
; WIN32-NEXT:    movl %ebp, %esp
; WIN32-NEXT:    popl %ebp
; WIN32-NEXT:    retl
;
; WIN64-LABEL: testf16_inp:
; WIN64:       # %bb.0:
; WIN64-NEXT:    pushq %rbp
; WIN64-NEXT:    subq $176, %rsp
; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rbp
; WIN64-NEXT:    andq $-64, %rsp
; WIN64-NEXT:    vmovaps (%rcx), %zmm0
; WIN64-NEXT:    vaddps (%rdx), %zmm0, %zmm0
; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
; WIN64-NEXT:    callq func_float16_ptr
; WIN64-NEXT:    vaddps {{[0-9]+}}(%rsp), %zmm0, %zmm0
; WIN64-NEXT:    leaq 48(%rbp), %rsp
; WIN64-NEXT:    popq %rbp
; WIN64-NEXT:    retq
;
; X64-LABEL: testf16_inp:
; X64:       ## %bb.0:
; X64-NEXT:    pushq %rbp
; X64-NEXT:    movq %rsp, %rbp
; X64-NEXT:    pushq %r13
; X64-NEXT:    pushq %r12
; X64-NEXT:    andq $-64, %rsp
; X64-NEXT:    subq $128, %rsp
; X64-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; X64-NEXT:    movq %rsp, %rdi
; X64-NEXT:    callq _func_float16_ptr
; X64-NEXT:    vaddps (%rsp), %zmm0, %zmm0
; X64-NEXT:    leaq -16(%rbp), %rsp
; X64-NEXT:    popq %r12
; X64-NEXT:    popq %r13
; X64-NEXT:    popq %rbp
; X64-NEXT:    retq
  %y = alloca <16 x float>, align 64
  %x = fadd <16 x float> %a, %b
  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
  %2 = load <16 x float>, <16 x float>* %y, align 16
  %3 = fadd <16 x float> %2, %1
  ret <16 x float> %3
}
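
; The checks above show a default-CC caller handing arguments to an
; intel_ocl_bicc callee: the <16 x float> sum stays in zmm0, while the
; pointer %y goes in rdi (SysV), rcx (Win64), or on the stack (32-bit).
; The andl/andq $-64 realigns the stack for the 64-byte-aligned alloca.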

; Test calling conventions - preserved registers.

define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
; X32-LABEL: testf16_regs:
; X32:       ## %bb.0:
; X32-NEXT:    pushl %ebp
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    andl $-64, %esp
; X32-NEXT:    subl $256, %esp ## imm = 0x100
; X32-NEXT:    vmovaps %zmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill
; X32-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl %eax, (%esp)
; X32-NEXT:    calll _func_float16_ptr
; X32-NEXT:    vaddps {{[-0-9]+}}(%e{{[sb]}}p), %zmm0, %zmm0 ## 64-byte Folded Reload
; X32-NEXT:    vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0
; X32-NEXT:    movl %ebp, %esp
; X32-NEXT:    popl %ebp
; X32-NEXT:    retl
;
; WIN32-LABEL: testf16_regs:
; WIN32:       # %bb.0:
; WIN32-NEXT:    pushl %ebp
; WIN32-NEXT:    movl %esp, %ebp
; WIN32-NEXT:    andl $-64, %esp
; WIN32-NEXT:    subl $192, %esp
; WIN32-NEXT:    vmovaps %zmm1, (%esp) # 64-byte Spill
; WIN32-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; WIN32-NEXT:    leal {{[0-9]+}}(%esp), %eax
; WIN32-NEXT:    pushl %eax
; WIN32-NEXT:    calll _func_float16_ptr
; WIN32-NEXT:    addl $4, %esp
; WIN32-NEXT:    vaddps (%esp), %zmm0, %zmm0 # 64-byte Folded Reload
; WIN32-NEXT:    vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0
; WIN32-NEXT:    movl %ebp, %esp
; WIN32-NEXT:    popl %ebp
; WIN32-NEXT:    retl
;
; WIN64-LABEL: testf16_regs:
; WIN64:       # %bb.0:
; WIN64-NEXT:    pushq %rbp
; WIN64-NEXT:    subq $176, %rsp
; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rbp
; WIN64-NEXT:    andq $-64, %rsp
; WIN64-NEXT:    vmovaps (%rdx), %zmm16
; WIN64-NEXT:    vaddps (%rcx), %zmm16, %zmm0
; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
; WIN64-NEXT:    callq func_float16_ptr
; WIN64-NEXT:    vaddps %zmm16, %zmm0, %zmm0
; WIN64-NEXT:    vaddps {{[0-9]+}}(%rsp), %zmm0, %zmm0
; WIN64-NEXT:    leaq 48(%rbp), %rsp
; WIN64-NEXT:    popq %rbp
; WIN64-NEXT:    retq
;
; X64-LABEL: testf16_regs:
; X64:       ## %bb.0:
; X64-NEXT:    pushq %rbp
; X64-NEXT:    movq %rsp, %rbp
; X64-NEXT:    pushq %r13
; X64-NEXT:    pushq %r12
; X64-NEXT:    andq $-64, %rsp
; X64-NEXT:    subq $128, %rsp
; X64-NEXT:    vmovaps %zmm1, %zmm16
; X64-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; X64-NEXT:    movq %rsp, %rdi
; X64-NEXT:    callq _func_float16_ptr
; X64-NEXT:    vaddps %zmm16, %zmm0, %zmm0
; X64-NEXT:    vaddps (%rsp), %zmm0, %zmm0
; X64-NEXT:    leaq -16(%rbp), %rsp
; X64-NEXT:    popq %r12
; X64-NEXT:    popq %r13
; X64-NEXT:    popq %rbp
; X64-NEXT:    retq
  %y = alloca <16 x float>, align 64
  %x = fadd <16 x float> %a, %b
  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
  %2 = load <16 x float>, <16 x float>* %y, align 16
  %3 = fadd <16 x float> %1, %b
  %4 = fadd <16 x float> %2, %3
  ret <16 x float> %4
}
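
; %b must survive the call. On the 64-bit targets the caller parks it in
; zmm16, which intel_ocl_bicc preserves (see test_prolog_epilog below), so
; no memory spill is needed; the 32-bit targets have no zmm16-zmm31 and
; spill %b to the stack instead.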

; Test calling conventions - prolog and epilog.
define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
; X32-LABEL: test_prolog_epilog:
; X32:       ## %bb.0:
; X32-NEXT:    subl $12, %esp
; X32-NEXT:    calll _func_float16
; X32-NEXT:    addl $12, %esp
; X32-NEXT:    retl
;
; WIN32-LABEL: test_prolog_epilog:
; WIN32:       # %bb.0:
; WIN32-NEXT:    calll _func_float16
; WIN32-NEXT:    retl
;
; WIN64-KNL-LABEL: test_prolog_epilog:
; WIN64-KNL:       # %bb.0:
; WIN64-KNL-NEXT:    pushq %rbp
; WIN64-KNL-NEXT:    subq $1264, %rsp # imm = 0x4F0
; WIN64-KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rbp
; WIN64-KNL-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; WIN64-KNL-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; WIN64-KNL-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; WIN64-KNL-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    vmovaps %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-KNL-NEXT:    andq $-64, %rsp
; WIN64-KNL-NEXT:    vmovaps %zmm1, {{[0-9]+}}(%rsp)
; WIN64-KNL-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; WIN64-KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
; WIN64-KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
; WIN64-KNL-NEXT:    callq func_float16
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
; WIN64-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; WIN64-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; WIN64-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; WIN64-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; WIN64-KNL-NEXT:    leaq 1136(%rbp), %rsp
; WIN64-KNL-NEXT:    popq %rbp
; WIN64-KNL-NEXT:    retq
;
; WIN64-SKX-LABEL: test_prolog_epilog:
; WIN64-SKX:       # %bb.0:
; WIN64-SKX-NEXT:    pushq %rbp
; WIN64-SKX-NEXT:    subq $1264, %rsp # imm = 0x4F0
; WIN64-SKX-NEXT:    leaq {{[0-9]+}}(%rsp), %rbp
; WIN64-SKX-NEXT:    kmovq %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; WIN64-SKX-NEXT:    kmovq %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; WIN64-SKX-NEXT:    kmovq %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; WIN64-SKX-NEXT:    kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    vmovaps %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; WIN64-SKX-NEXT:    andq $-64, %rsp
; WIN64-SKX-NEXT:    vmovaps %zmm1, {{[0-9]+}}(%rsp)
; WIN64-SKX-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; WIN64-SKX-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
; WIN64-SKX-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
; WIN64-SKX-NEXT:    callq func_float16
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
; WIN64-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload
; WIN64-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
; WIN64-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 8-byte Reload
; WIN64-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; WIN64-SKX-NEXT:    leaq 1136(%rbp), %rsp
; WIN64-SKX-NEXT:    popq %rbp
; WIN64-SKX-NEXT:    retq
;
; X64-KNL-LABEL: test_prolog_epilog:
; X64-KNL:       ## %bb.0:
; X64-KNL-NEXT:    pushq %rsi
; X64-KNL-NEXT:    subq $1072, %rsp ## imm = 0x430
; X64-KNL-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
; X64-KNL-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
; X64-KNL-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
; X64-KNL-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
; X64-KNL-NEXT:    vmovups %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-KNL-NEXT:    vmovups %zmm16, (%rsp) ## 64-byte Spill
; X64-KNL-NEXT:    callq _func_float16
; X64-KNL-NEXT:    vmovups (%rsp), %zmm16 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 ## 64-byte Reload
; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 ## 64-byte Reload
; X64-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload
; X64-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
; X64-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
; X64-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
; X64-KNL-NEXT:    addq $1072, %rsp ## imm = 0x430
; X64-KNL-NEXT:    popq %rsi
; X64-KNL-NEXT:    retq
;
; X64-SKX-LABEL: test_prolog_epilog:
; X64-SKX:       ## %bb.0:
; X64-SKX-NEXT:    pushq %rsi
; X64-SKX-NEXT:    subq $1072, %rsp ## imm = 0x430
; X64-SKX-NEXT:    kmovq %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-SKX-NEXT:    kmovq %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-SKX-NEXT:    kmovq %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-SKX-NEXT:    kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-SKX-NEXT:    vmovups %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
; X64-SKX-NEXT:    vmovups %zmm16, (%rsp) ## 64-byte Spill
; X64-SKX-NEXT:    callq _func_float16
; X64-SKX-NEXT:    vmovups (%rsp), %zmm16 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 ## 64-byte Reload
; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 ## 64-byte Reload
; X64-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 8-byte Reload
; X64-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 8-byte Reload
; X64-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 8-byte Reload
; X64-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 8-byte Reload
; X64-SKX-NEXT:    addq $1072, %rsp ## imm = 0x430
; X64-SKX-NEXT:    popq %rsi
; X64-SKX-NEXT:    retq
  %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
  ret <16 x float> %c
}
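
; An intel_ocl_bicc function preserves k4-k7 and a block of vector
; registers (zmm16-zmm31 plus rsi on the non-Windows targets, zmm6-zmm21
; on Win64), hence the long spill/reload sequences above. KNL only has
; 16-bit mask moves (kmovw), while SKX saves full 64-bit masks with kmovq,
; which is why the 64-bit runs need separate KNL/SKX prefixes.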


declare <16 x float> @func_float16_mask(<16 x float>, <16 x i1>)

define <16 x float> @testf16_inp_mask(<16 x float> %a, i16 %mask) {
; X32-LABEL: testf16_inp_mask:
; X32:       ## %bb.0:
; X32-NEXT:    subl $12, %esp
; X32-NEXT:    .cfi_def_cfa_offset 16
; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT:    calll _func_float16_mask
; X32-NEXT:    addl $12, %esp
; X32-NEXT:    retl
;
; WIN32-LABEL: testf16_inp_mask:
; WIN32:       # %bb.0:
; WIN32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; WIN32-NEXT:    calll _func_float16_mask
; WIN32-NEXT:    retl
;
; WIN64-KNL-LABEL: testf16_inp_mask:
; WIN64-KNL:       # %bb.0:
; WIN64-KNL-NEXT:    subq $40, %rsp
; WIN64-KNL-NEXT:    .seh_stackalloc 40
; WIN64-KNL-NEXT:    .seh_endprologue
; WIN64-KNL-NEXT:    # kill: def $dx killed $dx def $edx
; WIN64-KNL-NEXT:    vmovaps (%rcx), %zmm0
; WIN64-KNL-NEXT:    kmovw %edx, %k1
; WIN64-KNL-NEXT:    callq func_float16_mask
; WIN64-KNL-NEXT:    nop
; WIN64-KNL-NEXT:    addq $40, %rsp
; WIN64-KNL-NEXT:    retq
; WIN64-KNL-NEXT:    .seh_endproc
;
; WIN64-SKX-LABEL: testf16_inp_mask:
; WIN64-SKX:       # %bb.0:
; WIN64-SKX-NEXT:    subq $40, %rsp
; WIN64-SKX-NEXT:    .seh_stackalloc 40
; WIN64-SKX-NEXT:    .seh_endprologue
; WIN64-SKX-NEXT:    # kill: def $dx killed $dx def $edx
; WIN64-SKX-NEXT:    vmovaps (%rcx), %zmm0
; WIN64-SKX-NEXT:    kmovd %edx, %k1
; WIN64-SKX-NEXT:    callq func_float16_mask
; WIN64-SKX-NEXT:    nop
; WIN64-SKX-NEXT:    addq $40, %rsp
; WIN64-SKX-NEXT:    retq
; WIN64-SKX-NEXT:    .seh_endproc
;
; X64-KNL-LABEL: testf16_inp_mask:
; X64-KNL:       ## %bb.0:
; X64-KNL-NEXT:    pushq %rbp
; X64-KNL-NEXT:    .cfi_def_cfa_offset 16
; X64-KNL-NEXT:    pushq %r13
; X64-KNL-NEXT:    .cfi_def_cfa_offset 24
; X64-KNL-NEXT:    pushq %r12
; X64-KNL-NEXT:    .cfi_def_cfa_offset 32
; X64-KNL-NEXT:    .cfi_offset %r12, -32
; X64-KNL-NEXT:    .cfi_offset %r13, -24
; X64-KNL-NEXT:    .cfi_offset %rbp, -16
; X64-KNL-NEXT:    kmovw %edi, %k1
; X64-KNL-NEXT:    callq _func_float16_mask
; X64-KNL-NEXT:    popq %r12
; X64-KNL-NEXT:    popq %r13
; X64-KNL-NEXT:    popq %rbp
; X64-KNL-NEXT:    retq
;
; X64-SKX-LABEL: testf16_inp_mask:
; X64-SKX:       ## %bb.0:
; X64-SKX-NEXT:    pushq %rbp
; X64-SKX-NEXT:    .cfi_def_cfa_offset 16
; X64-SKX-NEXT:    pushq %r13
; X64-SKX-NEXT:    .cfi_def_cfa_offset 24
; X64-SKX-NEXT:    pushq %r12
; X64-SKX-NEXT:    .cfi_def_cfa_offset 32
; X64-SKX-NEXT:    .cfi_offset %r12, -32
; X64-SKX-NEXT:    .cfi_offset %r13, -24
; X64-SKX-NEXT:    .cfi_offset %rbp, -16
; X64-SKX-NEXT:    kmovd %edi, %k1
; X64-SKX-NEXT:    callq _func_float16_mask
; X64-SKX-NEXT:    popq %r12
; X64-SKX-NEXT:    popq %r13
; X64-SKX-NEXT:    popq %rbp
; X64-SKX-NEXT:    retq
  %imask = bitcast i16 %mask to <16 x i1>
  %1 = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1> %imask)
  ret <16 x float> %1
}
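
; The i16 %mask is bitcast to <16 x i1> and passed in k1: loaded straight
; from the stack with kmovw on the 32-bit targets, and moved from edx/edi
; with kmovw (KNL) or kmovd (SKX) on the 64-bit targets.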

define intel_ocl_bicc <16 x float> @test_prolog_epilog_with_mask(<16 x float> %a, <16 x i32> %x1, <16 x i32> %x2, <16 x i1> %mask) nounwind {
; X32-LABEL: test_prolog_epilog_with_mask:
; X32:       ## %bb.0:
; X32-NEXT:    subl $12, %esp
; X32-NEXT:    vpcmpeqd %zmm2, %zmm1, %k0
; X32-NEXT:    kxorw %k1, %k0, %k1
; X32-NEXT:    calll _func_float16_mask
; X32-NEXT:    addl $12, %esp
; X32-NEXT:    retl
;
; WIN32-LABEL: test_prolog_epilog_with_mask:
; WIN32:       # %bb.0:
; WIN32-NEXT:    vpcmpeqd %zmm2, %zmm1, %k0
; WIN32-NEXT:    kxorw %k1, %k0, %k1
; WIN32-NEXT:    calll _func_float16_mask
; WIN32-NEXT:    retl
;
; WIN64-LABEL: test_prolog_epilog_with_mask:
; WIN64:       # %bb.0:
; WIN64-NEXT:    subq $40, %rsp
; WIN64-NEXT:    vpcmpeqd %zmm2, %zmm1, %k0
; WIN64-NEXT:    kxorw %k1, %k0, %k1
; WIN64-NEXT:    callq func_float16_mask
; WIN64-NEXT:    addq $40, %rsp
; WIN64-NEXT:    retq
;
; X64-LABEL: test_prolog_epilog_with_mask:
; X64:       ## %bb.0:
; X64-NEXT:    pushq %rax
; X64-NEXT:    vpcmpeqd %zmm2, %zmm1, %k0
; X64-NEXT:    kxorw %k1, %k0, %k1
; X64-NEXT:    callq _func_float16_mask
; X64-NEXT:    popq %rax
; X64-NEXT:    retq
  %cmp_res = icmp eq <16 x i32> %x1, %x2
  %mask1 = xor <16 x i1> %cmp_res, %mask
  %c = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1> %mask1)
  ret <16 x float> %c
}
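
; The mask never leaves the k registers: vpcmpeqd writes k0, the <16 x i1>
; xor with the incoming k1 argument lowers to kxorw, and the combined mask
; is passed to the callee in k1.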