; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX512
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=FAST-ymm-zmm
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=BTVER2
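;
; The VZ prefixes cover configurations where the vzeroupper insertion pass is
; active; the NO-VZ prefixes cover configurations (+fast-partial-ymm-or-zmm-write,
; and btver2, i.e. AMD Jaguar) that are assumed not to pay the AVX-SSE
; transition penalty, so no vzeroupper instructions are expected there.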

declare i32 @foo()
declare <4 x float> @do_sse(<4 x float>)
declare <8 x float> @do_avx(<8 x float>)
declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
@x = common global <4 x float> zeroinitializer, align 16
@g = common global <8 x float> zeroinitializer, align 32

;; Basic check - don't emit any vzeroupper instruction, since only xmm registers are used.

define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind {
; ALL-LABEL: test00:
; ALL:       # %bb.0:
; ALL-NEXT:    pushq %rax
; ALL-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; ALL-NEXT:    callq do_sse
; ALL-NEXT:    popq %rax
; ALL-NEXT:    retq
  %add.i = fadd <4 x float> %a, %b
  %call3 = call <4 x float> @do_sse(<4 x float> %add.i) nounwind
  ret <4 x float> %call3
}

;; Check 256-bit parameter passing.
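;; The 256-bit %c argument arrives in ymm2 and must survive the do_sse calls,
;; so it is spilled and reloaded around them; in the VZ configurations a single
;; vzeroupper is expected once %c has been spilled, before the first call.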

define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind {
; VZ-LABEL: test01:
; VZ:       # %bb.0:
; VZ-NEXT:    subq $56, %rsp
; VZ-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
; VZ-NEXT:    vmovaps {{.*}}(%rip), %xmm0
; VZ-NEXT:    vzeroupper
; VZ-NEXT:    callq do_sse
; VZ-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
; VZ-NEXT:    callq do_sse
; VZ-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
; VZ-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; VZ-NEXT:    addq $56, %rsp
; VZ-NEXT:    retq
;
; FAST-ymm-zmm-LABEL: test01:
; FAST-ymm-zmm:       # %bb.0:
; FAST-ymm-zmm-NEXT:    subq $56, %rsp
; FAST-ymm-zmm-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
; FAST-ymm-zmm-NEXT:    vmovaps {{.*}}(%rip), %xmm0
; FAST-ymm-zmm-NEXT:    callq do_sse
; FAST-ymm-zmm-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
; FAST-ymm-zmm-NEXT:    callq do_sse
; FAST-ymm-zmm-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
; FAST-ymm-zmm-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; FAST-ymm-zmm-NEXT:    addq $56, %rsp
; FAST-ymm-zmm-NEXT:    retq
;
; BTVER2-LABEL: test01:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    subq $56, %rsp
; BTVER2-NEXT:    vmovaps {{.*}}(%rip), %xmm0
; BTVER2-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
; BTVER2-NEXT:    callq do_sse
; BTVER2-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
; BTVER2-NEXT:    callq do_sse
; BTVER2-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
; BTVER2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; BTVER2-NEXT:    addq $56, %rsp
; BTVER2-NEXT:    retq
  %tmp = load <4 x float>, <4 x float>* @x, align 16
  %call = tail call <4 x float> @do_sse(<4 x float> %tmp) nounwind
  store <4 x float> %call, <4 x float>* @x, align 16
  %call2 = tail call <4 x float> @do_sse(<4 x float> %call) nounwind
  store <4 x float> %call2, <4 x float>* @x, align 16
  ret <8 x float> %c
}

;; Check that vzeroupper is emitted for tail calls.
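;; A tail call never returns to this function, so the upper ymm state has to
;; be cleaned up before control transfers: in the VZ configurations the
;; vzeroupper is expected immediately before the tail-call jmp.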

define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind {
; VZ-LABEL: test02:
; VZ:       # %bb.0:
; VZ-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; VZ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; VZ-NEXT:    vzeroupper
; VZ-NEXT:    jmp do_sse # TAILCALL
;
; NO-VZ-LABEL: test02:
; NO-VZ:       # %bb.0:
; NO-VZ-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; NO-VZ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; NO-VZ-NEXT:    jmp do_sse # TAILCALL
  %add.i = fadd <8 x float> %a, %b
  %add.low = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %add.i, i8 0)
  %call3 = tail call <4 x float> @do_sse(<4 x float> %add.low) nounwind
  ret <4 x float> %call3
}

;; Test pass convergence, and that vzeroupper is only issued when necessary;
;; for this function it should be issued only once.
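;; Inside the loop, only the ymm load from @g dirties the upper state, so in
;; the VZ configurations a single vzeroupper is expected between the
;; vextractf128 and the do_sse call that follows it, and nowhere else.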

define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind {
; VZ-LABEL: test03:
; VZ:       # %bb.0: # %entry
; VZ-NEXT:    pushq %rbx
; VZ-NEXT:    subq $16, %rsp
; VZ-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; VZ-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; VZ-NEXT:    .p2align 4, 0x90
; VZ-NEXT:  .LBB3_1: # %while.cond
; VZ-NEXT:    # =>This Inner Loop Header: Depth=1
; VZ-NEXT:    callq foo
; VZ-NEXT:    testl %eax, %eax
; VZ-NEXT:    jne .LBB3_1
; VZ-NEXT:  # %bb.2: # %for.body.preheader
; VZ-NEXT:    movl $4, %ebx
; VZ-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; VZ-NEXT:    .p2align 4, 0x90
; VZ-NEXT:  .LBB3_3: # %for.body
; VZ-NEXT:    # =>This Inner Loop Header: Depth=1
; VZ-NEXT:    callq do_sse
; VZ-NEXT:    callq do_sse
; VZ-NEXT:    vmovaps {{.*}}(%rip), %ymm0
; VZ-NEXT:    vextractf128 $1, %ymm0, %xmm0
; VZ-NEXT:    vzeroupper
; VZ-NEXT:    callq do_sse
; VZ-NEXT:    decl %ebx
; VZ-NEXT:    jne .LBB3_3
; VZ-NEXT:  # %bb.4: # %for.end
; VZ-NEXT:    addq $16, %rsp
; VZ-NEXT:    popq %rbx
; VZ-NEXT:    retq
;
; FAST-ymm-zmm-LABEL: test03:
; FAST-ymm-zmm:       # %bb.0: # %entry
; FAST-ymm-zmm-NEXT:    pushq %rbx
; FAST-ymm-zmm-NEXT:    subq $16, %rsp
; FAST-ymm-zmm-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; FAST-ymm-zmm-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; FAST-ymm-zmm-NEXT:    .p2align 4, 0x90
; FAST-ymm-zmm-NEXT:  .LBB3_1: # %while.cond
; FAST-ymm-zmm-NEXT:    # =>This Inner Loop Header: Depth=1
; FAST-ymm-zmm-NEXT:    callq foo
; FAST-ymm-zmm-NEXT:    testl %eax, %eax
; FAST-ymm-zmm-NEXT:    jne .LBB3_1
; FAST-ymm-zmm-NEXT:  # %bb.2: # %for.body.preheader
; FAST-ymm-zmm-NEXT:    movl $4, %ebx
; FAST-ymm-zmm-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; FAST-ymm-zmm-NEXT:    .p2align 4, 0x90
; FAST-ymm-zmm-NEXT:  .LBB3_3: # %for.body
; FAST-ymm-zmm-NEXT:    # =>This Inner Loop Header: Depth=1
; FAST-ymm-zmm-NEXT:    callq do_sse
; FAST-ymm-zmm-NEXT:    callq do_sse
; FAST-ymm-zmm-NEXT:    vmovaps {{.*}}(%rip), %ymm0
; FAST-ymm-zmm-NEXT:    vextractf128 $1, %ymm0, %xmm0
; FAST-ymm-zmm-NEXT:    callq do_sse
; FAST-ymm-zmm-NEXT:    decl %ebx
; FAST-ymm-zmm-NEXT:    jne .LBB3_3
; FAST-ymm-zmm-NEXT:  # %bb.4: # %for.end
; FAST-ymm-zmm-NEXT:    addq $16, %rsp
; FAST-ymm-zmm-NEXT:    popq %rbx
; FAST-ymm-zmm-NEXT:    retq
;
; BTVER2-LABEL: test03:
; BTVER2:       # %bb.0: # %entry
; BTVER2-NEXT:    pushq %rbx
; BTVER2-NEXT:    subq $16, %rsp
; BTVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; BTVER2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; BTVER2-NEXT:    .p2align 4, 0x90
; BTVER2-NEXT:  .LBB3_1: # %while.cond
; BTVER2-NEXT:    # =>This Inner Loop Header: Depth=1
; BTVER2-NEXT:    callq foo
; BTVER2-NEXT:    testl %eax, %eax
; BTVER2-NEXT:    jne .LBB3_1
; BTVER2-NEXT:  # %bb.2: # %for.body.preheader
; BTVER2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; BTVER2-NEXT:    movl $4, %ebx
; BTVER2-NEXT:    .p2align 4, 0x90
; BTVER2-NEXT:  .LBB3_3: # %for.body
; BTVER2-NEXT:    # =>This Inner Loop Header: Depth=1
; BTVER2-NEXT:    callq do_sse
; BTVER2-NEXT:    callq do_sse
; BTVER2-NEXT:    vmovaps {{.*}}(%rip), %ymm0
; BTVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; BTVER2-NEXT:    callq do_sse
; BTVER2-NEXT:    decl %ebx
; BTVER2-NEXT:    jne .LBB3_3
; BTVER2-NEXT:  # %bb.4: # %for.end
; BTVER2-NEXT:    addq $16, %rsp
; BTVER2-NEXT:    popq %rbx
; BTVER2-NEXT:    retq
entry:
  %add.i = fadd <4 x float> %a, %b
  br label %while.cond

while.cond:
  %call = tail call i32 @foo()
  %tobool = icmp eq i32 %call, 0
  br i1 %tobool, label %for.body, label %while.cond

for.body:
  %i.018 = phi i32 [ 0, %while.cond ], [ %1, %for.body ]
  %c.017 = phi <4 x float> [ %add.i, %while.cond ], [ %call14, %for.body ]
  %call5 = tail call <4 x float> @do_sse(<4 x float> %c.017) nounwind
  %call7 = tail call <4 x float> @do_sse(<4 x float> %call5) nounwind
  %tmp11 = load <8 x float>, <8 x float>* @g, align 32
  %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %tmp11, i8 1) nounwind
  %call14 = tail call <4 x float> @do_sse(<4 x float> %0) nounwind
  %1 = add nsw i32 %i.018, 1
  %exitcond = icmp eq i32 %1, 4
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret <4 x float> %call14
}

;; Check that we also emit vzeroupper when returning from a function.
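;; The do_avx call leaves the upper half of ymm0 dirty while only xmm0 is
;; returned, so in the VZ configurations a vzeroupper is expected right before
;; the retq.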

define <4 x float> @test04(<4 x float> %a, <4 x float> %b) nounwind {
; VZ-LABEL: test04:
; VZ:       # %bb.0:
; VZ-NEXT:    pushq %rax
; VZ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; VZ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; VZ-NEXT:    callq do_avx
; VZ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; VZ-NEXT:    popq %rax
; VZ-NEXT:    vzeroupper
; VZ-NEXT:    retq
;
; NO-VZ-LABEL: test04:
; NO-VZ:       # %bb.0:
; NO-VZ-NEXT:    pushq %rax
; NO-VZ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; NO-VZ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; NO-VZ-NEXT:    callq do_avx
; NO-VZ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; NO-VZ-NEXT:    popq %rax
; NO-VZ-NEXT:    retq
  %shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %call = call <8 x float> @do_avx(<8 x float> %shuf) nounwind
  %shuf2 = shufflevector <8 x float> %call, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuf2
}