;; Codegen test for vzeroupper insertion on x86-64 Darwin.
;; The first run (+avx) is verified with the default check prefix.
;; The second (+fast-partial-ymm-write) and third (-mcpu=btver2) runs use their
;; own prefixes and expect no vzeroupper anywhere in the output.
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx,+fast-partial-ymm-write | FileCheck --check-prefix=FASTYMM %s
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck --check-prefix=BTVER2 %s

; FASTYMM-NOT: vzeroupper
; BTVER2-NOT: vzeroupper

;; External callees: one returning a scalar, one taking/returning 128-bit
;; vectors, one taking/returning 256-bit vectors.
declare i32 @foo()
declare <4 x float> @do_sse(<4 x float>)
declare <8 x float> @do_avx(<8 x float>)
declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
@x = common global <4 x float> zeroinitializer, align 16
@g = common global <8 x float> zeroinitializer, align 32

;; Basic checking - don't emit any vzeroupper instruction

;; Only 128-bit values are used here, so nothing between the function label
;; and the return may be a vzeroupper.
; CHECK-LABEL: _test00:
define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
entry:
  ; CHECK-NOT: vzeroupper
  %add.i = fadd <4 x float> %a, %b
  %call3 = call <4 x float> @do_sse(<4 x float> %add.i) nounwind
  ; CHECK: ret
  ret <4 x float> %call3
}

;; Check 256-bit parameter passing

;; %c is a 256-bit argument. A vzeroupper is expected immediately before the
;; first call to do_sse, and none between the first and the second call.
; CHECK-LABEL: _test01:
define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind uwtable ssp {
entry:
  %tmp = load <4 x float>, <4 x float>* @x, align 16
  ; CHECK: vzeroupper
  ; CHECK-NEXT: callq _do_sse
  %call = tail call <4 x float> @do_sse(<4 x float> %tmp) nounwind
  store <4 x float> %call, <4 x float>* @x, align 16
  ; CHECK-NOT: vzeroupper
  ; CHECK: callq _do_sse
  %call2 = tail call <4 x float> @do_sse(<4 x float> %call) nounwind
  store <4 x float> %call2, <4 x float>* @x, align 16
  ; CHECK: ret
  ret <8 x float> %c
}

;; Check that vzeroupper is emitted for tail calls.

; CHECK-LABEL: _test02:
define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind uwtable ssp {
entry:
  ;; 256-bit add, then extract the low 128 bits to feed the callee.
  %add.i = fadd <8 x float> %a, %b
  %add.low = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %add.i, i8 0)
  ;; The call below is expected to be lowered to a jmp, preceded by a
  ;; vzeroupper.
  ; CHECK: vzeroupper
  ; CHECK: jmp _do_sse
  %call3 = tail call <4 x float> @do_sse(<4 x float> %add.low) nounwind
  ret <4 x float> %call3
}

;; Test the pass convergence and also that vzeroupper is only issued when necessary,
;; for this function it should be only once

; CHECK-LABEL: _test03:
define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
entry:
  %add.i = fadd <4 x float> %a, %b
  br label %while.cond

while.cond:
  ;; Spin until foo() returns zero.
  %call = tail call i32 @foo()
  %tobool = icmp eq i32 %call, 0
  br i1 %tobool, label %for.body, label %while.cond

for.body:
  ;; Loop body runs four times (exits when %1 reaches 4). The first two calls
  ;; must be back to back with no vzeroupper before them; only the third call,
  ;; which follows the 256-bit load from @g, is preceded by one.
  ; CHECK: LBB
  ; CHECK-NOT: vzeroupper
  %i.018 = phi i32 [ 0, %while.cond ], [ %1, %for.body ]
  %c.017 = phi <4 x float> [ %add.i, %while.cond ], [ %call14, %for.body ]
  ; CHECK: callq _do_sse
  %call5 = tail call <4 x float> @do_sse(<4 x float> %c.017) nounwind
  ; CHECK-NEXT: callq _do_sse
  %call7 = tail call <4 x float> @do_sse(<4 x float> %call5) nounwind
  %tmp11 = load <8 x float>, <8 x float>* @g, align 32
  %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %tmp11, i8 1) nounwind
  ; CHECK: vzeroupper
  ; CHECK-NEXT: callq _do_sse
  %call14 = tail call <4 x float> @do_sse(<4 x float> %0) nounwind
  %1 = add nsw i32 %i.018, 1
  %exitcond = icmp eq i32 %1, 4
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret <4 x float> %call14
}

;; Check that we also perform vzeroupper when we return from a function.

; CHECK-LABEL: _test04:
define <4 x float> @test04(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
entry:
  ;; Concatenate the two 128-bit arguments into one 256-bit vector.
  %shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ;; The callee takes a 256-bit argument: no vzeroupper is expected before
  ;; the call.
  ; CHECK-NOT: vzeroupper
  ; CHECK: call
  %call = call <8 x float> @do_avx(<8 x float> %shuf) nounwind
  ;; Only the low four lanes of the 256-bit result are returned; a vzeroupper
  ;; is expected before the return.
  %shuf2 = shufflevector <8 x float> %call, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; CHECK: vzeroupper
  ; CHECK: ret
  ret <4 x float> %shuf2
}