; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
;; Prefix groups (from the RUN lines below): VZ = subtargets where the
;; x86-use-vzeroupper pass inserts vzeroupper; NO-VZ = subtargets with
;; fast partial ymm/zmm writes (or btver2), where insertion is skipped.
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX512
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=FAST-ymm-zmm
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=BTVER2

declare i32 @foo()
declare <4 x float> @do_sse(<4 x float>)
declare <8 x float> @do_avx(<8 x float>)
declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
@x = common global <4 x float> zeroinitializer, align 16
@g = common global <8 x float> zeroinitializer, align 32

;; Basic checking - don't emit any vzeroupper instruction
;; (xmm-only code never dirties the upper ymm halves).

define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind {
; ALL-LABEL: test00:
; ALL:       # %bb.0:
; ALL-NEXT:    pushq %rax
; ALL-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; ALL-NEXT:    callq do_sse
; ALL-NEXT:    popq %rax
; ALL-NEXT:    retq
  %add.i = fadd <4 x float> %a, %b
  %call3 = call <4 x float> @do_sse(<4 x float> %add.i) nounwind
  ret <4 x float> %call3
}

;; Check parameter 256-bit parameter passing
;; (%c arrives in ymm2, is spilled across the SSE calls, and reloaded for the return).

define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind {
; VZ-LABEL: test01:
; VZ:       # %bb.0:
; VZ-NEXT:    subq $56, %rsp
; VZ-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
; VZ-NEXT:    vmovaps {{.*}}(%rip), %xmm0
; VZ-NEXT:    vzeroupper
; VZ-NEXT:    callq do_sse
; VZ-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
; VZ-NEXT:    callq do_sse
; VZ-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
; VZ-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; VZ-NEXT:    addq $56, %rsp
; VZ-NEXT:    retq
;
; FAST-ymm-zmm-LABEL: test01:
; FAST-ymm-zmm:       # %bb.0:
; FAST-ymm-zmm-NEXT:    subq $56, %rsp
; FAST-ymm-zmm-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
; FAST-ymm-zmm-NEXT:    vmovaps {{.*}}(%rip), %xmm0
; FAST-ymm-zmm-NEXT:    callq do_sse
; FAST-ymm-zmm-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
; FAST-ymm-zmm-NEXT:    callq do_sse
; FAST-ymm-zmm-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
; FAST-ymm-zmm-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; FAST-ymm-zmm-NEXT:    addq $56, %rsp
; FAST-ymm-zmm-NEXT:    retq
;
; BTVER2-LABEL: test01:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    subq $56, %rsp
; BTVER2-NEXT:    vmovaps {{.*}}(%rip), %xmm0
; BTVER2-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
; BTVER2-NEXT:    callq do_sse
; BTVER2-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
; BTVER2-NEXT:    callq do_sse
; BTVER2-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
; BTVER2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; BTVER2-NEXT:    addq $56, %rsp
; BTVER2-NEXT:    retq
  %tmp = load <4 x float>, <4 x float>* @x, align 16
  %call = tail call <4 x float> @do_sse(<4 x float> %tmp) nounwind
  store <4 x float> %call, <4 x float>* @x, align 16
  %call2 = tail call <4 x float> @do_sse(<4 x float> %call) nounwind
  store <4 x float> %call2, <4 x float>* @x, align 16
  ret <8 x float> %c
}

;; Check that vzeroupper is emitted for tail calls.
define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind {
; VZ-LABEL: test02:
; VZ:       # %bb.0:
; VZ-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; VZ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; VZ-NEXT:    vzeroupper
; VZ-NEXT:    jmp do_sse # TAILCALL
;
; NO-VZ-LABEL: test02:
; NO-VZ:       # %bb.0:
; NO-VZ-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; NO-VZ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; NO-VZ-NEXT:    jmp do_sse # TAILCALL
  %add.i = fadd <8 x float> %a, %b
  %add.low = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %add.i, i8 0)
  %call3 = tail call <4 x float> @do_sse(<4 x float> %add.low) nounwind
  ret <4 x float> %call3
}

;; Test the pass convergence and also that vzeroupper is only issued when necessary,
;; for this function it should be only once
;; (inside the for.body loop, after the ymm load/vextractf128, before the call).

define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind {
; VZ-LABEL: test03:
; VZ:       # %bb.0: # %entry
; VZ-NEXT:    pushq %rbx
; VZ-NEXT:    subq $16, %rsp
; VZ-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; VZ-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; VZ-NEXT:    .p2align 4, 0x90
; VZ-NEXT:  .LBB3_1: # %while.cond
; VZ-NEXT:    # =>This Inner Loop Header: Depth=1
; VZ-NEXT:    callq foo
; VZ-NEXT:    testl %eax, %eax
; VZ-NEXT:    jne .LBB3_1
; VZ-NEXT:  # %bb.2: # %for.body.preheader
; VZ-NEXT:    movl $4, %ebx
; VZ-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; VZ-NEXT:    .p2align 4, 0x90
; VZ-NEXT:  .LBB3_3: # %for.body
; VZ-NEXT:    # =>This Inner Loop Header: Depth=1
; VZ-NEXT:    callq do_sse
; VZ-NEXT:    callq do_sse
; VZ-NEXT:    vmovaps {{.*}}(%rip), %ymm0
; VZ-NEXT:    vextractf128 $1, %ymm0, %xmm0
; VZ-NEXT:    vzeroupper
; VZ-NEXT:    callq do_sse
; VZ-NEXT:    decl %ebx
; VZ-NEXT:    jne .LBB3_3
; VZ-NEXT:  # %bb.4: # %for.end
; VZ-NEXT:    addq $16, %rsp
; VZ-NEXT:    popq %rbx
; VZ-NEXT:    retq
;
; FAST-ymm-zmm-LABEL: test03:
; FAST-ymm-zmm:       # %bb.0: # %entry
; FAST-ymm-zmm-NEXT:    pushq %rbx
; FAST-ymm-zmm-NEXT:    subq $16, %rsp
; FAST-ymm-zmm-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; FAST-ymm-zmm-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; FAST-ymm-zmm-NEXT:    .p2align 4, 0x90
; FAST-ymm-zmm-NEXT:  .LBB3_1: # %while.cond
; FAST-ymm-zmm-NEXT:    # =>This Inner Loop Header: Depth=1
; FAST-ymm-zmm-NEXT:    callq foo
; FAST-ymm-zmm-NEXT:    testl %eax, %eax
; FAST-ymm-zmm-NEXT:    jne .LBB3_1
; FAST-ymm-zmm-NEXT:  # %bb.2: # %for.body.preheader
; FAST-ymm-zmm-NEXT:    movl $4, %ebx
; FAST-ymm-zmm-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; FAST-ymm-zmm-NEXT:    .p2align 4, 0x90
; FAST-ymm-zmm-NEXT:  .LBB3_3: # %for.body
; FAST-ymm-zmm-NEXT:    # =>This Inner Loop Header: Depth=1
; FAST-ymm-zmm-NEXT:    callq do_sse
; FAST-ymm-zmm-NEXT:    callq do_sse
; FAST-ymm-zmm-NEXT:    vmovaps {{.*}}(%rip), %ymm0
; FAST-ymm-zmm-NEXT:    vextractf128 $1, %ymm0, %xmm0
; FAST-ymm-zmm-NEXT:    callq do_sse
; FAST-ymm-zmm-NEXT:    decl %ebx
; FAST-ymm-zmm-NEXT:    jne .LBB3_3
; FAST-ymm-zmm-NEXT:  # %bb.4: # %for.end
; FAST-ymm-zmm-NEXT:    addq $16, %rsp
; FAST-ymm-zmm-NEXT:    popq %rbx
; FAST-ymm-zmm-NEXT:    retq
;
; BTVER2-LABEL: test03:
; BTVER2:       # %bb.0: # %entry
; BTVER2-NEXT:    pushq %rbx
; BTVER2-NEXT:    subq $16, %rsp
; BTVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; BTVER2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; BTVER2-NEXT:    .p2align 4, 0x90
; BTVER2-NEXT:  .LBB3_1: # %while.cond
; BTVER2-NEXT:    # =>This Inner Loop Header: Depth=1
; BTVER2-NEXT:    callq foo
; BTVER2-NEXT:    testl %eax, %eax
; BTVER2-NEXT:    jne .LBB3_1
; BTVER2-NEXT:  # %bb.2: # %for.body.preheader
; BTVER2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; BTVER2-NEXT:    movl $4, %ebx
; BTVER2-NEXT:    .p2align 4, 0x90
; BTVER2-NEXT:  .LBB3_3: # %for.body
; BTVER2-NEXT:    # =>This Inner Loop Header: Depth=1
; BTVER2-NEXT:    callq do_sse
; BTVER2-NEXT:    callq do_sse
; BTVER2-NEXT:    vmovaps {{.*}}(%rip), %ymm0
; BTVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; BTVER2-NEXT:    callq do_sse
; BTVER2-NEXT:    decl %ebx
; BTVER2-NEXT:    jne .LBB3_3
; BTVER2-NEXT:  # %bb.4: # %for.end
; BTVER2-NEXT:    addq $16, %rsp
; BTVER2-NEXT:    popq %rbx
; BTVER2-NEXT:    retq
entry:
  %add.i = fadd <4 x float> %a, %b
  br label %while.cond

while.cond:
  %call = tail call i32 @foo()
  %tobool = icmp eq i32 %call, 0
  br i1 %tobool, label %for.body, label %while.cond

for.body:
  %i.018 = phi i32 [ 0, %while.cond ], [ %1, %for.body ]
  %c.017 = phi <4 x float> [ %add.i, %while.cond ], [ %call14, %for.body ]
  %call5 = tail call <4 x float> @do_sse(<4 x float> %c.017) nounwind
  %call7 = tail call <4 x float> @do_sse(<4 x float> %call5) nounwind
  %tmp11 = load <8 x float>, <8 x float>* @g, align 32
  %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %tmp11, i8 1) nounwind
  %call14 = tail call <4 x float> @do_sse(<4 x float> %0) nounwind
  %1 = add nsw i32 %i.018, 1
  %exitcond = icmp eq i32 %1, 4
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret <4 x float> %call14
}

;; Check that we also perform vzeroupper when we return from a function.
;; Here the ymm-dirtying work happens inside @do_avx, so the vzeroupper
;; goes just before the final ret rather than before a call.

define <4 x float> @test04(<4 x float> %a, <4 x float> %b) nounwind {
; VZ-LABEL: test04:
; VZ:       # %bb.0:
; VZ-NEXT:    pushq %rax
; VZ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; VZ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; VZ-NEXT:    callq do_avx
; VZ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; VZ-NEXT:    popq %rax
; VZ-NEXT:    vzeroupper
; VZ-NEXT:    retq
;
; NO-VZ-LABEL: test04:
; NO-VZ:       # %bb.0:
; NO-VZ-NEXT:    pushq %rax
; NO-VZ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; NO-VZ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; NO-VZ-NEXT:    callq do_avx
; NO-VZ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; NO-VZ-NEXT:    popq %rax
; NO-VZ-NEXT:    retq
  %shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %call = call <8 x float> @do_avx(<8 x float> %shuf) nounwind
  %shuf2 = shufflevector <8 x float> %call, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuf2
}