; RUN: llc < %s -mtriple=x86_64-linux -mattr=+64bit,+sse3 -print-failed-fuse-candidates -regalloc=basic 2>&1 | FileCheck %s
; CHECK: fail
; CHECK-NOT: fail

; Purpose: exercise folding of a reloaded SSE value into the instruction that
; uses it. llc is run with -print-failed-fuse-candidates and its stderr is
; merged into stdout, so the two check directives above together require the
; word "fail" to appear exactly once in the combined output. That single
; expected occurrence is the function @z0 at the end of the file; every other
; function is expected to produce no such diagnostic.
;
; Each function calls an external helper and then uses its own argument %f
; after the call, so %f is live across the call.
; NOTE(review): presumably this forces %f to be spilled and reloaded around the
; call, which is what creates the fold candidate -- confirm against llc output.

; External helpers; each test function calls one of them before using %f again.
declare float @test_f(float %f)
declare double @test_d(double %f)
declare <4 x float> @test_vf(<4 x float> %f)
declare <2 x double> @test_vd(<2 x double> %f)

; Generic scalar square-root intrinsics.
declare float @llvm.sqrt.f32(float)
declare double @llvm.sqrt.f64(double)

; x86 SSE/SSE2/SSE3 packed intrinsics under test.
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>)
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>)
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>)
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8)
declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>)
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>)
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8)
declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>)

; Scalar cases: the helper's result is unused; sqrt is applied to %f after the
; call.
define float @foo(float %f) {
  %a = call float @test_f(float %f)
  %t = call float @llvm.sqrt.f32(float %f)
  ret float %t
}
define double @doo(double %f) {
  %a = call double @test_d(double %f)
  %t = call double @llvm.sqrt.f64(double %f)
  ret double %t
}

; Unary <4 x float> cases: the helper's result is unused; the single operand of
; the intrinsic is %f.
define <4 x float> @a0(<4 x float> %f) {
  %a = call <4 x float> @test_vf(<4 x float> %f)
  %t = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %f)
  ret <4 x float> %t
}
define <4 x float> @a1(<4 x float> %f) {
  %a = call <4 x float> @test_vf(<4 x float> %f)
  %t = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %f)
  ret <4 x float> %t
}
define <4 x float> @a2(<4 x float> %f) {
  %a = call <4 x float> @test_vf(<4 x float> %f)
  %t = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %f)
  ret <4 x float> %t
}

; Binary <4 x float> cases: the helper's result %y is the first operand and the
; value that is live across the call, %f, is the second operand.
define <4 x float> @b3(<4 x float> %f) {
  %y = call <4 x float> @test_vf(<4 x float> %f)
  %t = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %y, <4 x float> %f)
  ret <4 x float> %t
}
define <4 x float> @b4(<4 x float> %f) {
  %y = call <4 x float> @test_vf(<4 x float> %f)
  %t = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %y, <4 x float> %f)
  ret <4 x float> %t
}
define <4 x float> @b5(<4 x float> %f) {
  %y = call <4 x float> @test_vf(<4 x float> %f)
  %t = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %y, <4 x float> %f, i8 7)
  ret <4 x float> %t
}
define <4 x float> @b6(<4 x float> %f) {
  %y = call <4 x float> @test_vf(<4 x float> %f)
  %t = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %y, <4 x float> %f)
  ret <4 x float> %t
}
define <4 x float> @b7(<4 x float> %f) {
  %y = call <4 x float> @test_vf(<4 x float> %f)
  %t = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %y, <4 x float> %f)
  ret <4 x float> %t
}
define <4 x float> @b8(<4 x float> %f) {
  %y = call <4 x float> @test_vf(<4 x float> %f)
  %t = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %y, <4 x float> %f)
  ret <4 x float> %t
}

; Unary <2 x double> case: the helper's result is unused; the operand is %f.
define <2 x double> @c1(<2 x double> %f) {
  %a = call <2 x double> @test_vd(<2 x double> %f)
  %t = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %f)
  ret <2 x double> %t
}

; Binary <2 x double> cases: same operand order as the <4 x float> group above
; (%y first, %f second).
define <2 x double> @d3(<2 x double> %f) {
  %y = call <2 x double> @test_vd(<2 x double> %f)
  %t = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %y, <2 x double> %f)
  ret <2 x double> %t
}
define <2 x double> @d4(<2 x double> %f) {
  %y = call <2 x double> @test_vd(<2 x double> %f)
  %t = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %y, <2 x double> %f)
  ret <2 x double> %t
}
define <2 x double> @d5(<2 x double> %f) {
  %y = call <2 x double> @test_vd(<2 x double> %f)
  %t = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %y, <2 x double> %f, i8 7)
  ret <2 x double> %t
}
define <2 x double> @d6(<2 x double> %f) {
  %y = call <2 x double> @test_vd(<2 x double> %f)
  %t = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %y, <2 x double> %f)
  ret <2 x double> %t
}
define <2 x double> @d7(<2 x double> %f) {
  %y = call <2 x double> @test_vd(<2 x double> %f)
  %t = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %y, <2 x double> %f)
  ret <2 x double> %t
}
define <2 x double> @d8(<2 x double> %f) {
  %y = call <2 x double> @test_vd(<2 x double> %f)
  %t = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %y, <2 x double> %f)
  ret <2 x double> %t
}

; This one should fail to fuse: unlike @d8, the value that is live across the
; call (%f) is the FIRST operand of hsub.pd and the call result is the second.
; -regalloc=greedy does not even attempt the fold; instead it produces:
;   callq       test_vd
;   movapd      (%rsp), %xmm1           # 16-byte Reload
;   hsubpd      %xmm0, %xmm1
;   movapd      %xmm1, %xmm0
;   addq        $24, %rsp
;   ret
; RABasic still tries to fold this one, which is why the llc invocation at the
; top of this file selects -regalloc=basic.

define <2 x double> @z0(<2 x double> %f) {
  %y = call <2 x double> @test_vd(<2 x double> %f)
  %t = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %f, <2 x double> %y)
  ret <2 x double> %t
}