; Runs llc with the basic register allocator and -print-failed-fuse-candidates,
; then expects one "fail" report with no further "fail" after it.
; stderr is merged into the pipe with the portable "2>&1 |" form instead of the
; bash-only "|&" operator.
1; RUN: llc < %s -mtriple=x86_64-linux -mattr=+64bit,+sse3 -print-failed-fuse-candidates -regalloc=basic 2>&1 | FileCheck %s
2; CHECK: fail
3; CHECK-NOT: fail
4
; External helper functions (scalar float/double and <4 x float>/<2 x double>
; variants) that the tests call before using their argument again, plus the
; generic llvm.sqrt intrinsics used by the scalar tests. All are declared
; without a body.
5declare float @test_f(float %f)
6declare double @test_d(double %f)
7declare <4 x float> @test_vf(<4 x float> %f)
8declare <2 x double> @test_vd(<2 x double> %f)
9declare float @llvm.sqrt.f32(float)
10declare double @llvm.sqrt.f64(double)
11
; x86 vector intrinsics exercised by the tests.
; The first group (sse.* and sse3.*.ps) operates on <4 x float>; the second
; group (sse2.* and sse3.*.pd) operates on <2 x double>. The rsqrt/sqrt/rcp
; intrinsics take one vector operand, min/max/addsub/hadd/hsub take two, and
; cmp takes two vectors plus an i8 immediate.
12declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>)
13declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>)
14declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>)
15declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
16declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
17declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8)
18declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>)
19declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)
20declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>)
21declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>)
22declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>)
23declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>)
24declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8)
25declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>)
26declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>)
27declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>)
28
; Scalar float case: calls @test_f with %f (the result %a is unused), then uses
; %f again after the call as the operand of llvm.sqrt.f32 and returns the sqrt.
; NOTE(review): presumably %f must be reloaded after the call and that reload
; is the fold candidate under test -- confirm against llc output.
29define float @foo(float %f) {
30  %a = call float @test_f(float %f)
31  %t = call float @llvm.sqrt.f32(float %f)
32  ret float %t
33}
; Scalar double case: calls @test_d with %f (the result %a is unused), then
; uses %f again after the call as the operand of llvm.sqrt.f64 and returns it.
; NOTE(review): presumably the post-call reload of %f is the fold candidate
; under test -- confirm against llc output.
34define double @doo(double %f) {
35  %a = call double @test_d(double %f)
36  %t = call double @llvm.sqrt.f64(double %f)
37  ret double %t
38}
; Unary <4 x float> case for sse.rsqrt.ps: @test_vf is called with %f (result
; %a unused), then %f is used after the call as the only intrinsic operand.
39define <4 x float> @a0(<4 x float> %f) {
40  %a = call <4 x float> @test_vf(<4 x float> %f)
41  %t = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %f)
42  ret <4 x float> %t
43}
; Unary <4 x float> case for sse.sqrt.ps: @test_vf is called with %f (result
; %a unused), then %f is used after the call as the only intrinsic operand.
44define <4 x float> @a1(<4 x float> %f) {
45  %a = call <4 x float> @test_vf(<4 x float> %f)
46  %t = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %f)
47  ret <4 x float> %t
48}
; Unary <4 x float> case for sse.rcp.ps: @test_vf is called with %f (result
; %a unused), then %f is used after the call as the only intrinsic operand.
49define <4 x float> @a2(<4 x float> %f) {
50  %a = call <4 x float> @test_vf(<4 x float> %f)
51  %t = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %f)
52  ret <4 x float> %t
53}
; Binary <4 x float> case for sse.min.ps: the call result %y is the first
; operand and %f, which is used after the call, is the second operand.
54define <4 x float> @b3(<4 x float> %f) {
55  %y = call <4 x float> @test_vf(<4 x float> %f)
56  %t = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %y, <4 x float> %f)
57  ret <4 x float> %t
58}
; Binary <4 x float> case for sse.max.ps: the call result %y is the first
; operand and %f, which is used after the call, is the second operand.
59define <4 x float> @b4(<4 x float> %f) {
60  %y = call <4 x float> @test_vf(<4 x float> %f)
61  %t = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %y, <4 x float> %f)
62  ret <4 x float> %t
63}
; Binary <4 x float> case for sse.cmp.ps with immediate 7: the call result %y
; is the first operand and %f, which is used after the call, is the second.
64define <4 x float> @b5(<4 x float> %f) {
65  %y = call <4 x float> @test_vf(<4 x float> %f)
66  %t = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %y, <4 x float> %f, i8 7)
67  ret <4 x float> %t
68}
; Binary <4 x float> case for sse3.addsub.ps: the call result %y is the first
; operand and %f, which is used after the call, is the second operand.
69define <4 x float> @b6(<4 x float> %f) {
70  %y = call <4 x float> @test_vf(<4 x float> %f)
71  %t = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %y, <4 x float> %f)
72  ret <4 x float> %t
73}
; Binary <4 x float> case for sse3.hadd.ps: the call result %y is the first
; operand and %f, which is used after the call, is the second operand.
74define <4 x float> @b7(<4 x float> %f) {
75  %y = call <4 x float> @test_vf(<4 x float> %f)
76  %t = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %y, <4 x float> %f)
77  ret <4 x float> %t
78}
; Binary <4 x float> case for sse3.hsub.ps: the call result %y is the first
; operand and %f, which is used after the call, is the second operand.
79define <4 x float> @b8(<4 x float> %f) {
80  %y = call <4 x float> @test_vf(<4 x float> %f)
81  %t = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %y, <4 x float> %f)
82  ret <4 x float> %t
83}
; Unary <2 x double> case for sse2.sqrt.pd: @test_vd is called with %f (result
; %a unused), then %f is used after the call as the only intrinsic operand.
84define <2 x double> @c1(<2 x double> %f) {
85  %a = call <2 x double> @test_vd(<2 x double> %f)
86  %t = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %f)
87  ret <2 x double> %t
88}
; Binary <2 x double> case for sse2.min.pd: the call result %y is the first
; operand and %f, which is used after the call, is the second operand.
89define <2 x double> @d3(<2 x double> %f) {
90  %y = call <2 x double> @test_vd(<2 x double> %f)
91  %t = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %y, <2 x double> %f)
92  ret <2 x double> %t
93}
; Binary <2 x double> case for sse2.max.pd: the call result %y is the first
; operand and %f, which is used after the call, is the second operand.
94define <2 x double> @d4(<2 x double> %f) {
95  %y = call <2 x double> @test_vd(<2 x double> %f)
96  %t = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %y, <2 x double> %f)
97  ret <2 x double> %t
98}
; Binary <2 x double> case for sse2.cmp.pd with immediate 7: the call result %y
; is the first operand and %f, which is used after the call, is the second.
99define <2 x double> @d5(<2 x double> %f) {
100  %y = call <2 x double> @test_vd(<2 x double> %f)
101  %t = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %y, <2 x double> %f, i8 7)
102  ret <2 x double> %t
103}
; Binary <2 x double> case for sse3.addsub.pd: the call result %y is the first
; operand and %f, which is used after the call, is the second operand.
104define <2 x double> @d6(<2 x double> %f) {
105  %y = call <2 x double> @test_vd(<2 x double> %f)
106  %t = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %y, <2 x double> %f)
107  ret <2 x double> %t
108}
; Binary <2 x double> case for sse3.hadd.pd: the call result %y is the first
; operand and %f, which is used after the call, is the second operand.
109define <2 x double> @d7(<2 x double> %f) {
110  %y = call <2 x double> @test_vd(<2 x double> %f)
111  %t = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %y, <2 x double> %f)
112  ret <2 x double> %t
113}
; Binary <2 x double> case for sse3.hsub.pd: the call result %y is the first
; operand and %f, which is used after the call, is the second operand.
114define <2 x double> @d8(<2 x double> %f) {
115  %y = call <2 x double> @test_vd(<2 x double> %f)
116  %t = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %y, <2 x double> %f)
117  ret <2 x double> %t
118}
119
120; This one should fail to fuse, but -regalloc=greedy isn't even trying. Instead
121; it produces:
122;   callq	test_vd
123;   movapd	(%rsp), %xmm1           # 16-byte Reload
124;   hsubpd	%xmm0, %xmm1
125;   movapd	%xmm1, %xmm0
126;   addq	$24, %rsp
127;   ret
128; RABasic still tries to fold this one, which is why the test is run with -regalloc=basic.
129
; Same sse3.hsub.pd intrinsic as @d8, but with the operands swapped: %f, which
; is used after the call, is the FIRST operand and the call result %y is the
; second.
; NOTE(review): presumably this is the one case that yields the expected "fail"
; report, since the value needing a reload sits in the first operand position
; -- confirm against llc output.
130define <2 x double> @z0(<2 x double> %f) {
131  %y = call <2 x double> @test_vd(<2 x double> %f)
132  %t = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %f, <2 x double> %y)
133  ret <2 x double> %t
134}
135