1; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 < %s | FileCheck %s
2
; Module-level target settings. The triple is the same one passed to llc via
; -mtriple on the command line at the top of this file.
3target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
4target triple = "x86_64-unknown-unknown"
5
6; Stack reload folding tests.
7;
8; By including a nop call with sideeffects we can force a partial register spill of the
9; relevant registers and check that the reload is correctly folded into the instruction.
10
; Packed-double add. The nop inline asm defines an unused xmm value ("=x") and
; clobbers xmm2-xmm15; per the file header this is what forces a spill. The
; expected output is an addpd whose memory operand is a 16-byte stack reload.
11define <2 x double> @stack_fold_addpd(<2 x double> %a0, <2 x double> %a1) {
12  ;CHECK-LABEL: stack_fold_addpd
13  ;CHECK:       addpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
14  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
15  %2 = fadd <2 x double> %a0, %a1
16  ret <2 x double> %2
17}
18
; Packed-float add. The nop inline asm clobbers xmm2-xmm15 to provoke a spill;
; the expected output is an addps fed by a 16-byte folded stack reload.
19define <4 x float> @stack_fold_addps(<4 x float> %a0, <4 x float> %a1) {
20  ;CHECK-LABEL: stack_fold_addps
21  ;CHECK:       addps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
22  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
23  %2 = fadd <4 x float> %a0, %a1
24  ret <4 x float> %2
25}
26
; Scalar-double add written as a plain IR fadd. Because the operands are
; scalar doubles, the expected folded reload into addsd is 8 bytes wide.
27define double @stack_fold_addsd(double %a0, double %a1) {
28  ;CHECK-LABEL: stack_fold_addsd
29  ;CHECK:       addsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
30  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
31  %2 = fadd double %a0, %a1
32  ret double %2
33}
34
; Intrinsic form of the scalar-double add: the operands are whole <2 x double>
; vectors passed to @llvm.x86.sse2.add.sd, so the expected folded reload into
; addsd is 16 bytes wide rather than 8.
35define <2 x double> @stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) {
36  ;CHECK-LABEL: stack_fold_addsd_int
37  ;CHECK:       addsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
38  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
39  %2 = call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a0, <2 x double> %a1)
40  ret <2 x double> %2
41}
42declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind readnone
43
; Scalar-float add written as a plain IR fadd. The operands are scalar floats,
; so the expected folded reload into addss is 4 bytes wide.
44define float @stack_fold_addss(float %a0, float %a1) {
45  ;CHECK-LABEL: stack_fold_addss
46  ;CHECK:       addss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
47  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
48  %2 = fadd float %a0, %a1
49  ret float %2
50}
51
; Intrinsic form of the scalar-float add: the operands are whole <4 x float>
; vectors passed to @llvm.x86.sse.add.ss, so the expected folded reload into
; addss is 16 bytes wide rather than 4.
52define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) {
53  ;CHECK-LABEL: stack_fold_addss_int
54  ;CHECK:       addss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
55  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
56  %2 = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1)
57  ret <4 x float> %2
58}
59declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind readnone
60
; Packed-double add/subtract via @llvm.x86.sse3.addsub.pd. The nop inline asm
; clobbers xmm2-xmm15 to provoke a spill; the expected output is an addsubpd
; fed by a 16-byte folded stack reload.
61define <2 x double> @stack_fold_addsubpd(<2 x double> %a0, <2 x double> %a1) {
62  ;CHECK-LABEL: stack_fold_addsubpd
63  ;CHECK:       addsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
64  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
65  %2 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1)
66  ret <2 x double> %2
67}
68declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>) nounwind readnone
69
; Packed-float add/subtract via @llvm.x86.sse3.addsub.ps. The nop inline asm
; clobbers xmm2-xmm15 to provoke a spill; the expected output is an addsubps
; fed by a 16-byte folded stack reload.
70define <4 x float> @stack_fold_addsubps(<4 x float> %a0, <4 x float> %a1) {
71  ;CHECK-LABEL: stack_fold_addsubps
72  ;CHECK:       addsubps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
73  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
74  %2 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1)
75  ret <4 x float> %2
76}
77declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>) nounwind readnone
78
; And-not on packed doubles, expressed on the integer view of the vectors:
; %a0 is inverted (xor with all-ones) and then and-ed with %a1. The expected
; output is an andnpd fed by a 16-byte folded stack reload.
79define <2 x double> @stack_fold_andnpd(<2 x double> %a0, <2 x double> %a1) {
80  ;CHECK-LABEL: stack_fold_andnpd
81  ;CHECK:       andnpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
82  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
83  %2 = bitcast <2 x double> %a0 to <2 x i64>
84  %3 = bitcast <2 x double> %a1 to <2 x i64>
85  %4 = xor <2 x i64> %2, <i64 -1, i64 -1>
86  %5 = and <2 x i64> %4, %3
87  %6 = bitcast <2 x i64> %5 to <2 x double>
88  ; fadd forces execution domain
89  %7 = fadd <2 x double> %6, <double 0x0, double 0x0>
90  ret <2 x double> %7
91}
92
; And-not on packed floats, expressed on the <2 x i64> view of the vectors:
; %a0 is inverted (xor with all-ones) and then and-ed with %a1. The expected
; output is an andnps fed by a 16-byte folded stack reload.
93define <4 x float> @stack_fold_andnps(<4 x float> %a0, <4 x float> %a1) {
94  ;CHECK-LABEL: stack_fold_andnps
95  ;CHECK:       andnps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
96  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
97  %2 = bitcast <4 x float> %a0 to <2 x i64>
98  %3 = bitcast <4 x float> %a1 to <2 x i64>
99  %4 = xor <2 x i64> %2, <i64 -1, i64 -1>
100  %5 = and <2 x i64> %4, %3
101  %6 = bitcast <2 x i64> %5 to <4 x float>
102  ; fadd forces execution domain
103  %7 = fadd <4 x float> %6, <float 0x0, float 0x0, float 0x0, float 0x0>
104  ret <4 x float> %7
105}
106
; Bitwise and on packed doubles, performed on the <2 x i64> view of both
; arguments. The expected output is an andpd fed by a 16-byte folded stack
; reload.
107define <2 x double> @stack_fold_andpd(<2 x double> %a0, <2 x double> %a1) {
108  ;CHECK-LABEL: stack_fold_andpd
109  ;CHECK:       andpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
110  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
111  %2 = bitcast <2 x double> %a0 to <2 x i64>
112  %3 = bitcast <2 x double> %a1 to <2 x i64>
113  %4 = and <2 x i64> %2, %3
114  %5 = bitcast <2 x i64> %4 to <2 x double>
115  ; fadd forces execution domain
116  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
117  ret <2 x double> %6
118}
119
; Bitwise and on packed floats, performed on the <2 x i64> view of both
; arguments. The expected output is an andps fed by a 16-byte folded stack
; reload.
120define <4 x float> @stack_fold_andps(<4 x float> %a0, <4 x float> %a1) {
121  ;CHECK-LABEL: stack_fold_andps
122  ;CHECK:       andps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
123  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
124  %2 = bitcast <4 x float> %a0 to <2 x i64>
125  %3 = bitcast <4 x float> %a1 to <2 x i64>
126  %4 = and <2 x i64> %2, %3
127  %5 = bitcast <2 x i64> %4 to <4 x float>
128  ; fadd forces execution domain
129  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
130  ret <4 x float> %6
131}
132
; Constant-mask blend of two <2 x double> vectors: the select takes lane 0
; from %a0 and lane 1 from %a1. The expected output is a blendpd with
; immediate $2 whose memory operand is a 16-byte folded stack reload.
133define <2 x double> @stack_fold_blendpd(<2 x double> %a0, <2 x double> %a1) {
134  ;CHECK-LABEL: stack_fold_blendpd
135  ;CHECK:       blendpd $2, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
136  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
137  %2 = select <2 x i1> <i1 1, i1 0>, <2 x double> %a0, <2 x double> %a1
138  ret <2 x double> %2
139}
140
; Constant-mask blend of two <4 x float> vectors: the select takes lanes 0 and
; 3 from %a0 and lanes 1 and 2 from %a1. The expected output is a blendps with
; immediate $6 whose memory operand is a 16-byte folded stack reload.
141define <4 x float> @stack_fold_blendps(<4 x float> %a0, <4 x float> %a1) {
142  ;CHECK-LABEL: stack_fold_blendps
143  ;CHECK:       blendps $6, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
144  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
145  %2 = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x float> %a0, <4 x float> %a1
146  ret <4 x float> %2
147}
148
; Variable blend of packed doubles via @llvm.x86.sse41.blendvpd. This test has
; three vector arguments, and the clobber list starts at xmm3 instead of xmm2.
; The intrinsic receives its operands as (%a1, %c, %a0).
; NOTE(review): %a0 is presumably passed last so that the mask ends up in the
; instruction's implicit xmm0 operand - confirm.
149define <2 x double> @stack_fold_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %c) {
150  ;CHECK-LABEL: stack_fold_blendvpd
151  ;CHECK:       blendvpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
152  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
153  %2 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a1, <2 x double> %c, <2 x double> %a0)
154  ret <2 x double> %2
155}
156declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
157
; Variable blend of packed floats via @llvm.x86.sse41.blendvps. This test has
; three vector arguments, and the clobber list starts at xmm3 instead of xmm2.
; The intrinsic receives its operands as (%a1, %c, %a0).
; NOTE(review): %a0 is presumably passed last so that the mask ends up in the
; instruction's implicit xmm0 operand - confirm.
158define <4 x float> @stack_fold_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %c) {
159  ;CHECK-LABEL: stack_fold_blendvps
160  ;CHECK:       blendvps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
161  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
162  %2 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a1, <4 x float> %c, <4 x float> %a0)
163  ret <4 x float> %2
164}
165declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
166
; Packed-double compare via @llvm.x86.sse2.cmp.pd with predicate immediate 0.
; The expected output spells this as cmpeqpd with a 16-byte folded stack
; reload as its memory operand.
167define <2 x double> @stack_fold_cmppd(<2 x double> %a0, <2 x double> %a1) {
168  ;CHECK-LABEL: stack_fold_cmppd
169  ;CHECK:       cmpeqpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
170  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
171  %2 = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 0)
172  ret <2 x double> %2
173}
174declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone
175
; Packed-float compare via @llvm.x86.sse.cmp.ps with predicate immediate 0.
; The expected output spells this as cmpeqps with a 16-byte folded stack
; reload as its memory operand.
176define <4 x float> @stack_fold_cmpps(<4 x float> %a0, <4 x float> %a1) {
177  ;CHECK-LABEL: stack_fold_cmpps
178  ;CHECK:       cmpeqps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
179  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
180  %2 = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 0)
181  ret <4 x float> %2
182}
183declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone
184
; Scalar-double ordered-equal compare written as IR fcmp oeq, with the i1
; result zero-extended to i32. The expected output is a cmpeqsd fed by an
; 8-byte folded stack reload.
185define i32 @stack_fold_cmpsd(double %a0, double %a1) {
186  ;CHECK-LABEL: stack_fold_cmpsd
187  ;CHECK:       cmpeqsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
188  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
189  %2 = fcmp oeq double %a0, %a1
190  %3 = zext i1 %2 to i32
191  ret i32 %3
192}
193
; Intrinsic form of the scalar-double compare (@llvm.x86.sse2.cmp.sd,
; predicate immediate 0). Operands are whole vectors, so the expected folded
; reload into cmpeqsd is 16 bytes wide.
194define <2 x double> @stack_fold_cmpsd_int(<2 x double> %a0, <2 x double> %a1) {
195  ;CHECK-LABEL: stack_fold_cmpsd_int
196  ;CHECK:       cmpeqsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
197  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
198  %2 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 0)
199  ret <2 x double> %2
200}
201declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone
202
; Scalar-float ordered-equal compare written as IR fcmp oeq, with the i1
; result zero-extended to i32. The expected output is a cmpeqss fed by a
; 4-byte folded stack reload.
203define i32 @stack_fold_cmpss(float %a0, float %a1) {
204  ;CHECK-LABEL: stack_fold_cmpss
205  ;CHECK:       cmpeqss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
206  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
207  %2 = fcmp oeq float %a0, %a1
208  %3 = zext i1 %2 to i32
209  ret i32 %3
210}
211
; Intrinsic form of the scalar-float compare (@llvm.x86.sse.cmp.ss, predicate
; immediate 0). Operands are whole vectors, so the expected folded reload into
; cmpeqss is 16 bytes wide.
212define <4 x float> @stack_fold_cmpss_int(<4 x float> %a0, <4 x float> %a1) {
213  ;CHECK-LABEL: stack_fold_cmpss_int
214  ;CHECK:       cmpeqss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
215  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
216  %2 = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
217  ret <4 x float> %2
218}
219declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone
220
221; TODO stack_fold_comisd
222
; Scalar-double ordered compare via @llvm.x86.sse2.comieq.sd, returning i32.
; The expected output is a comisd whose memory operand is a 16-byte folded
; stack reload.
223define i32 @stack_fold_comisd_int(<2 x double> %a0, <2 x double> %a1) {
224  ;CHECK-LABEL: stack_fold_comisd_int
225  ;CHECK:       comisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
226  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
227  %2 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1)
228  ret i32 %2
229}
230declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readnone
231
232; TODO stack_fold_comiss
233
; Scalar-float ordered compare via @llvm.x86.sse.comieq.ss, returning i32.
; The expected output is a comiss whose memory operand is a 16-byte folded
; stack reload.
234define i32 @stack_fold_comiss_int(<4 x float> %a0, <4 x float> %a1) {
235  ;CHECK-LABEL: stack_fold_comiss_int
236  ;CHECK:       comiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
237  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
238  %2 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
239  ret i32 %2
240}
241declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
242
; Signed i32 -> double conversion of the two low lanes of %a0 (extracted with
; a shufflevector, then sitofp). There is a single vector argument, so the
; nop inline asm clobbers xmm1-xmm15. The expected output is a cvtdq2pd fed by
; a 16-byte folded stack reload.
243define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) {
244  ;CHECK-LABEL: stack_fold_cvtdq2pd
245  ;CHECK:       cvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
246  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
247  %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
248  %3 = sitofp <2 x i32> %2 to <2 x double>
249  ret <2 x double> %3
250}
251
; Intrinsic form of the i32 -> double conversion (@llvm.x86.sse2.cvtdq2pd).
; The nop inline asm clobbers xmm1-xmm15; the expected output is a cvtdq2pd
; fed by a 16-byte folded stack reload.
252define <2 x double> @stack_fold_cvtdq2pd_int(<4 x i32> %a0) {
253  ;CHECK-LABEL: stack_fold_cvtdq2pd_int
254  ;CHECK:       cvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
255  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
256  %2 = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0)
257  ret <2 x double> %2
258}
259declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone
260
; Signed <4 x i32> -> <4 x float> conversion written as IR sitofp. The nop
; inline asm clobbers xmm1-xmm15; the expected output is a cvtdq2ps fed by a
; 16-byte folded stack reload.
261define <4 x float> @stack_fold_cvtdq2ps(<4 x i32> %a0) {
262  ;CHECK-LABEL: stack_fold_cvtdq2ps
263  ;CHECK:       cvtdq2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
264  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
265  %2 = sitofp <4 x i32> %a0 to <4 x float>
266  ret <4 x float> %2
267}
268
; Packed double -> i32 conversion via @llvm.x86.sse2.cvtpd2dq. The nop inline
; asm clobbers xmm1-xmm15; the expected output is a cvtpd2dq fed by a 16-byte
; folded stack reload.
269define <4 x i32> @stack_fold_cvtpd2dq(<2 x double> %a0) {
270  ;CHECK-LABEL: stack_fold_cvtpd2dq
271  ;CHECK:       cvtpd2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
272  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
273  %2 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0)
274  ret <4 x i32> %2
275}
276declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone
277
; Packed double -> float narrowing written as IR fptrunc on <2 x double>. The
; nop inline asm clobbers xmm1-xmm15; the expected output is a cvtpd2ps fed by
; a 16-byte folded stack reload.
278define <2 x float> @stack_fold_cvtpd2ps(<2 x double> %a0) {
279  ;CHECK-LABEL: stack_fold_cvtpd2ps
280  ;CHECK:       cvtpd2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
281  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
282  %2 = fptrunc <2 x double> %a0 to <2 x float>
283  ret <2 x float> %2
284}
285
; Packed float -> i32 conversion via @llvm.x86.sse2.cvtps2dq. The nop inline
; asm clobbers xmm1-xmm15; the expected output is a cvtps2dq fed by a 16-byte
; folded stack reload.
286define <4 x i32> @stack_fold_cvtps2dq(<4 x float> %a0) {
287  ;CHECK-LABEL: stack_fold_cvtps2dq
288  ;CHECK:       cvtps2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
289  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
290  %2 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0)
291  ret <4 x i32> %2
292}
293declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone
294
; Float -> double widening of the two low lanes of %a0 (extracted with a
; shufflevector, then fpext). The nop inline asm clobbers xmm1-xmm15; the
; expected output is a cvtps2pd fed by a 16-byte folded stack reload.
295define <2 x double> @stack_fold_cvtps2pd(<4 x float> %a0) {
296  ;CHECK-LABEL: stack_fold_cvtps2pd
297  ;CHECK:       cvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
298  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
299  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <2 x i32> <i32 0, i32 1>
300  %3 = fpext <2 x float> %2 to <2 x double>
301  ret <2 x double> %3
302}
303
; Intrinsic form of the float -> double widening (@llvm.x86.sse2.cvtps2pd).
; The nop inline asm clobbers xmm1-xmm15; the expected output is a cvtps2pd
; fed by a 16-byte folded stack reload.
304define <2 x double> @stack_fold_cvtps2pd_int(<4 x float> %a0) {
305  ;CHECK-LABEL: stack_fold_cvtps2pd_int
306  ;CHECK:       cvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
307  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
308  %2 = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0)
309  ret <2 x double> %2
310}
311declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone
312
313; TODO stack_fold_cvtsd2si
314
; Scalar double -> i32 conversion via @llvm.x86.sse2.cvtsd2si. The expected
; output is a cvtsd2si that writes %eax and reads its source from a 16-byte
; folded stack reload.
315define i32 @stack_fold_cvtsd2si_int(<2 x double> %a0) {
316  ;CHECK-LABEL: stack_fold_cvtsd2si_int
317  ;CHECK:       cvtsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
318  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
319  %2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0)
320  ret i32 %2
321}
322declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
323
324; TODO stack_fold_cvtsd2si64
325
; Scalar double -> i64 conversion via @llvm.x86.sse2.cvtsd2si64. The expected
; output is a cvtsd2si that writes %rax and reads its source from a 16-byte
; folded stack reload.
326define i64 @stack_fold_cvtsd2si64_int(<2 x double> %a0) {
327  ;CHECK-LABEL: stack_fold_cvtsd2si64_int
328  ;CHECK:       cvtsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
329  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
330  %2 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0)
331  ret i64 %2
332}
333declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
334
; Scalar double -> float narrowing written as IR fptrunc. The expected output
; is a cvtsd2ss fed by an 8-byte folded stack reload.
; NOTE(review): the function is marked minsize; presumably the fold is only
; chosen when optimizing for size - confirm.
335define float @stack_fold_cvtsd2ss(double %a0) minsize {
336  ;CHECK-LABEL: stack_fold_cvtsd2ss
337  ;CHECK:       cvtsd2ss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
338  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
339  %2 = fptrunc double %a0 to float
340  ret float %2
341}
342
; Intrinsic form of the double -> float narrowing (@llvm.x86.sse2.cvtsd2ss)
; with an all-zero pass-through vector as the first operand. The expected
; output is a cvtsd2ss fed by a 16-byte folded stack reload.
; NOTE(review): the function is marked optsize; presumably the fold is only
; chosen when optimizing for size - confirm.
343define <4 x float> @stack_fold_cvtsd2ss_int(<2 x double> %a0) optsize {
344  ;CHECK-LABEL: stack_fold_cvtsd2ss_int
345  ;CHECK:       cvtsd2ss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
346  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
347  %2 = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, <2 x double> %a0)
348  ret <4 x float> %2
349}
350declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone
351
; Signed i32 -> double conversion written as IR sitofp. The argument is an
; integer, so here the nop inline asm clobbers the general-purpose registers
; (rax..r15 as listed) instead of xmm registers. The expected output is a
; cvtsi2sdl fed by a 4-byte folded stack reload.
; NOTE(review): the function is marked minsize; presumably the fold is only
; chosen when optimizing for size - confirm.
352define double @stack_fold_cvtsi2sd(i32 %a0) minsize {
353  ;CHECK-LABEL: stack_fold_cvtsi2sd
354  ;CHECK:       cvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
355  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
356  %2 = sitofp i32 %a0 to double
357  ret double %2
358}
359
; Intrinsic form of the i32 -> double conversion (@llvm.x86.sse2.cvtsi2sd)
; with an all-zero pass-through vector as the first operand. The nop inline
; asm clobbers the listed general-purpose registers; the expected output is a
; cvtsi2sdl fed by a 4-byte folded stack reload.
360define <2 x double> @stack_fold_cvtsi2sd_int(i32 %a0) {
361  ;CHECK-LABEL: stack_fold_cvtsi2sd_int
362  ;CHECK:       cvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
363  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
364  %2 = call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> <double 0x0, double 0x0>, i32 %a0)
365  ret <2 x double> %2
366}
367declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone
368
; Signed i64 -> double conversion written as IR sitofp. The nop inline asm
; clobbers the listed general-purpose registers; the expected output is a
; cvtsi2sdq fed by an 8-byte folded stack reload.
; NOTE(review): the function is marked optsize; presumably the fold is only
; chosen when optimizing for size - confirm.
369define double @stack_fold_cvtsi642sd(i64 %a0) optsize {
370  ;CHECK-LABEL: stack_fold_cvtsi642sd
371  ;CHECK:       cvtsi2sdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
372  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
373  %2 = sitofp i64 %a0 to double
374  ret double %2
375}
376
; Intrinsic form of the i64 -> double conversion (@llvm.x86.sse2.cvtsi642sd)
; with an all-zero pass-through vector as the first operand. The nop inline
; asm clobbers the listed general-purpose registers; the expected output is a
; cvtsi2sdq fed by an 8-byte folded stack reload.
377define <2 x double> @stack_fold_cvtsi642sd_int(i64 %a0) {
378  ;CHECK-LABEL: stack_fold_cvtsi642sd_int
379  ;CHECK:       cvtsi2sdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
380  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
381  %2 = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> <double 0x0, double 0x0>, i64 %a0)
382  ret <2 x double> %2
383}
384declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone
385
; Signed i32 -> float conversion written as IR sitofp. The nop inline asm
; clobbers the listed general-purpose registers; the expected output is a
; cvtsi2ssl fed by a 4-byte folded stack reload.
; NOTE(review): the function is marked minsize; presumably the fold is only
; chosen when optimizing for size - confirm.
386define float @stack_fold_cvtsi2ss(i32 %a0) minsize {
387  ;CHECK-LABEL: stack_fold_cvtsi2ss
388  ;CHECK:       cvtsi2ssl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
389  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
390  %2 = sitofp i32 %a0 to float
391  ret float %2
392}
393
; Intrinsic form of the i32 -> float conversion (@llvm.x86.sse.cvtsi2ss) with
; an all-zero pass-through vector as the first operand. The nop inline asm
; clobbers the listed general-purpose registers; the expected output is a
; cvtsi2ssl fed by a 4-byte folded stack reload.
394define <4 x float> @stack_fold_cvtsi2ss_int(i32 %a0) {
395  ;CHECK-LABEL: stack_fold_cvtsi2ss_int
396  ;CHECK:  cvtsi2ssl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
397  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
398  %2 = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, i32 %a0)
399  ret <4 x float> %2
400}
401declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone
402
; Signed i64 -> float conversion written as IR sitofp. The nop inline asm
; clobbers the listed general-purpose registers; the expected output is a
; cvtsi2ssq fed by an 8-byte folded stack reload.
; NOTE(review): the function is marked optsize; presumably the fold is only
; chosen when optimizing for size - confirm.
403define float @stack_fold_cvtsi642ss(i64 %a0) optsize {
404  ;CHECK-LABEL: stack_fold_cvtsi642ss
405  ;CHECK:       cvtsi2ssq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
406  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
407  %2 = sitofp i64 %a0 to float
408  ret float %2
409}
410
; Intrinsic form of the i64 -> float conversion (@llvm.x86.sse.cvtsi642ss)
; with an all-zero pass-through vector as the first operand. The nop inline
; asm clobbers the listed general-purpose registers; the expected output is a
; cvtsi2ssq fed by an 8-byte folded stack reload.
411define <4 x float> @stack_fold_cvtsi642ss_int(i64 %a0) {
412  ;CHECK-LABEL: stack_fold_cvtsi642ss_int
413  ;CHECK:  cvtsi2ssq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
414  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
415  %2 = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, i64 %a0)
416  ret <4 x float> %2
417}
418declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
419
; Scalar float -> double widening written as IR fpext. The nop inline asm
; clobbers xmm1-xmm15; the expected output is a cvtss2sd fed by a 4-byte
; folded stack reload.
; NOTE(review): the function is marked minsize; presumably the fold is only
; chosen when optimizing for size - confirm.
420define double @stack_fold_cvtss2sd(float %a0) minsize {
421  ;CHECK-LABEL: stack_fold_cvtss2sd
422  ;CHECK:       cvtss2sd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
423  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
424  %2 = fpext float %a0 to double
425  ret double %2
426}
427
; Intrinsic form of the float -> double widening (@llvm.x86.sse2.cvtss2sd)
; with an all-zero pass-through vector as the first operand. The expected
; output is a cvtss2sd fed by a 16-byte folded stack reload.
; NOTE(review): the function is marked optsize; presumably the fold is only
; chosen when optimizing for size - confirm.
428define <2 x double> @stack_fold_cvtss2sd_int(<4 x float> %a0) optsize {
429  ;CHECK-LABEL: stack_fold_cvtss2sd_int
430  ;CHECK:       cvtss2sd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
431  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
432  %2 = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> <double 0x0, double 0x0>, <4 x float> %a0)
433  ret <2 x double> %2
434}
435declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind readnone
436
437; TODO stack_fold_cvtss2si
438
; Scalar float -> i32 conversion via @llvm.x86.sse.cvtss2si. The expected
; output is a cvtss2si that writes %eax and reads its source from a 16-byte
; folded stack reload.
439define i32 @stack_fold_cvtss2si_int(<4 x float> %a0) {
440  ;CHECK-LABEL: stack_fold_cvtss2si_int
441  ;CHECK:       cvtss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
442  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
443  %2 = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
444  ret i32 %2
445}
446declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
447
448; TODO stack_fold_cvtss2si64
449
; Scalar float -> i64 conversion via @llvm.x86.sse.cvtss2si64. The expected
; output is a cvtss2si that writes %rax and reads its source from a 16-byte
; folded stack reload.
450define i64 @stack_fold_cvtss2si64_int(<4 x float> %a0) {
451  ;CHECK-LABEL: stack_fold_cvtss2si64_int
452  ;CHECK:       cvtss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
453  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
454  %2 = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0)
455  ret i64 %2
456}
457declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
458
; Packed double -> i32 conversion via @llvm.x86.sse2.cvttpd2dq. The nop inline
; asm clobbers xmm1-xmm15; the expected output is a cvttpd2dq fed by a 16-byte
; folded stack reload.
459define <4 x i32> @stack_fold_cvttpd2dq(<2 x double> %a0) {
460  ;CHECK-LABEL: stack_fold_cvttpd2dq
461  ;CHECK:       cvttpd2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
462  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
463  %2 = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)
464  ret <4 x i32> %2
465}
466declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
467
; Packed float -> signed i32 conversion written as IR fptosi on <4 x float>.
; The nop inline asm clobbers xmm1-xmm15; the expected output is a cvttps2dq
; fed by a 16-byte folded stack reload.
468define <4 x i32> @stack_fold_cvttps2dq(<4 x float> %a0) {
469  ;CHECK-LABEL: stack_fold_cvttps2dq
470  ;CHECK:       cvttps2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
471  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
472  %2 = fptosi <4 x float> %a0 to <4 x i32>
473  ret <4 x i32> %2
474}
475
; Scalar double -> signed i32 conversion written as IR fptosi. The expected
; output is a cvttsd2si that writes %eax and reads its source from an 8-byte
; folded stack reload.
476define i32 @stack_fold_cvttsd2si(double %a0) {
477  ;CHECK-LABEL: stack_fold_cvttsd2si
478  ;CHECK:       cvttsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 8-byte Folded Reload
479  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
480  %2 = fptosi double %a0 to i32
481  ret i32 %2
482}
483
; Intrinsic form of the double -> i32 conversion (@llvm.x86.sse2.cvttsd2si).
; The operand is a whole vector, so the expected folded reload into cvttsd2si
; (destination %eax) is 16 bytes wide.
484define i32 @stack_fold_cvttsd2si_int(<2 x double> %a0) {
485  ;CHECK-LABEL: stack_fold_cvttsd2si_int
486  ;CHECK:       cvttsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
487  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
488  %2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0)
489  ret i32 %2
490}
491declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
492
; Scalar double -> signed i64 conversion written as IR fptosi. The expected
; output is a cvttsd2si that writes %rax and reads its source from an 8-byte
; folded stack reload.
493define i64 @stack_fold_cvttsd2si64(double %a0) {
494  ;CHECK-LABEL: stack_fold_cvttsd2si64
495  ;CHECK:       cvttsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
496  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
497  %2 = fptosi double %a0 to i64
498  ret i64 %2
499}
500
; Intrinsic form of the double -> i64 conversion (@llvm.x86.sse2.cvttsd2si64).
; The operand is a whole vector, so the expected folded reload into cvttsd2si
; (destination %rax) is 16 bytes wide.
501define i64 @stack_fold_cvttsd2si64_int(<2 x double> %a0) {
502  ;CHECK-LABEL: stack_fold_cvttsd2si64_int
503  ;CHECK:       cvttsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
504  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
505  %2 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0)
506  ret i64 %2
507}
508declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
509
; Scalar float -> signed i32 conversion written as IR fptosi. The expected
; output is a cvttss2si that writes %eax and reads its source from a 4-byte
; folded stack reload.
510define i32 @stack_fold_cvttss2si(float %a0) {
511  ;CHECK-LABEL: stack_fold_cvttss2si
512  ;CHECK:       cvttss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
513  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
514  %2 = fptosi float %a0 to i32
515  ret i32 %2
516}
517
; Intrinsic form of the float -> i32 conversion (@llvm.x86.sse.cvttss2si).
; The operand is a whole vector, so the expected folded reload into cvttss2si
; (destination %eax) is 16 bytes wide.
518define i32 @stack_fold_cvttss2si_int(<4 x float> %a0) {
519  ;CHECK-LABEL: stack_fold_cvttss2si_int
520  ;CHECK:       cvttss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
521  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
522  %2 = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
523  ret i32 %2
524}
525declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
526
; Scalar float -> signed i64 conversion written as IR fptosi. The expected
; output is a cvttss2si that writes %rax and reads its source from a 4-byte
; folded stack reload.
527define i64 @stack_fold_cvttss2si64(float %a0) {
528  ;CHECK-LABEL: stack_fold_cvttss2si64
529  ;CHECK:       cvttss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 4-byte Folded Reload
530  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
531  %2 = fptosi float %a0 to i64
532  ret i64 %2
533}
534
; Intrinsic form of the float -> i64 conversion (@llvm.x86.sse.cvttss2si64).
; The operand is a whole vector, so the expected folded reload into cvttss2si
; (destination %rax) is 16 bytes wide.
535define i64 @stack_fold_cvttss2si64_int(<4 x float> %a0) {
536  ;CHECK-LABEL: stack_fold_cvttss2si64_int
537  ;CHECK:       cvttss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
538  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
539  %2 = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0)
540  ret i64 %2
541}
542declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
543
; Packed double division (fdiv <2 x double>).
; The "nop" asm clobbers xmm2-xmm15 and flags; the test expects divpd to take
; its memory operand from a stack slot as a 16-byte folded reload.
544define <2 x double> @stack_fold_divpd(<2 x double> %a0, <2 x double> %a1) {
545  ;CHECK-LABEL: stack_fold_divpd
546  ;CHECK:       divpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
547  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
548  %2 = fdiv <2 x double> %a0, %a1
549  ret <2 x double> %2
550}
551
; Packed float division (fdiv <4 x float>).
; With xmm2-xmm15 clobbered by the "nop" asm, the test expects divps to use a
; stack-slot memory operand (16-byte folded reload).
552define <4 x float> @stack_fold_divps(<4 x float> %a0, <4 x float> %a1) {
553  ;CHECK-LABEL: stack_fold_divps
554  ;CHECK:       divps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
555  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
556  %2 = fdiv <4 x float> %a0, %a1
557  ret <4 x float> %2
558}
559
; Scalar double division (fdiv double).
; The test expects divsd with a stack-slot memory operand; because the
; operands are plain doubles the folded reload is 8 bytes.
560define double @stack_fold_divsd(double %a0, double %a1) {
561  ;CHECK-LABEL: stack_fold_divsd
562  ;CHECK:       divsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
563  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
564  %2 = fdiv double %a0, %a1
565  ret double %2
566}
567
; Intrinsic form of scalar double division (@llvm.x86.sse2.div.sd).
; Operands are whole <2 x double> vectors, so the expected folded reload for
; divsd is the 16-byte vector slot rather than 8 bytes.
568define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) {
569  ;CHECK-LABEL: stack_fold_divsd_int
570  ;CHECK:       divsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
571  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
572  %2 = call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a0, <2 x double> %a1)
573  ret <2 x double> %2
574}
575declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone
576
; Scalar float division (fdiv float).
; The test expects divss with a stack-slot memory operand and a 4-byte folded
; reload; the "nop" asm clobbers xmm2-xmm15 and flags.
577define float @stack_fold_divss(float %a0, float %a1) {
578  ;CHECK-LABEL: stack_fold_divss
579  ;CHECK:       divss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
580  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
581  %2 = fdiv float %a0, %a1
582  ret float %2
583}
584
; Intrinsic form of scalar float division (@llvm.x86.sse.div.ss).
; Operands are <4 x float> vectors; the expected folded reload for divss is
; therefore 16 bytes.
585define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) {
586  ;CHECK-LABEL: stack_fold_divss_int
587  ;CHECK:       divss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
588  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
589  %2 = call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a0, <4 x float> %a1)
590  ret <4 x float> %2
591}
592declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind readnone
593
; Packed double dot product via @llvm.x86.sse41.dppd with immediate 7.
; The test expects the same immediate ($7) on a dppd whose memory operand is
; a 16-byte folded reload from the stack.
594define <2 x double> @stack_fold_dppd(<2 x double> %a0, <2 x double> %a1) {
595  ;CHECK-LABEL: stack_fold_dppd
596  ;CHECK:       dppd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
597  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
598  %2 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
599  ret <2 x double> %2
600}
601declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone
602
; Packed float dot product via @llvm.x86.sse41.dpps with immediate 7.
; The test expects dpps $7 with a stack-slot memory operand (16-byte folded
; reload).
603define <4 x float> @stack_fold_dpps(<4 x float> %a0, <4 x float> %a1) {
604  ;CHECK-LABEL: stack_fold_dpps
605  ;CHECK:       dpps $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
606  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
607  %2 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
608  ret <4 x float> %2
609}
610declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
611
; Folded *spill* test (store side) rather than a folded reload.
; Element 1 of %a0 is extracted and bitcast to i32 before the "nop" asm, and
; returned after it, so the value has to survive the asm. Unlike the other
; tests, the asm here clobbers the general-purpose registers rax..r15 (no xmm
; registers and no flags are listed). The test expects extractps to store
; directly into the stack slot (4-byte folded spill), followed by a plain movl
; reload into %eax.
612define i32 @stack_fold_extractps(<4 x float> %a0) {
613  ;CHECK-LABEL: stack_fold_extractps
614  ;CHECK:       extractps $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
615  ;CHECK:       movl    {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
616  %1 = extractelement <4 x float> %a0, i32 1
617  %2 = bitcast float %1 to i32
618  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
619  ret i32 %2
620}
621
; Horizontal add of packed doubles via @llvm.x86.sse3.hadd.pd.
; The test expects haddpd to take its memory operand from the stack as a
; 16-byte folded reload.
622define <2 x double> @stack_fold_haddpd(<2 x double> %a0, <2 x double> %a1) {
623  ;CHECK-LABEL: stack_fold_haddpd
624  ;CHECK:       haddpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
625  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
626  %2 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1)
627  ret <2 x double> %2
628}
629declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>) nounwind readnone
630
; Horizontal add of packed floats via @llvm.x86.sse3.hadd.ps.
; The test expects haddps with a stack-slot memory operand (16-byte folded
; reload).
631define <4 x float> @stack_fold_haddps(<4 x float> %a0, <4 x float> %a1) {
632  ;CHECK-LABEL: stack_fold_haddps
633  ;CHECK:       haddps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
634  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
635  %2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1)
636  ret <4 x float> %2
637}
638declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
639
; Horizontal subtract of packed doubles via @llvm.x86.sse3.hsub.pd.
; The test expects hsubpd with a stack-slot memory operand (16-byte folded
; reload).
640define <2 x double> @stack_fold_hsubpd(<2 x double> %a0, <2 x double> %a1) {
641  ;CHECK-LABEL: stack_fold_hsubpd
642  ;CHECK:       hsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
643  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
644  %2 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1)
645  ret <2 x double> %2
646}
647declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>) nounwind readnone
648
; Horizontal subtract of packed floats via @llvm.x86.sse3.hsub.ps.
; The test expects hsubps with a stack-slot memory operand (16-byte folded
; reload).
649define <4 x float> @stack_fold_hsubps(<4 x float> %a0, <4 x float> %a1) {
650  ;CHECK-LABEL: stack_fold_hsubps
651  ;CHECK:       hsubps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
652  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
653  %2 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1)
654  ret <4 x float> %2
655}
656declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone
657
; insertps via @llvm.x86.sse41.insertps with IR immediate 209 (0xD1).
; The expected instruction carries immediate 17 (0x11) instead: the two
; values differ only in the top two bits. The following directive also pins
; the decoded shuffle comment (zero, mem[0], xmm0[2,3]).
; NOTE(review): the top two immediate bits are presumably the source-element
; selector, which would be absorbed into the memory address once the operand
; is folded -- confirm against the INSERTPS definition.
658define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
659  ;CHECK-LABEL: stack_fold_insertps
660  ;CHECK:       insertps $17, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
661  ;CHECK-NEXT:                                                        {{.*#+}} xmm0 = zero,mem[0],xmm0[2,3]
662  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
663  %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209)
664  ret <4 x float> %2
665}
666declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
667
; Packed double maximum via @llvm.x86.sse2.max.pd.
; The test expects maxpd with a stack-slot memory operand (16-byte folded
; reload); the "nop" asm clobbers xmm2-xmm15 and flags.
668define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) {
669  ;CHECK-LABEL: stack_fold_maxpd
670  ;CHECK:       maxpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
671  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
672  %2 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
673  ret <2 x double> %2
674}
675declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
676
; Packed float maximum via @llvm.x86.sse.max.ps.
; The test expects maxps with a stack-slot memory operand (16-byte folded
; reload).
677define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) {
678  ;CHECK-LABEL: stack_fold_maxps
679  ;CHECK:       maxps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
680  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
681  %2 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
682  ret <4 x float> %2
683}
684declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
685
; Scalar double maximum written as generic IR: an ordered greater-than
; compare followed by a select between the two inputs. The test expects this
; pattern to be emitted as maxsd with an 8-byte folded reload from the stack.
686define double @stack_fold_maxsd(double %a0, double %a1) {
687  ;CHECK-LABEL: stack_fold_maxsd
688  ;CHECK:       maxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
689  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
690  %2 = fcmp ogt double %a0, %a1
691  %3 = select i1 %2, double %a0, double %a1
692  ret double %3
693}
694
; Intrinsic form of scalar double maximum (@llvm.x86.sse2.max.sd).
; Operands are <2 x double> vectors, so the expected folded reload for maxsd
; is 16 bytes.
695define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) {
696  ;CHECK-LABEL: stack_fold_maxsd_int
697  ;CHECK:       maxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
698  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
699  %2 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1)
700  ret <2 x double> %2
701}
702declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
703
; Scalar float maximum written as generic IR (fcmp ogt + select).
; The test expects this to be emitted as maxss with a 4-byte folded reload
; from the stack.
704define float @stack_fold_maxss(float %a0, float %a1) {
705  ;CHECK-LABEL: stack_fold_maxss
706  ;CHECK:       maxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
707  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
708  %2 = fcmp ogt float %a0, %a1
709  %3 = select i1 %2, float %a0, float %a1
710  ret float %3
711}
712
; Intrinsic form of scalar float maximum (@llvm.x86.sse.max.ss).
; Operands are <4 x float> vectors; the expected folded reload for maxss is
; 16 bytes.
713define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) {
714  ;CHECK-LABEL: stack_fold_maxss_int
715  ;CHECK:       maxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
716  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
717  %2 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
718  ret <4 x float> %2
719}
720declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
721
; Packed double minimum via @llvm.x86.sse2.min.pd.
; The test expects minpd with a stack-slot memory operand (16-byte folded
; reload); the "nop" asm clobbers xmm2-xmm15 and flags.
722define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) {
723  ;CHECK-LABEL: stack_fold_minpd
724  ;CHECK:       minpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
725  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
726  %2 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
727  ret <2 x double> %2
728}
729declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
730
; Packed float minimum via @llvm.x86.sse.min.ps.
; The test expects minps with a stack-slot memory operand (16-byte folded
; reload).
731define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) {
732  ;CHECK-LABEL: stack_fold_minps
733  ;CHECK:       minps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
734  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
735  %2 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
736  ret <4 x float> %2
737}
738declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
739
; Scalar double minimum written as generic IR: an ordered less-than compare
; followed by a select between the two inputs. The test expects minsd with
; an 8-byte folded reload from the stack.
740define double @stack_fold_minsd(double %a0, double %a1) {
741  ;CHECK-LABEL: stack_fold_minsd
742  ;CHECK:       minsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
743  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
744  %2 = fcmp olt double %a0, %a1
745  %3 = select i1 %2, double %a0, double %a1
746  ret double %3
747}
748
; Intrinsic form of scalar double minimum (@llvm.x86.sse2.min.sd).
; Operands are <2 x double> vectors, so the expected folded reload for minsd
; is 16 bytes.
749define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) {
750  ;CHECK-LABEL: stack_fold_minsd_int
751  ;CHECK:       minsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
752  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
753  %2 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1)
754  ret <2 x double> %2
755}
756declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
757
; Scalar float minimum written as generic IR (fcmp olt + select).
; The test expects this to be emitted as minss with a 4-byte folded reload
; from the stack.
758define float @stack_fold_minss(float %a0, float %a1) {
759  ;CHECK-LABEL: stack_fold_minss
760  ;CHECK:       minss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
761  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
762  %2 = fcmp olt float %a0, %a1
763  %3 = select i1 %2, float %a0, float %a1
764  ret float %3
765}
766
; Intrinsic form of scalar float minimum (@llvm.x86.sse.min.ss).
; Operands are <4 x float> vectors; the expected folded reload for minss is
; 16 bytes.
767define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) {
768  ;CHECK-LABEL: stack_fold_minss_int
769  ;CHECK:       minss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
770  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
771  %2 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
772  ret <4 x float> %2
773}
774declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
775
; Duplicate element 0 of a <2 x double> into both lanes (shuffle mask <0,0>).
; The "nop" asm clobbers xmm1-xmm15 and flags; the test expects movddup to
; read its source from the stack, annotated as a 16-byte folded reload.
776define <2 x double> @stack_fold_movddup(<2 x double> %a0) {
777  ;CHECK-LABEL: stack_fold_movddup
778  ;CHECK:   movddup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
779  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
780  %2 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 0, i32 0>
781  ret <2 x double> %2
782}
783; TODO stack_fold_movhpd (load / store)
784; TODO stack_fold_movhps (load / store)
785
786; TODO stack_fold_movlpd (load / store)
787; TODO stack_fold_movlps (load / store)
788
; Duplicate the odd elements of a <4 x float> (shuffle mask <1,1,3,3>).
; The test expects movshdup with a stack-slot source (16-byte folded reload).
789define <4 x float> @stack_fold_movshdup(<4 x float> %a0) {
790  ;CHECK-LABEL: stack_fold_movshdup
791  ;CHECK:       movshdup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
792  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
793  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
794  ret <4 x float> %2
795}
796
; Duplicate the even elements of a <4 x float> (shuffle mask <0,0,2,2>).
; The test expects movsldup with a stack-slot source (16-byte folded reload).
797define <4 x float> @stack_fold_movsldup(<4 x float> %a0) {
798  ;CHECK-LABEL: stack_fold_movsldup
799  ;CHECK:       movsldup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
800  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
801  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
802  ret <4 x float> %2
803}
804
; Packed double multiplication (fmul <2 x double>).
; The "nop" asm clobbers xmm2-xmm15 and flags; the test expects mulpd with a
; stack-slot memory operand (16-byte folded reload).
805define <2 x double> @stack_fold_mulpd(<2 x double> %a0, <2 x double> %a1) {
806  ;CHECK-LABEL: stack_fold_mulpd
807  ;CHECK:       mulpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
808  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
809  %2 = fmul <2 x double> %a0, %a1
810  ret <2 x double> %2
811}
812
; Packed float multiplication (fmul <4 x float>).
; The test expects mulps with a stack-slot memory operand (16-byte folded
; reload).
813define <4 x float> @stack_fold_mulps(<4 x float> %a0, <4 x float> %a1) {
814  ;CHECK-LABEL: stack_fold_mulps
815  ;CHECK:       mulps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
816  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
817  %2 = fmul <4 x float> %a0, %a1
818  ret <4 x float> %2
819}
820
; Scalar double multiplication (fmul double).
; The test expects mulsd with a stack-slot memory operand and an 8-byte
; folded reload.
821define double @stack_fold_mulsd(double %a0, double %a1) {
822  ;CHECK-LABEL: stack_fold_mulsd
823  ;CHECK:       mulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
824  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
825  %2 = fmul double %a0, %a1
826  ret double %2
827}
828
; Intrinsic form of scalar double multiplication (@llvm.x86.sse2.mul.sd).
; Operands are <2 x double> vectors, so the expected folded reload for mulsd
; is 16 bytes.
829define <2 x double> @stack_fold_mulsd_int(<2 x double> %a0, <2 x double> %a1) {
830  ;CHECK-LABEL: stack_fold_mulsd_int
831  ;CHECK:       mulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
832  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
833  %2 = call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a0, <2 x double> %a1)
834  ret <2 x double> %2
835}
836declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>) nounwind readnone
837
; Scalar float multiplication (fmul float).
; The test expects mulss with a stack-slot memory operand and a 4-byte folded
; reload.
838define float @stack_fold_mulss(float %a0, float %a1) {
839  ;CHECK-LABEL: stack_fold_mulss
840  ;CHECK:       mulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
841  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
842  %2 = fmul float %a0, %a1
843  ret float %2
844}
845
; Intrinsic form of scalar float multiplication (@llvm.x86.sse.mul.ss).
; Operands are <4 x float> vectors; the expected folded reload for mulss is
; 16 bytes.
846define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) {
847  ;CHECK-LABEL: stack_fold_mulss_int
848  ;CHECK:       mulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
849  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
850  %2 = call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a0, <4 x float> %a1)
851  ret <4 x float> %2
852}
853declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind readnone
854
; Bitwise OR of two <2 x double> values, expressed by bitcasting both to
; <2 x i64>, or-ing, and bitcasting back. The test expects the OR to be
; emitted as orpd with a stack-slot memory operand (16-byte folded reload).
; The trailing fadd with zero exists, per the inline note, to force the
; execution domain.
855define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) {
856  ;CHECK-LABEL: stack_fold_orpd
857  ;CHECK:       orpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
858  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
859  %2 = bitcast <2 x double> %a0 to <2 x i64>
860  %3 = bitcast <2 x double> %a1 to <2 x i64>
861  %4 = or <2 x i64> %2, %3
862  %5 = bitcast <2 x i64> %4 to <2 x double>
863  ; fadd forces execution domain
864  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
865  ret <2 x double> %6
866}
867
; Bitwise OR of two <4 x float> values, expressed by bitcasting both to
; <2 x i64>, or-ing, and bitcasting back. The test expects the OR to be
; emitted as orps with a stack-slot memory operand (16-byte folded reload).
; The trailing fadd with zero exists, per the inline note, to force the
; execution domain.
868define <4 x float> @stack_fold_orps(<4 x float> %a0, <4 x float> %a1) {
869  ;CHECK-LABEL: stack_fold_orps
870  ;CHECK:       orps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
871  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
872  %2 = bitcast <4 x float> %a0 to <2 x i64>
873  %3 = bitcast <4 x float> %a1 to <2 x i64>
874  %4 = or <2 x i64> %2, %3
875  %5 = bitcast <2 x i64> %4 to <4 x float>
876  ; fadd forces execution domain
877  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
878  ret <4 x float> %6
879}
880
881; TODO stack_fold_rcpps
882
; Packed float reciprocal via @llvm.x86.sse.rcp.ps.
; The "nop" asm clobbers xmm1-xmm15 and flags; the test expects rcpps to read
; its source from the stack as a 16-byte folded reload.
883define <4 x float> @stack_fold_rcpps_int(<4 x float> %a0) {
884  ;CHECK-LABEL: stack_fold_rcpps_int
885  ;CHECK:       rcpps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
886  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
887  %2 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
888  ret <4 x float> %2
889}
890declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
891
892; TODO stack_fold_rcpss
893; TODO stack_fold_rcpss_int
894
; Packed double rounding via @llvm.x86.sse41.round.pd with immediate 7.
; The test expects roundpd $7 reading its source from the stack (16-byte
; folded reload).
895define <2 x double> @stack_fold_roundpd(<2 x double> %a0) {
896  ;CHECK-LABEL: stack_fold_roundpd
897  ;CHECK:       roundpd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
898  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
899  %2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7)
900  ret <2 x double> %2
901}
902declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
903
; Packed float rounding via @llvm.x86.sse41.round.ps with immediate 7.
; The test expects roundps $7 reading its source from the stack (16-byte
; folded reload).
904define <4 x float> @stack_fold_roundps(<4 x float> %a0) {
905  ;CHECK-LABEL: stack_fold_roundps
906  ;CHECK:       roundps $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
907  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
908  %2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7)
909  ret <4 x float> %2
910}
911declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
912
; Scalar double floor via the generic @llvm.floor.f64 intrinsic.
; The test expects it to be emitted as roundsd with immediate 9 and an 8-byte
; folded reload from the stack. The function carries the optsize attribute.
; NOTE(review): optsize is presumably required for the load to be folded into
; this scalar unary instruction -- confirm before removing it.
913define double @stack_fold_roundsd(double %a0) optsize {
914  ;CHECK-LABEL: stack_fold_roundsd
915  ;CHECK:       roundsd $9, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
916  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
917  %2 = call double @llvm.floor.f64(double %a0)
918  ret double %2
919}
920declare double @llvm.floor.f64(double) nounwind readnone
921
922; TODO stack_fold_roundsd_int
923declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
924
; Scalar float floor via the generic @llvm.floor.f32 intrinsic.
; The test expects roundss with immediate 9 and a 4-byte folded reload. This
; variant uses the minsize attribute where the double version uses optsize.
; NOTE(review): a size attribute is presumably required for the load to be
; folded into this scalar unary instruction -- confirm before removing it.
925define float @stack_fold_roundss(float %a0) minsize {
926  ;CHECK-LABEL: stack_fold_roundss
927  ;CHECK:       roundss $9, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
928  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
929  %2 = call float @llvm.floor.f32(float %a0)
930  ret float %2
931}
932declare float @llvm.floor.f32(float) nounwind readnone
933
934; TODO stack_fold_roundss_int
935declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
936
937; TODO stack_fold_rsqrtps
938
; Packed float reciprocal square root via @llvm.x86.sse.rsqrt.ps.
; The test expects rsqrtps to read its source from the stack as a 16-byte
; folded reload; the "nop" asm clobbers xmm1-xmm15 and flags.
939define <4 x float> @stack_fold_rsqrtps_int(<4 x float> %a0) {
940  ;CHECK-LABEL: stack_fold_rsqrtps_int
941  ;CHECK:       rsqrtps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
942  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
943  %2 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
944  ret <4 x float> %2
945}
946declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
947
948; TODO stack_fold_rsqrtss
949; TODO stack_fold_rsqrtss_int
950
; Two-input <2 x double> shuffle with mask <1,2>: element 1 of %a0 followed
; by element 0 of %a1. The test expects shufpd with immediate 1 and a
; stack-slot memory operand (16-byte folded reload).
951define <2 x double> @stack_fold_shufpd(<2 x double> %a0, <2 x double> %a1) {
952  ;CHECK-LABEL: stack_fold_shufpd
953  ;CHECK:       shufpd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
954  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
955  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 2>
956  ret <2 x double> %2
957}
958
; Two-input <4 x float> shuffle with mask <0,2,4,7>: elements 0 and 2 of %a0
; followed by elements 0 and 3 of %a1. The test expects shufps with immediate
; 200 and a stack-slot memory operand (16-byte folded reload).
959define <4 x float> @stack_fold_shufps(<4 x float> %a0, <4 x float> %a1) {
960  ;CHECK-LABEL: stack_fold_shufps
961  ;CHECK:       shufps $200, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
962  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
963  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 2, i32 4, i32 7>
964  ret <4 x float> %2
965}
966
; Packed double square root via @llvm.x86.sse2.sqrt.pd.
; The test expects sqrtpd to read its source from the stack as a 16-byte
; folded reload; the "nop" asm clobbers xmm1-xmm15 and flags.
967define <2 x double> @stack_fold_sqrtpd(<2 x double> %a0) {
968  ;CHECK-LABEL: stack_fold_sqrtpd
969  ;CHECK:       sqrtpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
970  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
971  %2 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0)
972  ret <2 x double> %2
973}
974declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
975
; Packed float square root via @llvm.x86.sse.sqrt.ps.
; The test expects sqrtps to read its source from the stack as a 16-byte
; folded reload.
976define <4 x float> @stack_fold_sqrtps(<4 x float> %a0) {
977  ;CHECK-LABEL: stack_fold_sqrtps
978  ;CHECK:       sqrtps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
979  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
980  %2 = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0)
981  ret <4 x float> %2
982}
983declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
984
; Scalar double square root via the generic @llvm.sqrt.f64 intrinsic.
; The test expects sqrtsd with an 8-byte folded reload from the stack. The
; function carries the optsize attribute.
; NOTE(review): optsize is presumably required for the load to be folded into
; this scalar unary instruction -- confirm before removing it.
985define double @stack_fold_sqrtsd(double %a0) optsize {
986  ;CHECK-LABEL: stack_fold_sqrtsd
987  ;CHECK:       sqrtsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
988  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
989  %2 = call double @llvm.sqrt.f64(double %a0)
990  ret double %2
991}
992declare double @llvm.sqrt.f64(double) nounwind readnone
993
994; TODO stack_fold_sqrtsd_int
995declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
996
; Scalar float square root via the generic @llvm.sqrt.f32 intrinsic.
; The test expects sqrtss with a 4-byte folded reload. This variant uses the
; minsize attribute where the double version uses optsize.
; NOTE(review): a size attribute is presumably required for the load to be
; folded into this scalar unary instruction -- confirm before removing it.
997define float @stack_fold_sqrtss(float %a0) minsize {
998  ;CHECK-LABEL: stack_fold_sqrtss
999  ;CHECK:       sqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
1000  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1001  %2 = call float @llvm.sqrt.f32(float %a0)
1002  ret float %2
1003}
1004declare float @llvm.sqrt.f32(float) nounwind readnone
1005
1006; TODO stack_fold_sqrtss_int
1007declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
1008
; Packed double subtraction (fsub <2 x double>).
; The "nop" asm clobbers xmm2-xmm15 and flags; the test expects subpd with a
; stack-slot memory operand (16-byte folded reload).
1009define <2 x double> @stack_fold_subpd(<2 x double> %a0, <2 x double> %a1) {
1010  ;CHECK-LABEL: stack_fold_subpd
1011  ;CHECK:       subpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
1012  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1013  %2 = fsub <2 x double> %a0, %a1
1014  ret <2 x double> %2
1015}
1016
; Packed float subtraction (fsub <4 x float>).
; The test expects subps with a stack-slot memory operand (16-byte folded
; reload).
1017define <4 x float> @stack_fold_subps(<4 x float> %a0, <4 x float> %a1) {
1018  ;CHECK-LABEL: stack_fold_subps
1019  ;CHECK:       subps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
1020  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1021  %2 = fsub <4 x float> %a0, %a1
1022  ret <4 x float> %2
1023}
1024
; Scalar double subtraction (fsub double).
; The test expects subsd with a stack-slot memory operand and an 8-byte
; folded reload.
1025define double @stack_fold_subsd(double %a0, double %a1) {
1026  ;CHECK-LABEL: stack_fold_subsd
1027  ;CHECK:       subsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
1028  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1029  %2 = fsub double %a0, %a1
1030  ret double %2
1031}
1032
; Intrinsic form of scalar double subtraction (@llvm.x86.sse2.sub.sd).
; Operands are <2 x double> vectors, so the expected folded reload for subsd
; is 16 bytes.
1033define <2 x double> @stack_fold_subsd_int(<2 x double> %a0, <2 x double> %a1) {
1034  ;CHECK-LABEL: stack_fold_subsd_int
1035  ;CHECK:       subsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
1036  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1037  %2 = call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a0, <2 x double> %a1)
1038  ret <2 x double> %2
1039}
1040declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>) nounwind readnone
1041
; Scalar float subtraction (fsub float).
; The test expects subss with a stack-slot memory operand and a 4-byte folded
; reload.
1042define float @stack_fold_subss(float %a0, float %a1) {
1043  ;CHECK-LABEL: stack_fold_subss
1044  ;CHECK:       subss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
1045  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1046  %2 = fsub float %a0, %a1
1047  ret float %2
1048}
1049
; Intrinsic form of scalar float subtraction (@llvm.x86.sse.sub.ss).
; Operands are <4 x float> vectors; the expected folded reload for subss is
; 16 bytes.
1050define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) {
1051  ;CHECK-LABEL: stack_fold_subss_int
1052  ;CHECK:       subss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
1053  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1054  %2 = call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a0, <4 x float> %a1)
1055  ret <4 x float> %2
1056}
1057declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind readnone
1058
; Scalar double compare written as generic IR: fcmp ueq selects between the
; i32 constants 1 and -1. The test expects the comparison to be emitted as
; ucomisd with a stack-slot memory operand (8-byte folded reload).
1059define i32 @stack_fold_ucomisd(double %a0, double %a1) {
1060  ;CHECK-LABEL: stack_fold_ucomisd
1061  ;CHECK:       ucomisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
1062  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1063  %2 = fcmp ueq double %a0, %a1
1064  %3 = select i1 %2, i32 1, i32 -1
1065  ret i32 %3
1066}
1067
; Intrinsic form of the scalar double compare (@llvm.x86.sse2.ucomieq.sd),
; returning the intrinsic's i32 result directly. Operands are <2 x double>
; vectors, so the expected folded reload for ucomisd is 16 bytes.
1068define i32 @stack_fold_ucomisd_int(<2 x double> %a0, <2 x double> %a1) {
1069  ;CHECK-LABEL: stack_fold_ucomisd_int
1070  ;CHECK:       ucomisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
1071  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1072  %2 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1)
1073  ret i32 %2
1074}
1075declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readnone
1076
; Scalar float comparison: an unordered-or-equal fcmp whose i1 result picks
; between the constants 1 and -1.
; The inline-asm nop has one "=x" output and clobbers xmm2-xmm15 and flags; per
; the file header this is meant to force a spill so the reload can be folded
; into the instruction. The expected output is a ucomiss reading an
; %rsp-relative stack slot annotated as a 4-byte folded reload.
1077define i32 @stack_fold_ucomiss(float %a0, float %a1) {
1078  ;CHECK-LABEL: stack_fold_ucomiss
1079  ;CHECK:       ucomiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  ; %1 is never read; the call is kept only for its side effects and clobbers.
1080  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1081  %2 = fcmp ueq float %a0, %a1
1082  %3 = select i1 %2, i32 1, i32 -1
1083  ret i32 %3
1084}
1085
; Intrinsic form of the scalar float comparison: passes <4 x float> vectors to
; @llvm.x86.sse.ucomieq.ss and returns its i32 result unchanged.
; The inline-asm nop has one "=x" output and clobbers xmm2-xmm15 and flags; per
; the file header this is meant to force a spill so the reload can be folded
; into the instruction. Because the operands are full vectors, the expected
; ucomiss is annotated as a 16-byte folded reload.
1086define i32 @stack_fold_ucomiss_int(<4 x float> %a0, <4 x float> %a1) {
1087  ;CHECK-LABEL: stack_fold_ucomiss_int
1088  ;CHECK:       ucomiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  ; %1 is never read; the call is kept only for its side effects and clobbers.
1089  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1090  %2 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
1091  ret i32 %2
1092}
; Intrinsic used by stack_fold_ucomiss_int above.
1093declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
1094
; High-element unpack of two <2 x double> vectors, expressed as a shuffle.
; The inline-asm nop has one "=x" output and clobbers xmm2-xmm15 and flags; per
; the file header this is meant to force a spill so the reload can be folded
; into the instruction. The expected output is an unpckhpd reading an
; %rsp-relative stack slot annotated as a 16-byte folded reload.
1095define <2 x double> @stack_fold_unpckhpd(<2 x double> %a0, <2 x double> %a1) {
1096  ;CHECK-LABEL: stack_fold_unpckhpd
1097  ;CHECK:       unpckhpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  ; %1 is never read; the call is kept only for its side effects and clobbers.
1098  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  ; Mask <1, 3> selects element 1 of %a0 followed by element 1 of %a1.
1099  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
1100  ; fadd forces execution domain
1101  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
1102  ret <2 x double> %3
1103}
1104
; High-half interleave of two <4 x float> vectors, expressed as a shuffle.
; The inline-asm nop has one "=x" output and clobbers xmm2-xmm15 and flags; per
; the file header this is meant to force a spill so the reload can be folded
; into the instruction. The expected output is an unpckhps reading an
; %rsp-relative stack slot annotated as a 16-byte folded reload.
1105define <4 x float> @stack_fold_unpckhps(<4 x float> %a0, <4 x float> %a1) {
1106  ;CHECK-LABEL: stack_fold_unpckhps
1107  ;CHECK:       unpckhps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  ; %1 is never read; the call is kept only for its side effects and clobbers.
1108  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  ; Mask <2, 6, 3, 7> yields a0[2], a1[2], a0[3], a1[3].
1109  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
1110  ; fadd forces execution domain
1111  %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
1112  ret <4 x float> %3
1113}
1114
; Low-element unpack of two <2 x double> vectors, expressed as a shuffle.
; The inline-asm nop has one "=x" output and clobbers xmm2-xmm15 and flags; per
; the file header this is meant to force a spill so the reload can be folded
; into the instruction. The expected output is an unpcklpd reading an
; %rsp-relative stack slot annotated as a 16-byte folded reload.
1115define <2 x double> @stack_fold_unpcklpd(<2 x double> %a0, <2 x double> %a1) {
1116  ;CHECK-LABEL: stack_fold_unpcklpd
1117  ;CHECK:       unpcklpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  ; %1 is never read; the call is kept only for its side effects and clobbers.
1118  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  ; Mask <0, 2> selects element 0 of %a0 followed by element 0 of %a1.
1119  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 2>
1120  ; fadd forces execution domain
1121  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
1122  ret <2 x double> %3
1123}
1124
; Low-half interleave of two <4 x float> vectors, expressed as a shuffle.
; The inline-asm nop has one "=x" output and clobbers xmm2-xmm15 and flags; per
; the file header this is meant to force a spill so the reload can be folded
; into the instruction. The expected output is an unpcklps reading an
; %rsp-relative stack slot annotated as a 16-byte folded reload.
1125define <4 x float> @stack_fold_unpcklps(<4 x float> %a0, <4 x float> %a1) {
1126  ;CHECK-LABEL: stack_fold_unpcklps
1127  ;CHECK:       unpcklps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  ; %1 is never read; the call is kept only for its side effects and clobbers.
1128  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  ; Mask <0, 4, 1, 5> yields a0[0], a1[0], a0[1], a1[1].
1129  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
1130  ; fadd forces execution domain
1131  %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
1132  ret <4 x float> %3
1133}
1134
; Bitwise XOR of two <2 x double> vectors, performed on their <2 x i64> bit
; patterns and cast back to floating point.
; The inline-asm nop has one "=x" output and clobbers xmm2-xmm15 and flags; per
; the file header this is meant to force a spill so the reload can be folded
; into the instruction. The expected output is an xorpd reading an
; %rsp-relative stack slot annotated as a 16-byte folded reload.
1135define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) {
1136  ;CHECK-LABEL: stack_fold_xorpd
1137  ;CHECK:       xorpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  ; %1 is never read; the call is kept only for its side effects and clobbers.
1138  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  ; Reinterpret both operands as integers so the xor instruction can be used.
1139  %2 = bitcast <2 x double> %a0 to <2 x i64>
1140  %3 = bitcast <2 x double> %a1 to <2 x i64>
1141  %4 = xor <2 x i64> %2, %3
1142  %5 = bitcast <2 x i64> %4 to <2 x double>
1143  ; fadd forces execution domain
1144  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
1145  ret <2 x double> %6
1146}
1147
; Bitwise XOR of two <4 x float> vectors, performed on their <2 x i64> bit
; patterns and cast back to floating point.
; The inline-asm nop has one "=x" output and clobbers xmm2-xmm15 and flags; per
; the file header this is meant to force a spill so the reload can be folded
; into the instruction. The expected output is an xorps reading an
; %rsp-relative stack slot annotated as a 16-byte folded reload.
1148define <4 x float> @stack_fold_xorps(<4 x float> %a0, <4 x float> %a1) {
1149  ;CHECK-LABEL: stack_fold_xorps
1150  ;CHECK:       xorps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  ; %1 is never read; the call is kept only for its side effects and clobbers.
1151  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  ; Reinterpret both operands as integers so the xor instruction can be used.
1152  %2 = bitcast <4 x float> %a0 to <2 x i64>
1153  %3 = bitcast <4 x float> %a1 to <2 x i64>
1154  %4 = xor <2 x i64> %2, %3
1155  %5 = bitcast <2 x i64> %4 to <4 x float>
1156  ; fadd forces execution domain
1157  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
1158  ret <4 x float> %6
1159}
1160