; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin9 -mattr=+sse,+sse2,+sse4.1 | FileCheck %s --check-prefix=X32
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin9 -mattr=+sse,+sse2,+sse4.1 | FileCheck %s --check-prefix=X64
; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin9 -mattr=+avx | FileCheck %s --check-prefix=X32_AVX --check-prefix=X32_AVX1
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin9 -mattr=+avx | FileCheck %s --check-prefix=X64_AVX --check-prefix=X64_AVX1
; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin9 -mattr=+avx512f | FileCheck %s --check-prefix=X32_AVX --check-prefix=X32_AVX512
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin9 -mattr=+avx512f | FileCheck %s --check-prefix=X64_AVX --check-prefix=X64_AVX512

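; test1: the scalar input is widened with insertelement, then run through a
; chain of SSE scalar intrinsics. The constant-pool operands should fold
; straight into the arithmetic (addss/mulss/minss against LCPI0_* on X32,
; RIP-relative loads on X64) instead of being loaded into registers first;
; the sub-by-1.0 surfaces as an addss of the negated constant.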
define i16 @test1(float %f) nounwind {
; X32-LABEL: test1:
; X32:       ## %bb.0:
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    addss LCPI0_0, %xmm0
; X32-NEXT:    mulss LCPI0_1, %xmm0
; X32-NEXT:    xorps %xmm1, %xmm1
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X32-NEXT:    minss LCPI0_2, %xmm0
; X32-NEXT:    maxss %xmm1, %xmm0
; X32-NEXT:    cvttss2si %xmm0, %eax
; X32-NEXT:    ## kill: def $ax killed $ax killed $eax
; X32-NEXT:    retl
;
; X64-LABEL: test1:
; X64:       ## %bb.0:
; X64-NEXT:    addss {{.*}}(%rip), %xmm0
; X64-NEXT:    mulss {{.*}}(%rip), %xmm0
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64-NEXT:    minss {{.*}}(%rip), %xmm0
; X64-NEXT:    maxss %xmm1, %xmm0
; X64-NEXT:    cvttss2si %xmm0, %eax
; X64-NEXT:    ## kill: def $ax killed $ax killed $eax
; X64-NEXT:    retq
;
; X32_AVX1-LABEL: test1:
; X32_AVX1:       ## %bb.0:
; X32_AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32_AVX1-NEXT:    vaddss LCPI0_0, %xmm0, %xmm0
; X32_AVX1-NEXT:    vmulss LCPI0_1, %xmm0, %xmm0
; X32_AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32_AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X32_AVX1-NEXT:    vminss LCPI0_2, %xmm0, %xmm0
; X32_AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; X32_AVX1-NEXT:    vcvttss2si %xmm0, %eax
; X32_AVX1-NEXT:    ## kill: def $ax killed $ax killed $eax
; X32_AVX1-NEXT:    retl
;
; X64_AVX1-LABEL: test1:
; X64_AVX1:       ## %bb.0:
; X64_AVX1-NEXT:    vaddss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX1-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64_AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64_AVX1-NEXT:    vminss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; X64_AVX1-NEXT:    vcvttss2si %xmm0, %eax
; X64_AVX1-NEXT:    ## kill: def $ax killed $ax killed $eax
; X64_AVX1-NEXT:    retq
;
; X32_AVX512-LABEL: test1:
; X32_AVX512:       ## %bb.0:
; X32_AVX512-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32_AVX512-NEXT:    vaddss LCPI0_0, %xmm0, %xmm0
; X32_AVX512-NEXT:    vmulss LCPI0_1, %xmm0, %xmm0
; X32_AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32_AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X32_AVX512-NEXT:    vminss LCPI0_2, %xmm0, %xmm0
; X32_AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32_AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; X32_AVX512-NEXT:    vcvttss2si %xmm0, %eax
; X32_AVX512-NEXT:    ## kill: def $ax killed $ax killed $eax
; X32_AVX512-NEXT:    retl
;
; X64_AVX512-LABEL: test1:
; X64_AVX512:       ## %bb.0:
; X64_AVX512-NEXT:    vaddss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64_AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64_AVX512-NEXT:    vminss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64_AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; X64_AVX512-NEXT:    vcvttss2si %xmm0, %eax
; X64_AVX512-NEXT:    ## kill: def $ax killed $ax killed $eax
; X64_AVX512-NEXT:    retq
  %tmp = insertelement <4 x float> undef, float %f, i32 0		; <<4 x float>> [#uses=1]
  %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1		; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2		; <<4 x float>> [#uses=1]
  %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3		; <<4 x float>> [#uses=1]
  %tmp28 = tail call <4 x float> @llvm.x86.sse.sub.ss( <4 x float> %tmp12, <4 x float> < float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )		; <<4 x float>> [#uses=1]
  %tmp37 = tail call <4 x float> @llvm.x86.sse.mul.ss( <4 x float> %tmp28, <4 x float> < float 5.000000e-01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )		; <<4 x float>> [#uses=1]
  %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp37, <4 x float> < float 6.553500e+04, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )		; <<4 x float>> [#uses=1]
  %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> zeroinitializer )		; <<4 x float>> [#uses=1]
  %tmp.upgrd.1 = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 )		; <i32> [#uses=1]
  %tmp69 = trunc i32 %tmp.upgrd.1 to i16		; <i16> [#uses=1]
  ret i16 %tmp69
}

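; test2: same computation as test1, but the sub/mul are plain scalar IR, so
; only lane 0 of the vector is defined and no xorps/blendps is needed to zero
; the upper lanes before the min/max intrinsics.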
define i16 @test2(float %f) nounwind {
; X32-LABEL: test2:
; X32:       ## %bb.0:
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    addss LCPI1_0, %xmm0
; X32-NEXT:    mulss LCPI1_1, %xmm0
; X32-NEXT:    minss LCPI1_2, %xmm0
; X32-NEXT:    xorps %xmm1, %xmm1
; X32-NEXT:    maxss %xmm1, %xmm0
; X32-NEXT:    cvttss2si %xmm0, %eax
; X32-NEXT:    ## kill: def $ax killed $ax killed $eax
; X32-NEXT:    retl
;
; X64-LABEL: test2:
; X64:       ## %bb.0:
; X64-NEXT:    addss {{.*}}(%rip), %xmm0
; X64-NEXT:    mulss {{.*}}(%rip), %xmm0
; X64-NEXT:    minss {{.*}}(%rip), %xmm0
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    maxss %xmm1, %xmm0
; X64-NEXT:    cvttss2si %xmm0, %eax
; X64-NEXT:    ## kill: def $ax killed $ax killed $eax
; X64-NEXT:    retq
;
; X32_AVX-LABEL: test2:
; X32_AVX:       ## %bb.0:
; X32_AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32_AVX-NEXT:    vaddss LCPI1_0, %xmm0, %xmm0
; X32_AVX-NEXT:    vmulss LCPI1_1, %xmm0, %xmm0
; X32_AVX-NEXT:    vminss LCPI1_2, %xmm0, %xmm0
; X32_AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32_AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; X32_AVX-NEXT:    vcvttss2si %xmm0, %eax
; X32_AVX-NEXT:    ## kill: def $ax killed $ax killed $eax
; X32_AVX-NEXT:    retl
;
; X64_AVX-LABEL: test2:
; X64_AVX:       ## %bb.0:
; X64_AVX-NEXT:    vaddss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX-NEXT:    vminss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64_AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; X64_AVX-NEXT:    vcvttss2si %xmm0, %eax
; X64_AVX-NEXT:    ## kill: def $ax killed $ax killed $eax
; X64_AVX-NEXT:    retq
  %tmp28 = fsub float %f, 1.000000e+00		; <float> [#uses=1]
  %tmp37 = fmul float %tmp28, 5.000000e-01		; <float> [#uses=1]
  %tmp375 = insertelement <4 x float> undef, float %tmp37, i32 0		; <<4 x float>> [#uses=1]
  %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp375, <4 x float> < float 6.553500e+04, float undef, float undef, float undef > )		; <<4 x float>> [#uses=1]
  %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> < float 0.000000e+00, float undef, float undef, float undef > )		; <<4 x float>> [#uses=1]
  %tmp = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 )		; <i32> [#uses=1]
  %tmp69 = trunc i32 %tmp to i16		; <i16> [#uses=1]
  ret i16 %tmp69
}

declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>)

declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>)

declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>)

declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>)

declare i32 @llvm.x86.sse.cvttss2si(<4 x float>)

declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32)

declare <4 x float> @f()

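; test3: the scalar load should fold directly into roundss as its memory
; operand.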
define <4 x float> @test3(<4 x float> %A, float* %b, i32 %C) nounwind {
; X32-LABEL: test3:
; X32:       ## %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    roundss $4, (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test3:
; X64:       ## %bb.0:
; X64-NEXT:    roundss $4, (%rdi), %xmm0
; X64-NEXT:    retq
;
; X32_AVX-LABEL: test3:
; X32_AVX:       ## %bb.0:
; X32_AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32_AVX-NEXT:    vroundss $4, (%eax), %xmm0, %xmm0
; X32_AVX-NEXT:    retl
;
; X64_AVX-LABEL: test3:
; X64_AVX:       ## %bb.0:
; X64_AVX-NEXT:    vroundss $4, (%rdi), %xmm0, %xmm0
; X64_AVX-NEXT:    retq
  %a = load float, float* %b
  %B = insertelement <4 x float> undef, float %a, i32 0
  %X = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %A, <4 x float> %B, i32 4)
  ret <4 x float> %X
}

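; test4: the loaded value is spilled across the call to @f, and the reload
; should be folded into roundss rather than reloaded into a register first
; (see the "16-byte Folded Reload" annotations below).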
define <4 x float> @test4(<4 x float> %A, float* %b, i32 %C) nounwind {
; X32-LABEL: test4:
; X32:       ## %bb.0:
; X32-NEXT:    subl $28, %esp
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    movaps %xmm0, (%esp) ## 16-byte Spill
; X32-NEXT:    calll _f
; X32-NEXT:    roundss $4, (%esp), %xmm0 ## 16-byte Folded Reload
; X32-NEXT:    addl $28, %esp
; X32-NEXT:    retl
;
; X64-LABEL: test4:
; X64:       ## %bb.0:
; X64-NEXT:    subq $24, %rsp
; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; X64-NEXT:    callq _f
; X64-NEXT:    roundss $4, (%rsp), %xmm0 ## 16-byte Folded Reload
; X64-NEXT:    addq $24, %rsp
; X64-NEXT:    retq
;
; X32_AVX-LABEL: test4:
; X32_AVX:       ## %bb.0:
; X32_AVX-NEXT:    subl $28, %esp
; X32_AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32_AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32_AVX-NEXT:    vmovaps %xmm0, (%esp) ## 16-byte Spill
; X32_AVX-NEXT:    calll _f
; X32_AVX-NEXT:    vroundss $4, (%esp), %xmm0, %xmm0 ## 16-byte Folded Reload
; X32_AVX-NEXT:    addl $28, %esp
; X32_AVX-NEXT:    retl
;
; X64_AVX-LABEL: test4:
; X64_AVX:       ## %bb.0:
; X64_AVX-NEXT:    subq $24, %rsp
; X64_AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64_AVX-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
; X64_AVX-NEXT:    callq _f
; X64_AVX-NEXT:    vroundss $4, (%rsp), %xmm0, %xmm0 ## 16-byte Folded Reload
; X64_AVX-NEXT:    addq $24, %rsp
; X64_AVX-NEXT:    retq
  %a = load float, float* %b
  %B = insertelement <4 x float> undef, float %a, i32 0
  %q = call <4 x float> @f()
  %X = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %q, <4 x float> %B, i32 4)
  ret <4 x float> %X
}

; PR13576
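; cvtsi2sd with constant operands should constant-fold away completely:
; sitofp(128) replaces element 0 of <456.987, 123.321>, so the result is
; materialized as the constant [1.28E+2,1.23321E+2] and no cvtsi2sd is
; emitted.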
define <2 x double> @test5() nounwind uwtable readnone noinline {
; X32-LABEL: test5:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movaps {{.*#+}} xmm0 = [1.28E+2,1.23321E+2]
; X32-NEXT:    retl
;
; X64-LABEL: test5:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    movaps {{.*#+}} xmm0 = [1.28E+2,1.23321E+2]
; X64-NEXT:    retq
;
; X32_AVX-LABEL: test5:
; X32_AVX:       ## %bb.0: ## %entry
; X32_AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1.28E+2,1.23321E+2]
; X32_AVX-NEXT:    retl
;
; X64_AVX-LABEL: test5:
; X64_AVX:       ## %bb.0: ## %entry
; X64_AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1.28E+2,1.23321E+2]
; X64_AVX-NEXT:    retq
entry:
  %0 = tail call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> <double 4.569870e+02, double 1.233210e+02>, i32 128) nounwind readnone
  ret <2 x double> %0
}

declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone

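; minss_fold: the float load should fold into minss even though it is only
; align 1; scalar SSE memory operands carry no alignment requirement, so the
; fold is safe.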
define <4 x float> @minss_fold(float* %x, <4 x float> %y) {
; X32-LABEL: minss_fold:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    minss (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: minss_fold:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    minss (%rdi), %xmm0
; X64-NEXT:    retq
;
; X32_AVX-LABEL: minss_fold:
; X32_AVX:       ## %bb.0: ## %entry
; X32_AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32_AVX-NEXT:    vminss (%eax), %xmm0, %xmm0
; X32_AVX-NEXT:    retl
;
; X64_AVX-LABEL: minss_fold:
; X64_AVX:       ## %bb.0: ## %entry
; X64_AVX-NEXT:    vminss (%rdi), %xmm0, %xmm0
; X64_AVX-NEXT:    retq
entry:
  %0 = load float, float* %x, align 1
  %vecinit.i = insertelement <4 x float> undef, float %0, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float 0.000000e+00, i32 1
  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float 0.000000e+00, i32 2
  %vecinit4.i = insertelement <4 x float> %vecinit3.i, float 0.000000e+00, i32 3
  %1 = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %y, <4 x float> %vecinit4.i)
  ret <4 x float> %1
}

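; maxss_fold: same fold as minss_fold, for maxss.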
define <4 x float> @maxss_fold(float* %x, <4 x float> %y) {
; X32-LABEL: maxss_fold:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    maxss (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: maxss_fold:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    maxss (%rdi), %xmm0
; X64-NEXT:    retq
;
; X32_AVX-LABEL: maxss_fold:
; X32_AVX:       ## %bb.0: ## %entry
; X32_AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32_AVX-NEXT:    vmaxss (%eax), %xmm0, %xmm0
; X32_AVX-NEXT:    retl
;
; X64_AVX-LABEL: maxss_fold:
; X64_AVX:       ## %bb.0: ## %entry
; X64_AVX-NEXT:    vmaxss (%rdi), %xmm0, %xmm0
; X64_AVX-NEXT:    retq
entry:
  %0 = load float, float* %x, align 1
  %vecinit.i = insertelement <4 x float> undef, float %0, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float 0.000000e+00, i32 1
  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float 0.000000e+00, i32 2
  %vecinit4.i = insertelement <4 x float> %vecinit3.i, float 0.000000e+00, i32 3
  %1 = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %y, <4 x float> %vecinit4.i)
  ret <4 x float> %1
}

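; cmpss_fold: same fold again for cmpss; predicate 0 selects EQ, hence the
; expected cmpeqss with a memory operand.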
define <4 x float> @cmpss_fold(float* %x, <4 x float> %y) {
; X32-LABEL: cmpss_fold:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    cmpeqss (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: cmpss_fold:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    cmpeqss (%rdi), %xmm0
; X64-NEXT:    retq
;
; X32_AVX-LABEL: cmpss_fold:
; X32_AVX:       ## %bb.0: ## %entry
; X32_AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32_AVX-NEXT:    vcmpeqss (%eax), %xmm0, %xmm0
; X32_AVX-NEXT:    retl
;
; X64_AVX-LABEL: cmpss_fold:
; X64_AVX:       ## %bb.0: ## %entry
; X64_AVX-NEXT:    vcmpeqss (%rdi), %xmm0, %xmm0
; X64_AVX-NEXT:    retq
entry:
  %0 = load float, float* %x, align 1
  %vecinit.i = insertelement <4 x float> undef, float %0, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float 0.000000e+00, i32 1
  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float 0.000000e+00, i32 2
  %vecinit4.i = insertelement <4 x float> %vecinit3.i, float 0.000000e+00, i32 3
  %1 = tail call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %y, <4 x float> %vecinit4.i, i8 0)
  ret <4 x float> %1
}
declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone

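; double_fold: the loaded value has two uses, so it cannot be folded into
; either instruction; it should be loaded once with movss and kept in a
; register for both minss and maxss.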
define <4 x float> @double_fold(float* %x, <4 x float> %y) {
; X32-LABEL: double_fold:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    movaps %xmm0, %xmm2
; X32-NEXT:    minss %xmm1, %xmm2
; X32-NEXT:    maxss %xmm1, %xmm0
; X32-NEXT:    addps %xmm2, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: double_fold:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movaps %xmm0, %xmm2
; X64-NEXT:    minss %xmm1, %xmm2
; X64-NEXT:    maxss %xmm1, %xmm0
; X64-NEXT:    addps %xmm2, %xmm0
; X64-NEXT:    retq
;
; X32_AVX-LABEL: double_fold:
; X32_AVX:       ## %bb.0: ## %entry
; X32_AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32_AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32_AVX-NEXT:    vminss %xmm1, %xmm0, %xmm2
; X32_AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; X32_AVX-NEXT:    vaddps %xmm0, %xmm2, %xmm0
; X32_AVX-NEXT:    retl
;
; X64_AVX-LABEL: double_fold:
; X64_AVX:       ## %bb.0: ## %entry
; X64_AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64_AVX-NEXT:    vminss %xmm1, %xmm0, %xmm2
; X64_AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; X64_AVX-NEXT:    vaddps %xmm0, %xmm2, %xmm0
; X64_AVX-NEXT:    retq
entry:
  %0 = load float, float* %x, align 1
  %vecinit.i = insertelement <4 x float> undef, float %0, i32 0
  %1 = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %y, <4 x float> %vecinit.i)
  %2 = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %y, <4 x float> %vecinit.i)
  %3 = fadd <4 x float> %1, %2
  ret <4 x float> %3
}