1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; Tests for SSE2 and below, without SSE3+.
3; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=pentium4 -O3 | FileCheck %s
4
; test1: replace the low lane of a loaded <2 x double> with a scalar argument.
; The shuffle mask <2, 1> takes lane 0 from the vector built from %B and
; lane 1 from the vector loaded from %A; the result is stored to %r.
; The assertions expect the scalar to be merged with a single movlpd.
5define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind  {
6; CHECK-LABEL: test1:
7; CHECK:       ## BB#0:
8; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
9; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
10; CHECK-NEXT:    movapd (%ecx), %xmm0
11; CHECK-NEXT:    movlpd {{[0-9]+}}(%esp), %xmm0
12; CHECK-NEXT:    movapd %xmm0, (%eax)
13; CHECK-NEXT:    retl
14	%tmp3 = load <2 x double>, <2 x double>* %A, align 16
15	%tmp7 = insertelement <2 x double> undef, double %B, i32 0
16	%tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 >
17	store <2 x double> %tmp9, <2 x double>* %r, align 16
18	ret void
19}
20
; test2: replace the high lane of a loaded <2 x double> with a scalar argument.
; The shuffle mask <0, 2> keeps lane 0 of the vector loaded from %A and takes
; lane 1 from lane 0 of the vector built from %B; the result is stored to %r.
; The assertions expect the scalar to be merged with a single movhpd.
21define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind  {
22; CHECK-LABEL: test2:
23; CHECK:       ## BB#0:
24; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
25; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
26; CHECK-NEXT:    movapd (%ecx), %xmm0
27; CHECK-NEXT:    movhpd {{[0-9]+}}(%esp), %xmm0
28; CHECK-NEXT:    movapd %xmm0, (%eax)
29; CHECK-NEXT:    retl
30	%tmp3 = load <2 x double>, <2 x double>* %A, align 16
31	%tmp7 = insertelement <2 x double> undef, double %B, i32 0
32	%tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 >
33	store <2 x double> %tmp9, <2 x double>* %r, align 16
34	ret void
35}
36
37
; test3: interleave the low two lanes of *%A and *%B using scalar
; extractelement/insertelement chains, producing { A[0], B[0], A[1], B[1] },
; and store the result to %res.
; The assertions expect the whole chain to become one unpcklps whose second
; operand is read directly from memory.
38define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind {
39; CHECK-LABEL: test3:
40; CHECK:       ## BB#0:
41; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
42; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
43; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
44; CHECK-NEXT:    movaps (%edx), %xmm0
45; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
46; CHECK-NEXT:    movaps %xmm0, (%eax)
47; CHECK-NEXT:    retl
48	%tmp = load <4 x float>, <4 x float>* %B		; <<4 x float>> [#uses=2]
49	%tmp3 = load <4 x float>, <4 x float>* %A		; <<4 x float>> [#uses=2]
50	%tmp.upgrd.1 = extractelement <4 x float> %tmp3, i32 0		; <float> [#uses=1]
51	%tmp7 = extractelement <4 x float> %tmp, i32 0		; <float> [#uses=1]
52	%tmp8 = extractelement <4 x float> %tmp3, i32 1		; <float> [#uses=1]
53	%tmp9 = extractelement <4 x float> %tmp, i32 1		; <float> [#uses=1]
54	%tmp10 = insertelement <4 x float> undef, float %tmp.upgrd.1, i32 0		; <<4 x float>> [#uses=1]
55	%tmp11 = insertelement <4 x float> %tmp10, float %tmp7, i32 1		; <<4 x float>> [#uses=1]
56	%tmp12 = insertelement <4 x float> %tmp11, float %tmp8, i32 2		; <<4 x float>> [#uses=1]
57	%tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3		; <<4 x float>> [#uses=1]
58	store <4 x float> %tmp13, <4 x float>* %res
59	ret void
60}
61
; test4: shuffle %X against an undef second operand with mask <2, 6, 3, 7>.
; Mask indices 6 and 7 select lanes of the undef operand, so only result
; lanes 0 and 2 (X[2] and X[3]) are defined. The result is stored to %res.
; The assertions expect a single one-input shufps on xmm0.
62define void @test4(<4 x float> %X, <4 x float>* %res) nounwind {
63; CHECK-LABEL: test4:
64; CHECK:       ## BB#0:
65; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
66; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
67; CHECK-NEXT:    movaps %xmm0, (%eax)
68; CHECK-NEXT:    retl
69	%tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=1]
70	store <4 x float> %tmp5, <4 x float>* %res
71	ret void
72}
73
; test5: load a float through the pointer stored at %ptr, place it in lane 0
; of a <4 x float> whose other lanes are 0.0, then widen it twice:
;   1. as <16 x i8>, interleave its low 8 bytes with zero bytes;
;   2. as <8 x i16>, interleave the low 4 words of a zero vector with the
;      low 4 words of the previous result.
; The final value is returned as <4 x i32>.
; The assertions expect a scalar movss load, one pxor to materialise zero,
; then punpcklbw followed by punpcklwd.
74define <4 x i32> @test5(i8** %ptr) nounwind {
75; CHECK-LABEL: test5:
76; CHECK:       ## BB#0:
77; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
78; CHECK-NEXT:    movl (%eax), %eax
79; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
80; CHECK-NEXT:    pxor %xmm0, %xmm0
81; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
82; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
83; CHECK-NEXT:    retl
84	%tmp = load i8*, i8** %ptr		; <i8*> [#uses=1]
85	%tmp.upgrd.1 = bitcast i8* %tmp to float*		; <float*> [#uses=1]
86	%tmp.upgrd.2 = load float, float* %tmp.upgrd.1		; <float> [#uses=1]
87	%tmp.upgrd.3 = insertelement <4 x float> undef, float %tmp.upgrd.2, i32 0		; <<4 x float>> [#uses=1]
88	%tmp9 = insertelement <4 x float> %tmp.upgrd.3, float 0.000000e+00, i32 1		; <<4 x float>> [#uses=1]
89	%tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 2		; <<4 x float>> [#uses=1]
90	%tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 3		; <<4 x float>> [#uses=1]
91	%tmp21 = bitcast <4 x float> %tmp11 to <16 x i8>		; <<16 x i8>> [#uses=1]
92	%tmp22 = shufflevector <16 x i8> %tmp21, <16 x i8> zeroinitializer, <16 x i32> < i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23 >		; <<16 x i8>> [#uses=1]
93	%tmp31 = bitcast <16 x i8> %tmp22 to <8 x i16>		; <<8 x i16>> [#uses=1]
94	%tmp.upgrd.4 = shufflevector <8 x i16> zeroinitializer, <8 x i16> %tmp31, <8 x i32> < i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11 >		; <<8 x i16>> [#uses=1]
95	%tmp36 = bitcast <8 x i16> %tmp.upgrd.4 to <4 x i32>		; <<4 x i32>> [#uses=1]
96	ret <4 x i32> %tmp36
97}
98
; test6: shuffle a loaded <4 x float> against undef with mask <0, 5, 6, 7>.
; Only lane 0 comes from the loaded vector; indices 5-7 select lanes of the
; undef operand. The result is stored to %res.
; The assertions expect no shuffle instruction at all: just a load from %A
; and a store to %res.
99define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
100; CHECK-LABEL: test6:
101; CHECK:       ## BB#0:
102; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
103; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
104; CHECK-NEXT:    movaps (%ecx), %xmm0
105; CHECK-NEXT:    movaps %xmm0, (%eax)
106; CHECK-NEXT:    retl
107  %tmp1 = load <4 x float>, <4 x float>* %A            ; <<4 x float>> [#uses=1]
108  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 >          ; <<4 x float>> [#uses=1]
109  store <4 x float> %tmp2, <4 x float>* %res
110  ret void
111}
112
; test7: bitcast an all-zero <4 x i32> to <4 x float>, broadcast its lane 0
; (the shuffle mask is zeroinitializer, so every result lane picks index 0),
; and store the result to the null address.
; The body uses unnamed values (%1, %2).
; The assertions expect the value to be materialised with xorps and stored
; to absolute address 0, with no shuffle instruction.
113define void @test7() nounwind {
114; CHECK-LABEL: test7:
115; CHECK:       ## BB#0:
116; CHECK-NEXT:    xorps %xmm0, %xmm0
117; CHECK-NEXT:    movaps %xmm0, 0
118; CHECK-NEXT:    retl
119  bitcast <4 x i32> zeroinitializer to <4 x float>                ; <<4 x float>>:1 [#uses=1]
120  shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer         ; <<4 x float>>:2 [#uses=1]
121  store <4 x float> %2, <4 x float>* null
122  ret void
123}
124
125@x = external global [4 x i32]
126
; test8: load the four i32 elements of the external global @x one at a time,
; insert them into lanes 0-3 of a <4 x i32>, and return it as <2 x i64>.
; The assertions expect the four scalar loads to be merged into one movups
; through the global's non-lazy pointer.
127define <2 x i64> @test8() nounwind {
128; CHECK-LABEL: test8:
129; CHECK:       ## BB#0:
130; CHECK-NEXT:    movl L_x$non_lazy_ptr, %eax
131; CHECK-NEXT:    movups (%eax), %xmm0
132; CHECK-NEXT:    retl
133	%tmp = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 0)		; <i32> [#uses=1]
134	%tmp3 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 1)		; <i32> [#uses=1]
135	%tmp5 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 2)		; <i32> [#uses=1]
136	%tmp7 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 3)		; <i32> [#uses=1]
137	%tmp.upgrd.1 = insertelement <4 x i32> undef, i32 %tmp, i32 0		; <<4 x i32>> [#uses=1]
138	%tmp13 = insertelement <4 x i32> %tmp.upgrd.1, i32 %tmp3, i32 1		; <<4 x i32>> [#uses=1]
139	%tmp14 = insertelement <4 x i32> %tmp13, i32 %tmp5, i32 2		; <<4 x i32>> [#uses=1]
140	%tmp15 = insertelement <4 x i32> %tmp14, i32 %tmp7, i32 3		; <<4 x i32>> [#uses=1]
141	%tmp16 = bitcast <4 x i32> %tmp15 to <2 x i64>		; <<2 x i64>> [#uses=1]
142	ret <2 x i64> %tmp16
143}
144
; test9: build a <4 x float> from four float arguments that follow an unused
; i32 argument (%dummy).
; The assertions expect the four inserts to become one unaligned movups from
; the stack.
; NOTE(review): presumably %dummy exists to push the floats off a 16-byte
; boundary (compare test10, which expects movaps) -- confirm.
145define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) nounwind {
146; CHECK-LABEL: test9:
147; CHECK:       ## BB#0:
148; CHECK-NEXT:    movups {{[0-9]+}}(%esp), %xmm0
149; CHECK-NEXT:    retl
150	%tmp = insertelement <4 x float> undef, float %a, i32 0		; <<4 x float>> [#uses=1]
151	%tmp11 = insertelement <4 x float> %tmp, float %b, i32 1		; <<4 x float>> [#uses=1]
152	%tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2		; <<4 x float>> [#uses=1]
153	%tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3		; <<4 x float>> [#uses=1]
154	ret <4 x float> %tmp13
155}
156
; test10: build a <4 x float> from four float arguments, with no leading
; dummy argument.
; The assertions expect the four inserts to become one aligned movaps from
; the stack.
; NOTE(review): presumably relies on the incoming argument area being
; 16-byte aligned for this triple -- confirm.
157define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind {
158; CHECK-LABEL: test10:
159; CHECK:       ## BB#0:
160; CHECK-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
161; CHECK-NEXT:    retl
162	%tmp = insertelement <4 x float> undef, float %a, i32 0		; <<4 x float>> [#uses=1]
163	%tmp11 = insertelement <4 x float> %tmp, float %b, i32 1		; <<4 x float>> [#uses=1]
164	%tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2		; <<4 x float>> [#uses=1]
165	%tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3		; <<4 x float>> [#uses=1]
166	ret <4 x float> %tmp13
167}
168
; test11: build a <2 x double> from two double arguments (%a in lane 0, %b in
; lane 1).
; The assertions expect both inserts to become one aligned movaps from the
; stack.
; NOTE(review): presumably relies on the incoming argument area being
; 16-byte aligned for this triple -- confirm.
169define <2 x double> @test11(double %a, double %b) nounwind {
170; CHECK-LABEL: test11:
171; CHECK:       ## BB#0:
172; CHECK-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
173; CHECK-NEXT:    retl
174	%tmp = insertelement <2 x double> undef, double %a, i32 0		; <<2 x double>> [#uses=1]
175	%tmp7 = insertelement <2 x double> %tmp, double %b, i32 1		; <<2 x double>> [#uses=1]
176	ret <2 x double> %tmp7
177}
178
; test12: load a <4 x float> from the null address and derive two vectors:
;   %tmp2 = low two lanes of the load, high two lanes of a splat of 1.0
;           (mask <0, 1, 6, 7>);
;   %tmp3 = high two lanes of the load, then two lanes of zero
;           (mask <2, 3, 6, 7>).
; Their sum is stored back to the null address.
; The assertions expect the two blends to be done as 64-bit-lane operations
; (movsd and unpckhpd against a zeroed register) followed by addps.
179define void @test12() nounwind {
180; CHECK-LABEL: test12:
181; CHECK:       ## BB#0:
182; CHECK-NEXT:    movapd 0, %xmm0
183; CHECK-NEXT:    movapd {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
184; CHECK-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
185; CHECK-NEXT:    xorpd %xmm2, %xmm2
186; CHECK-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
187; CHECK-NEXT:    addps %xmm1, %xmm0
188; CHECK-NEXT:    movaps %xmm0, 0
189; CHECK-NEXT:    retl
190  %tmp1 = load <4 x float>, <4 x float>* null          ; <<4 x float>> [#uses=2]
191  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 >             ; <<4 x float>> [#uses=1]
192  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >                ; <<4 x float>> [#uses=1]
193  %tmp4 = fadd <4 x float> %tmp2, %tmp3            ; <<4 x float>> [#uses=1]
194  store <4 x float> %tmp4, <4 x float>* null
195  ret void
196}
197
; test13: two-input shuffle of *%B and *%C with mask <1, 4, 1, 5>, giving
; { B[1], C[0], B[1], C[1] }, stored to %res. The %A parameter is not
; referenced in the body.
; The assertions expect two shufps instructions: one that combines the two
; sources (second source read from memory) and one that reorders the lanes.
198define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
199; CHECK-LABEL: test13:
200; CHECK:       ## BB#0:
201; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
202; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
203; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
204; CHECK-NEXT:    movaps (%edx), %xmm0
205; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
206; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
207; CHECK-NEXT:    movaps %xmm0, (%eax)
208; CHECK-NEXT:    retl
209  %tmp3 = load <4 x float>, <4 x float>* %B            ; <<4 x float>> [#uses=1]
210  %tmp5 = load <4 x float>, <4 x float>* %C            ; <<4 x float>> [#uses=1]
211  %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 >         ; <<4 x float>> [#uses=1]
212  store <4 x float> %tmp11, <4 x float>* %res
213  ret void
214}
215
; test14: compute *%x + *%y and *%x - *%y, then return a vector made of the
; low two lanes of the sum followed by the low two lanes of the difference
; (mask <0, 1, 4, 5>).
; The assertions expect the final combine to be a single unpcklpd.
216define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind {
217; CHECK-LABEL: test14:
218; CHECK:       ## BB#0:
219; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
220; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
221; CHECK-NEXT:    movaps (%ecx), %xmm1
222; CHECK-NEXT:    movaps (%eax), %xmm2
223; CHECK-NEXT:    movaps %xmm2, %xmm0
224; CHECK-NEXT:    addps %xmm1, %xmm0
225; CHECK-NEXT:    subps %xmm1, %xmm2
226; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
227; CHECK-NEXT:    retl
228  %tmp = load <4 x float>, <4 x float>* %y             ; <<4 x float>> [#uses=2]
229  %tmp5 = load <4 x float>, <4 x float>* %x            ; <<4 x float>> [#uses=2]
230  %tmp9 = fadd <4 x float> %tmp5, %tmp             ; <<4 x float>> [#uses=1]
231  %tmp21 = fsub <4 x float> %tmp5, %tmp            ; <<4 x float>> [#uses=1]
232  %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 >                ; <<4 x float>> [#uses=1]
233  ret <4 x float> %tmp27
234}
235
; test15: return the high two lanes of *%x followed by the high two lanes of
; *%y (mask <2, 3, 6, 7>).
; The assertions expect a single unpckhpd whose second operand is read
; directly from memory.
236define <4 x float> @test15(<4 x float>* %x, <4 x float>* %y) nounwind {
237; CHECK-LABEL: test15:
238; CHECK:       ## BB#0: ## %entry
239; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
240; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
241; CHECK-NEXT:    movapd (%ecx), %xmm0
242; CHECK-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
243; CHECK-NEXT:    retl
244entry:
245  %tmp = load <4 x float>, <4 x float>* %y             ; <<4 x float>> [#uses=1]
246  %tmp3 = load <4 x float>, <4 x float>* %x            ; <<4 x float>> [#uses=1]
247  %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >           ; <<4 x float>> [#uses=1]
248  ret <4 x float> %tmp4
249}
250
251; PR8900
252
; test16: load the <4 x double> at index 3 from %srcA (align 32) and narrow
; it to <2 x double> by picking lanes 0 and 2 (mask <0, 2>). The %dst
; parameter is not referenced in the body.
; The assertions expect the 256-bit load to be handled as a 128-bit movapd
; at offset 96 plus an unpcklpd that takes its second lane from memory.
253define  <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocapture %dst) {
254; CHECK-LABEL: test16:
255; CHECK:       ## BB#0:
256; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
257; CHECK-NEXT:    movapd 96(%eax), %xmm0
258; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
259; CHECK-NEXT:    retl
260  %i5 = getelementptr inbounds <4 x double>, <4 x double>* %srcA, i32 3
261  %i6 = load <4 x double>, <4 x double>* %i5, align 32
262  %i7 = shufflevector <4 x double> %i6, <4 x double> undef, <2 x i32> <i32 0, i32 2>
263  ret <2 x double> %i7
264}
265
266; PR9009
; test17: shuffle the constant <undef, undef, 32768, 32768> with a vector
; whose only inserted element is itself undef, using mask <4, 5, 2, 3>
; (lanes 0-1 from the second operand, lanes 2-3 from the constant). The
; result is bitcast to <4 x float> and stored through an undef pointer.
; The assertions expect the value to fold to a single constant-pool load of
; <u,u,32768,32768> followed by the store.
267define fastcc void @test17() nounwind {
268; CHECK-LABEL: test17:
269; CHECK:       ## BB#0: ## %entry
270; CHECK-NEXT:    movaps {{.*#+}} xmm0 = <u,u,32768,32768>
271; CHECK-NEXT:    movaps %xmm0, (%eax)
272; CHECK-NEXT:    retl
273entry:
274  %0 = insertelement <4 x i32> undef, i32 undef, i32 1
275  %1 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32768, i32 32768>, <4 x i32> %0, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
276  %2 = bitcast <4 x i32> %1 to <4 x float>
277  store <4 x float> %2, <4 x float> * undef
278  ret void
279}
280
281; PR9210
; f: truncate the unnamed <4 x double> argument (%0) to <4 x float> with a
; single fptrunc and return it.
; The assertions expect the conversion to be split into two cvtpd2ps
; instructions (one per xmm half of the input) whose results are joined
; with unpcklpd.
282define <4 x float> @f(<4 x double>) nounwind {
283; CHECK-LABEL: f:
284; CHECK:       ## BB#0: ## %entry
285; CHECK-NEXT:    cvtpd2ps %xmm1, %xmm1
286; CHECK-NEXT:    cvtpd2ps %xmm0, %xmm0
287; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
288; CHECK-NEXT:    retl
289entry:
290 %double2float.i = fptrunc <4 x double> %0 to <4 x float>
291 ret <4 x float> %double2float.i
292}
293
; test_insert_64_zext: keep lane 0 of %i and replace lane 1 with the zero in
; lane 0 of the constant <0, undef> (mask <0, 2>).
; The assertions expect a single movq that keeps the low 64 bits and zeroes
; the high 64 bits.
294define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
295; CHECK-LABEL: test_insert_64_zext:
296; CHECK:       ## BB#0:
297; CHECK-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
298; CHECK-NEXT:    retl
299  %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
300  ret <2 x i64> %1
301}
302
; PR19721: view %i as an i128, AND it with -4294967296 (all bits set except
; the low 32), and view the result as <4 x i32> again.
; The assertions expect the whole sequence to become one andps against a
; constant-pool mask.
303define <4 x i32> @PR19721(<4 x i32> %i) {
304; CHECK-LABEL: PR19721:
305; CHECK:       ## BB#0:
306; CHECK-NEXT:    andps LCPI19_0, %xmm0
307; CHECK-NEXT:    retl
308  %bc = bitcast <4 x i32> %i to i128
309  %insert = and i128 %bc, -4294967296
310  %bc2 = bitcast i128 %insert to <4 x i32>
311  ret <4 x i32> %bc2
312}
313
; test_mul: element-wise multiply of two <4 x i32> vectors.
; The assertions expect the multiply to be built from two pmuludq
; instructions (one on the original operands, one on operands shuffled with
; pshufd [1,1,3,3]), with the two products reshuffled and interleaved by
; punpckldq.
314define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) {
315; CHECK-LABEL: test_mul:
316; CHECK:       ## BB#0:
317; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
318; CHECK-NEXT:    pmuludq %xmm1, %xmm0
319; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
320; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
321; CHECK-NEXT:    pmuludq %xmm2, %xmm1
322; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
323; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
324; CHECK-NEXT:    retl
325  %m = mul <4 x i32> %x, %y
326  ret <4 x i32> %m
327}
328