1; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X32
2; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X64
3
4@g16 = external global i16
5
; Inserting a scalar i32 into lane 1 of a v4i32 should select a single
; PINSRD (folding the stack load on i686; using %edi directly on x86-64).
define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind {
; X32-LABEL: pinsrd_1:
; X32:       ## BB#0:
; X32-NEXT:    pinsrd $1, {{[0-9]+}}(%esp), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: pinsrd_1:
; X64:       ## BB#0:
; X64-NEXT:    pinsrd $1, %edi, %xmm0
; X64-NEXT:    retq
  %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1
  ret <4 x i32> %tmp1
}
19
; Same as pinsrd_1 but for byte elements: a single PINSRB should be selected.
define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind {
; X32-LABEL: pinsrb_1:
; X32:       ## BB#0:
; X32-NEXT:    pinsrb $1, {{[0-9]+}}(%esp), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: pinsrb_1:
; X64:       ## BB#0:
; X64-NEXT:    pinsrb $1, %edi, %xmm0
; X64-NEXT:    retq
  %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1
  ret <16 x i8> %tmp1
}
33
; A scalar i32 load feeding the pmovsxbd intrinsic (via insert into lane 0 of
; a zeroed vector) should fold directly into a memory-operand PMOVSXBD.
define <2 x i64> @pmovsxbd_1(i32* %p) nounwind {
; X32-LABEL: pmovsxbd_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    pmovsxbd (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: pmovsxbd_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    pmovsxbd (%rdi), %xmm0
; X64-NEXT:    retq
entry:
	; Build <loaded, 0, 0, 0>; only the low 4 bytes matter to pmovsxbd.
	%0 = load i32, i32* %p, align 4
	%1 = insertelement <4 x i32> undef, i32 %0, i32 0
	%2 = insertelement <4 x i32> %1, i32 0, i32 1
	%3 = insertelement <4 x i32> %2, i32 0, i32 2
	%4 = insertelement <4 x i32> %3, i32 0, i32 3
	%5 = bitcast <4 x i32> %4 to <16 x i8>
	%6 = tail call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %5) nounwind readnone
	%7 = bitcast <4 x i32> %6 to <2 x i64>
	ret <2 x i64> %7
}
56
; A scalar i64 load inserted into lane 0 and sign-extended via the pmovsxwd
; intrinsic should fold into a memory-operand PMOVSXWD.
define <2 x i64> @pmovsxwd_1(i64* %p) nounwind readonly {
; X32-LABEL: pmovsxwd_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    pmovsxwd (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: pmovsxwd_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    pmovsxwd (%rdi), %xmm0
; X64-NEXT:    retq
entry:
	%0 = load i64, i64* %p		; <i64> [#uses=1]
	%tmp2 = insertelement <2 x i64> zeroinitializer, i64 %0, i32 0		; <<2 x i64>> [#uses=1]
	%1 = bitcast <2 x i64> %tmp2 to <8 x i16>		; <<8 x i16>> [#uses=1]
	%2 = tail call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) nounwind readnone		; <<4 x i32>> [#uses=1]
	%3 = bitcast <4 x i32> %2 to <2 x i64>		; <<2 x i64>> [#uses=1]
	ret <2 x i64> %3
}
76
; A 16-bit load from global @g16 feeding the pmovzxbq intrinsic should fold
; into a memory-operand PMOVZXBQ (through the Darwin non-lazy pointer/GOT).
define <2 x i64> @pmovzxbq_1() nounwind {
; X32-LABEL: pmovzxbq_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movl L_g16$non_lazy_ptr, %eax
; X32-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: pmovzxbq_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    movq _g16@{{.*}}(%rip), %rax
; X64-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    retq
entry:
	%0 = load i16, i16* @g16, align 2		; <i16> [#uses=1]
	%1 = insertelement <8 x i16> undef, i16 %0, i32 0		; <<8 x i16>> [#uses=1]
	%2 = bitcast <8 x i16> %1 to <16 x i8>		; <<16 x i8>> [#uses=1]
	%3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone		; <<2 x i64>> [#uses=1]
	ret <2 x i64> %3
}
96
97declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
98declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
99declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
100
; Extracting float lane 3 and bitcasting to i32 should be a single EXTRACTPS
; straight into a GPR.
define i32 @extractps_1(<4 x float> %v) nounwind {
; X32-LABEL: extractps_1:
; X32:       ## BB#0:
; X32-NEXT:    extractps $3, %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: extractps_1:
; X64:       ## BB#0:
; X64-NEXT:    extractps $3, %xmm0, %eax
; X64-NEXT:    retq
  %s = extractelement <4 x float> %v, i32 3
  %i = bitcast float %s to i32
  ret i32 %i
}
; Same result with the bitcast on the vector side (v4f32 -> v4i32 then
; extract lane 3): still a single EXTRACTPS.
define i32 @extractps_2(<4 x float> %v) nounwind {
; X32-LABEL: extractps_2:
; X32:       ## BB#0:
; X32-NEXT:    extractps $3, %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: extractps_2:
; X64:       ## BB#0:
; X64-NEXT:    extractps $3, %xmm0, %eax
; X64-NEXT:    retq
  %t = bitcast <4 x float> %v to <4 x i32>
  %s = extractelement <4 x i32> %t, i32 3
  ret i32 %s
}
129
130
131; The non-store form of extractps puts its result into a GPR.
132; This makes it suitable for an extract from a <4 x float> that
133; is bitcasted to i32, but unsuitable for much of anything else.
134
; Extract lane 3 as a *float* (not an i32): must NOT use extractps, since its
; destination is a GPR. Expect a shuffle to move lane 3 into lane 0 instead.
define float @ext_1(<4 x float> %v) nounwind {
; X32-LABEL: ext_1:
; X32:       ## BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X32-NEXT:    addss LCPI7_0, %xmm0
; X32-NEXT:    movss %xmm0, (%esp)
; X32-NEXT:    flds (%esp)
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: ext_1:
; X64:       ## BB#0:
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X64-NEXT:    addss {{.*}}(%rip), %xmm0
; X64-NEXT:    retq
  %s = extractelement <4 x float> %v, i32 3
  %t = fadd float %s, 1.0
  ret float %t
}
; Bare float extract of lane 3: shuffle only (x87 return on i686 forces the
; store/load through the stack slot).
define float @ext_2(<4 x float> %v) nounwind {
; X32-LABEL: ext_2:
; X32:       ## BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X32-NEXT:    movss %xmm0, (%esp)
; X32-NEXT:    flds (%esp)
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: ext_2:
; X64:       ## BB#0:
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X64-NEXT:    retq
  %s = extractelement <4 x float> %v, i32 3
  ret float %s
}
; Integer lane extract to a GPR should use PEXTRD.
define i32 @ext_3(<4 x i32> %v) nounwind {
; X32-LABEL: ext_3:
; X32:       ## BB#0:
; X32-NEXT:    pextrd $3, %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: ext_3:
; X64:       ## BB#0:
; X64-NEXT:    pextrd $3, %xmm0, %eax
; X64-NEXT:    retq
  %i = extractelement <4 x i32> %v, i32 3
  ret i32 %i
}
185
; Direct use of the insertps intrinsic; imm8 = 1 sets the zero-mask bit 0,
; so lane 0 of the result is zeroed.
define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
; X32-LABEL: insertps_1:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_1:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3]
; X64-NEXT:    retq
  %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 1) nounwind readnone
  ret <4 x float> %tmp1
}
199
200declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
201
202; When optimizing for speed, prefer blendps over insertps even if it means we have to
203; generate a separate movss to load the scalar operand.
; Scalar-into-lane-0 insert, optimizing for speed: prefer BLENDPS (plus a
; MOVSS load on i686, where the float arg arrives on the stack).
define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind {
; X32-LABEL: blendps_not_insertps_1:
; X32:       ## BB#0:
; X32-NEXT:    movss   {{.*#+}} xmm1
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: blendps_not_insertps_1:
; X64:       ## BB#0:
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
  ret <4 x float> %tmp1
}
218
219; When optimizing for size, generate an insertps if there's a load fold opportunity.
220; The difference between i386 and x86-64 ABIs for the float operand means we should
221; generate an insertps for X32 but not for X64!
; Same insert under minsize: i686 can fold the stack load into INSERTPS
; (smaller), while x86-64 already has %t2 in xmm1 so BLENDPS stays best.
define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize nounwind {
; X32-LABEL: insertps_or_blendps:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_or_blendps:
; X64:       ## BB#0:
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
  ret <4 x float> %tmp1
}
235
236; An insert into the low 32-bits of a vector from the low 32-bits of another vector
237; is always just a blendps because blendps is never more expensive than insertps.
; Lane 0 -> lane 0 between two vectors: always a BLENDPS, never INSERTPS.
define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
; X32-LABEL: blendps_not_insertps_2:
; X32:       ## BB#0:
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: blendps_not_insertps_2:
; X64:       ## BB#0:
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %tmp2 = extractelement <4 x float> %t2, i32 0
  %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
  ret <4 x float> %tmp1
}
252
; ptestz intrinsic: PTEST + SETE materializes the ZF-based result.
define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_1:
; X32:       ## BB#0:
; X32-NEXT:    ptest %xmm1, %xmm0
; X32-NEXT:    sete %al
; X32-NEXT:    movzbl %al, %eax
; X32-NEXT:    retl
;
; X64-LABEL: ptestz_1:
; X64:       ## BB#0:
; X64-NEXT:    ptest %xmm1, %xmm0
; X64-NEXT:    sete %al
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
  %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}
270
; ptestc intrinsic: PTEST sets CF, which is turned into 0/1 via SBB + AND.
define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_2:
; X32:       ## BB#0:
; X32-NEXT:    ptest %xmm1, %xmm0
; X32-NEXT:    sbbl %eax, %eax
; X32-NEXT:    andl $1, %eax
; X32-NEXT:    retl
;
; X64-LABEL: ptestz_2:
; X64:       ## BB#0:
; X64-NEXT:    ptest %xmm1, %xmm0
; X64-NEXT:    sbbl %eax, %eax
; X64-NEXT:    andl $1, %eax
; X64-NEXT:    retq
  %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}
288
; ptestnzc intrinsic: PTEST + SETA (ZF==0 and CF==0).
define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_3:
; X32:       ## BB#0:
; X32-NEXT:    ptest %xmm1, %xmm0
; X32-NEXT:    seta %al
; X32-NEXT:    movzbl %al, %eax
; X32-NEXT:    retl
;
; X64-LABEL: ptestz_3:
; X64:       ## BB#0:
; X64-NEXT:    ptest %xmm1, %xmm0
; X64-NEXT:    seta %al
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
  %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}
306
307
308declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
309declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
310declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
311
312; This used to compile to insertps $0  + insertps $16.  insertps $0 is always
313; pointless.
; Complex-style add of two <2 x float>: element-wise extract/add/rebuild.
; Regression check that no useless insertps $0 is emitted (see comment above).
define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind  {
; X32-LABEL: buildvector:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X32-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; X32-NEXT:    addss %xmm1, %xmm0
; X32-NEXT:    addss %xmm2, %xmm3
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: buildvector:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X64-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; X64-NEXT:    addss %xmm1, %xmm0
; X64-NEXT:    addss %xmm2, %xmm3
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
; X64-NEXT:    retq
entry:
  %tmp7 = extractelement <2 x float> %A, i32 0
  %tmp5 = extractelement <2 x float> %A, i32 1
  %tmp3 = extractelement <2 x float> %B, i32 0
  %tmp1 = extractelement <2 x float> %B, i32 1
  %add.r = fadd float %tmp7, %tmp3
  %add.i = fadd float %tmp5, %tmp1
  %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0
  %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1
  ret <2 x float> %tmp9
}
343
; shufflevector taking lane 0 of a loaded vector into lane 3 of %a should
; become a load-folding INSERTPS.
define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X32-LABEL: insertps_from_shufflevector_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_shufflevector_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT:    retq
entry:
  %0 = load <4 x float>, <4 x float>* %pb, align 16
  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit6
}
360
; Register-to-register shuffle placing %b lane 1 into %a lane 2: one INSERTPS.
define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
; X32-LABEL: insertps_from_shufflevector_2:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_shufflevector_2:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X64-NEXT:    retq
entry:
  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
  ret <4 x float> %vecinit6
}
375
376; For loading an i32 from memory into an xmm register we use pinsrd
377; instead of insertps
; Integer variant of insertps_from_shufflevector_1; currently lowers to a
; pshufd of the loaded vector + pblendw rather than a single pinsrd.
define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
; X32-LABEL: pinsrd_from_shufflevector_i32:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    pshufd {{.*#+}} xmm1 = mem[0,1,2,0]
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: pinsrd_from_shufflevector_i32:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    pshufd {{.*#+}} xmm1 = mem[0,1,2,0]
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X64-NEXT:    retq
entry:
  %0 = load <4 x i32>, <4 x i32>* %pb, align 16
  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit6
}
396
; Integer shuffle pulling %b lane 3 into %a lane 1: pshufd + pblendw.
define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
; X32-LABEL: insertps_from_shufflevector_i32_2:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_shufflevector_i32_2:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X64-NEXT:    retq
entry:
  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
  ret <4 x i32> %vecinit6
}
413
; Scalar float load inserted into lane 1 via insertelement + shufflevector:
; should fold into a single memory-operand INSERTPS.
define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
; X32-LABEL: insertps_from_load_ins_elt_undef:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_load_ins_elt_undef:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X64-NEXT:    retq
  %1 = load float, float* %b, align 4
  %2 = insertelement <4 x float> undef, float %1, i32 0
  %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
  ret <4 x float> %result
}
430
431; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
; Integer variant: scalar i32 load into lane 2; currently movd + pshufd +
; pblendw (see TODO above about removing the extra mov).
define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
; X32-LABEL: insertps_from_load_ins_elt_undef_i32:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_load_ins_elt_undef_i32:
; X64:       ## BB#0:
; X64-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; X64-NEXT:    retq
  %1 = load i32, i32* %b, align 4
  %2 = insertelement <4 x i32> undef, i32 %1, i32 0
  %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
  ret <4 x i32> %result
}
452
453;;;;;; Shuffles optimizable with a single insertps or blend instruction
; Build <x0, x1, x2, 0.0>: keep lanes 0-2, zero lane 3 -> xorps + blendps.
define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XYZ0:
; X32:       ## BB#0:
; X32-NEXT:    xorps %xmm1, %xmm1
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; X32-NEXT:    retl
;
; X64-LABEL: shuf_XYZ0:
; X64:       ## BB#0:
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecext3 = extractelement <4 x float> %x, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
  ret <4 x float> %vecinit5
}
475
; Build <x0, x1, 0, 0>: recognized as a MOVQ (zero-extend of the low 64 bits).
define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XY00:
; X32:       ## BB#0:
; X32-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; X32-NEXT:    retl
;
; X64-LABEL: shuf_XY00:
; X64:       ## BB#0:
; X64-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
  ret <4 x float> %vecinit4
}
494
; Build <x0, x1, x1, 0>: one INSERTPS using its zero-mask for lane 3.
define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XYY0:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
; X32-NEXT:    retl
;
; X64-LABEL: shuf_XYY0:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext1, i32 2
  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
  ret <4 x float> %vecinit5
}
513
; Build <x0, x1, x3, 0>: one INSERTPS with zero-mask on lane 3.
define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XYW0:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
; X32-NEXT:    retl
;
; X64-LABEL: shuf_XYW0:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecext2 = extractelement <4 x float> %x, i32 3
  %vecinit3 = insertelement <4 x float> %vecinit2, float %vecext2, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
  ret <4 x float> %vecinit4
}
533
; Build <x3, 0, 0, x3>: one INSERTPS with zero-mask on lanes 1-2.
define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_W00W:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
; X32-NEXT:    retl
;
; X64-LABEL: shuf_W00W:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 3
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float %vecext, i32 3
  ret <4 x float> %vecinit4
}
551
; Build <x0, 0, 0, a0>: needs two instructions since two sources are mixed
; with zeroing — a zeroing blendps then an insertps from %a.
define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_X00A:
; X32:       ## BB#0:
; X32-NEXT:    xorps %xmm2, %xmm2
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X32-NEXT:    retl
;
; X64-LABEL: shuf_X00A:
; X64:       ## BB#0:
; X64-NEXT:    xorps %xmm2, %xmm2
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
  %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit4
}
573
; Build <x0, 0, 0, x0>: single-source, so one INSERTPS with zero-mask works.
define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_X00X:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; X32-NEXT:    retl
;
; X64-LABEL: shuf_X00X:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
  %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit4
}
591
; Build <x0, 0, x1, a2>: two INSERTPS — first <x0,0,x1,0>, then a2 into lane 3.
define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_X0YC:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
; X32-NEXT:    retl
;
; X64-LABEL: shuf_X0YC:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
  ret <4 x float> %vecinit5
}
611
; Integer twin of shuf_XYZ0: <x0, x1, x2, 0> via pxor + pblendw.
define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XYZ0:
; X32:       ## BB#0:
; X32-NEXT:    pxor %xmm1, %xmm1
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_XYZ0:
; X64:       ## BB#0:
; X64-NEXT:    pxor %xmm1, %xmm1
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecext3 = extractelement <4 x i32> %x, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2
  %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
  ret <4 x i32> %vecinit5
}
633
; Integer twin of shuf_XY00: <x0, x1, 0, 0> is a MOVQ.
define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XY00:
; X32:       ## BB#0:
; X32-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_XY00:
; X64:       ## BB#0:
; X64-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
  ret <4 x i32> %vecinit4
}
652
; Integer twin of shuf_XYY0: no integer insertps, so pshufd + pxor + pblendw.
define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XYY0:
; X32:       ## BB#0:
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
; X32-NEXT:    pxor %xmm0, %xmm0
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_XYY0:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
; X64-NEXT:    pxor %xmm0, %xmm0
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext1, i32 2
  %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
  ret <4 x i32> %vecinit5
}
675
; Integer twin of shuf_XYW0: pshufd + pxor + pblendw.
define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XYW0:
; X32:       ## BB#0:
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,3,3]
; X32-NEXT:    pxor %xmm0, %xmm0
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_XYW0:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,3,3]
; X64-NEXT:    pxor %xmm0, %xmm0
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecext2 = extractelement <4 x i32> %x, i32 3
  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %vecext2, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
  ret <4 x i32> %vecinit4
}
699
; Integer twin of shuf_W00W: pshufd + pxor + pblendw.
define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_W00W:
; X32:       ## BB#0:
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; X32-NEXT:    pxor %xmm0, %xmm0
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_W00W:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; X64-NEXT:    pxor %xmm0, %xmm0
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 3
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 %vecext, i32 3
  ret <4 x i32> %vecinit4
}
721
; Integer twin of shuf_X00A: zeroing pblendw, then pshufd + pblendw to place
; a0 into lane 3.
define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_X00A:
; X32:       ## BB#0:
; X32-NEXT:    pxor %xmm2, %xmm2
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_X00A:
; X64:       ## BB#0:
; X64-NEXT:    pxor %xmm2, %xmm2
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
  %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit4
}
745
; Integer twin of shuf_X00X: single source, pshufd + pxor + pblendw.
define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_X00X:
; X32:       ## BB#0:
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
; X32-NEXT:    pxor %xmm0, %xmm0
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_X00X:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
; X64-NEXT:    pxor %xmm0, %xmm0
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
  %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit4
}
767
; Integer twin of shuf_X0YC: pmovzxdq builds <x0,0,x1,0>, then pshufd +
; pblendw places a2 into lane 3.
define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_X0YC:
; X32:       ## BB#0:
; X32-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_X0YC:
; X64:       ## BB#0:
; X64-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
  ret <4 x i32> %vecinit5
}
789
790;; Test for a bug in the first implementation of LowerBuildVectorv4x32
; The built vector feeds a compare/select (max), so its lanes must all be
; well-defined — the zeroing blend must land in a fresh register (xmm1),
; keeping the original %x intact for the maxps.
define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
; X32-LABEL: test_insertps_no_undef:
; X32:       ## BB#0:
; X32-NEXT:    xorps %xmm1, %xmm1
; X32-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
; X32-NEXT:    maxps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_insertps_no_undef:
; X64:       ## BB#0:
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
; X64-NEXT:    maxps %xmm1, %xmm0
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecext3 = extractelement <4 x float> %x, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
  %mask = fcmp olt <4 x float> %vecinit5, %x
  %res = select  <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5
  ret <4 x float> %res
}
816
define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
; X32-LABEL: blendvb_fallback:
; X32:       ## BB#0:
; X32-NEXT:    psllw $15, %xmm0
; X32-NEXT:    psraw $15, %xmm0
; X32-NEXT:    pblendvb %xmm1, %xmm2
; X32-NEXT:    movdqa %xmm2, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: blendvb_fallback:
; X64:       ## BB#0:
; X64-NEXT:    psllw $15, %xmm0
; X64-NEXT:    psraw $15, %xmm0
; X64-NEXT:    pblendvb %xmm1, %xmm2
; X64-NEXT:    movdqa %xmm2, %xmm0
; X64-NEXT:    retq
  ; Variable <8 x i1> select falls back to pblendvb: the i1 mask is
  ; sign-extended in-register (psllw/psraw $15) to form the byte mask.
  %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
  ret <8 x i16> %ret
}
836
837; On X32, account for the argument's move to registers
define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X32-LABEL: insertps_from_vector_load:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_vector_load:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT:    retq
  ; insertps imm 48 (0x30): source lane 0, dest lane 3, no zeroing.
  ; The vector load should be folded into the insertps memory operand.
  %1 = load <4 x float>, <4 x float>* %pb, align 16
  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
  ret <4 x float> %2
}
853
854;; Use a non-zero CountS for insertps
855;; Try to match a bit more of the instr, since we need the load's offset.
define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X32-LABEL: insertps_from_vector_load_offset:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[1],xmm0[3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_vector_load_offset:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[1],xmm0[3]
; X64-NEXT:    retq
  ; insertps imm 96 (0x60): source lane 1, dest lane 2 — a non-zero CountS,
  ; so folding the load must adjust the memory offset to element 1.
  %1 = load <4 x float>, <4 x float>* %pb, align 16
  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
  ret <4 x float> %2
}
871
872;; Try to match a bit more of the instr, since we need the load's offset.
define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
; X32-LABEL: insertps_from_vector_load_offset_2:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    shll $4, %ecx
; X32-NEXT:    insertps {{.*#+}} xmm0 = mem[3],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_vector_load_offset_2:
; X64:       ## BB#0:
; X64-NEXT:    shlq $4, %rsi
; X64-NEXT:    insertps {{.*#+}} xmm0 = mem[3],xmm0[1,2,3]
; X64-NEXT:    retq
  ; insertps imm 192 (0xC0): source lane 3, dest lane 0. The GEP index is
  ; scaled by 16 (shl $4) and the indexed load folds into insertps.
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
  %2 = load <4 x float>, <4 x float>* %1, align 16
  %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
  ret <4 x float> %3
}
892
define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
; X32-LABEL: insertps_from_broadcast_loadf32:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_broadcast_loadf32:
; X64:       ## BB#0:
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X64-NEXT:    retq
  ; Splat of a scalar load fed to insertps (imm 48 => source lane 0 into
  ; dest lane 3); only lane 0 of the splat is actually consumed.
  %1 = getelementptr inbounds float, float* %fb, i64 %index
  %2 = load float, float* %1, align 4
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
  ret <4 x float> %7
}
918
define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
; X32-LABEL: insertps_from_broadcast_loadv4f32:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movups (%eax), %xmm1
; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_broadcast_loadv4f32:
; X64:       ## BB#0:
; X64-NEXT:    movups (%rdi), %xmm1
; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X64-NEXT:    retq
  ; Same as insertps_from_broadcast_loadf32, but the splatted scalar comes
  ; from element 0 of an unaligned (align 4) vector load.
  %1 = load <4 x float>, <4 x float>* %b, align 4
  %2 = extractelement <4 x float> %1, i32 0
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
  ret <4 x float> %7
}
943
944;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
; X32-LABEL: insertps_from_broadcast_multiple_use:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; X32-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,0,0,0]
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; X32-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; X32-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
; X32-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; X32-NEXT:    addps %xmm1, %xmm0
; X32-NEXT:    addps %xmm2, %xmm3
; X32-NEXT:    addps %xmm3, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_broadcast_multiple_use:
; X64:       ## BB#0:
; X64-NEXT:    movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; X64-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,0,0,0]
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; X64-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; X64-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
; X64-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; X64-NEXT:    addps %xmm1, %xmm0
; X64-NEXT:    addps %xmm2, %xmm3
; X64-NEXT:    addps %xmm3, %xmm0
; X64-NEXT:    retq
  ; The splatted load feeds FOUR insertps calls (lane 3 of a, b, c, d);
  ; checks the broadcast is materialized once and shared, then summed.
  %1 = getelementptr inbounds float, float* %fb, i64 %index
  %2 = load float, float* %1, align 4
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
  %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
  %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
  %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
  %11 = fadd <4 x float> %7, %8
  %12 = fadd <4 x float> %9, %10
  %13 = fadd <4 x float> %11, %12
  ret <4 x float> %13
}
988
define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) {
; X32-LABEL: insertps_with_undefs:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X32-NEXT:    movapd %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: insertps_with_undefs:
; X64:       ## BB#0:
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-NEXT:    movapd %xmm1, %xmm0
; X64-NEXT:    retq
  ; Shuffle mask <4, undef, 0, 7> with undef lanes: result is
  ; <load, undef, a[0], undef>, lowered here as movss + unpcklpd.
  %1 = load float, float* %b, align 4
  %2 = insertelement <4 x float> undef, float %1, i32 0
  %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 4, i32 undef, i32 0, i32 7>
  ret <4 x float> %result
}
1009
1010; Test for a bug in X86ISelLowering.cpp:getINSERTPS where we were using
1011; the destination index to change the load, instead of the source index.
define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
; X32-LABEL: pr20087:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[2]
; X32-NEXT:    retl
;
; X64-LABEL: pr20087:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[2]
; X64-NEXT:    retq
  ; PR20087: folding the load into insertps must use the SOURCE index
  ; (element 2 of the loaded vector), not the destination index (lane 3).
  %load = load <4 x float> , <4 x float> *%ptr
  %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2>
  ret <4 x float> %ret
}
1027
1028; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1>
define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32* noalias nocapture %RET) #1 {
; X32-LABEL: insertps_pr20411:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X32-NEXT:    movdqu %xmm1, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: insertps_pr20411:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X64-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X64-NEXT:    movdqu %xmm1, (%rdi)
; X64-NEXT:    retq
  ; PR20411: shuffle mask <0, 7, undef, undef> must not be miscompiled by
  ; the insertps lowering; here it becomes pshufd + pblendw and a store.
  %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 0, i32 7, i32 undef, i32 undef>
  %ptrcast = bitcast i32* %RET to <4 x i32>*
  store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4
  ret void
}
1049
define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_4:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_4:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
; X64-NEXT:    retq
entry:
  ; Builds <A[0], 0, B[2], 0>; expected to fold to a single insertps
  ; using its zero-mask bits for lanes 1 and 3.
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
  %vecext2 = extractelement <4 x float> %B, i32 2
  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
}
1069
define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_5:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_5:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
; X64-NEXT:    retq
entry:
  ; Builds <A[0], B[1], 0, 0>; a single insertps zeroing lanes 2 and 3.
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %B, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
}
1089
define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_6:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_6:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
; X64-NEXT:    retq
entry:
  ; Builds <0, A[1], B[2], 0>; a single insertps zeroing lanes 0 and 3.
  %vecext = extractelement <4 x float> %A, i32 1
  %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
  %vecext1 = extractelement <4 x float> %B, i32 2
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit3
}
1108
define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_7:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_7:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
; X64-NEXT:    retq
entry:
  ; Builds <A[0], 0, B[1], 0>; a single insertps zeroing lanes 1 and 3.
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
  %vecext2 = extractelement <4 x float> %B, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
}
1128
define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_8:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_8:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; X64-NEXT:    retq
entry:
  ; Builds <A[0], B[0], 0, 0>; a single insertps zeroing lanes 2 and 3.
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %B, i32 0
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
}
1148
define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_9:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
; X32-NEXT:    movaps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: insertps_9:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
; X64-NEXT:    movaps %xmm1, %xmm0
; X64-NEXT:    retq
entry:
  ; Builds <0, A[0], B[2], 0>; insertps writes into B's register, so an
  ; extra movaps copies the result back to xmm0.
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
  %vecext1 = extractelement <4 x float> %B, i32 2
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit3
}
1169
define <4 x float> @insertps_10(<4 x float> %A)
; X32-LABEL: insertps_10:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_10:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero
; X64-NEXT:    retq
{
  ; Inserts A[0] into lanes 0 and 2 of an all-zero vector, yielding
  ; <A[0], 0, A[0], 0> from a single insertps with zero-mask bits set.
  %vecext = extractelement <4 x float> %A, i32 0
  %vecbuild1 = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %vecext, i32 0
  %vecbuild2 = insertelement <4 x float> %vecbuild1, float %vecext, i32 2
  ret <4 x float> %vecbuild2
}
1186
define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) {
; X32-LABEL: build_vector_to_shuffle_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    xorps %xmm1, %xmm1
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; X32-NEXT:    retl
;
; X64-LABEL: build_vector_to_shuffle_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; X64-NEXT:    retq
entry:
  ; Builds <0, A[1], 0, A[3]>; the insert chain plus trailing shuffle
  ; should collapse to a blend of A with a zero vector.
  %vecext = extractelement <4 x float> %A, i32 1
  %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x float> %vecinit3
}
1206
define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) {
; X32-LABEL: build_vector_to_shuffle_2:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    xorps %xmm1, %xmm1
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: build_vector_to_shuffle_2:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; X64-NEXT:    retq
entry:
  ; Builds <0, A[1], 0, 0> (lane 3 inherits zeroinitializer); should
  ; collapse to a blend of A with a zero vector.
  %vecext = extractelement <4 x float> %A, i32 1
  %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
  ret <4 x float> %vecinit1
}
1225