; SSE4.1 instruction-selection tests, compiled for i686 and x86-64 Darwin with
; -mcpu=penryn. Each target's expected assembly is matched under its own
; FileCheck prefix.
; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X64

; External i16 global; loaded by @pmovzxbq_1.
@g16 = external global i16

; A scalar i32 inserted into lane 1 of a <4 x i32> should select a single
; pinsrd: i686 folds the scalar from its stack slot, x86-64 takes it from %edi.
define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind {
; X32-LABEL: pinsrd_1:
; X32:       ## BB#0:
; X32-NEXT:    pinsrd $1, {{[0-9]+}}(%esp), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: pinsrd_1:
; X64:       ## BB#0:
; X64-NEXT:    pinsrd $1, %edi, %xmm0
; X64-NEXT:    retq
  %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1
  ret <4 x i32> %tmp1
}

; A scalar i8 inserted into lane 1 of a <16 x i8> should select a single
; pinsrb: i686 folds the scalar from its stack slot, x86-64 takes it from %edi.
define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind {
; X32-LABEL: pinsrb_1:
; X32:       ## BB#0:
; X32-NEXT:    pinsrb $1, {{[0-9]+}}(%esp), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: pinsrb_1:
; X64:       ## BB#0:
; X64-NEXT:    pinsrb $1, %edi, %xmm0
; X64-NEXT:    retq
  %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1
  ret <16 x i8> %tmp1
}

; Loads the i16 global @g16 into lane 0 of an otherwise-undef <8 x i16>,
; bitcasts it to <16 x i8> and passes it to the pmovzxbq intrinsic. The expected
; code materializes the address of g16 and folds the load into pmovzxbq
; (memory source operand).
define <2 x i64> @pmovzxbq_1() nounwind {
; X32-LABEL: pmovzxbq_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movl L_g16$non_lazy_ptr, %eax
; X32-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: pmovzxbq_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    movq _g16@{{.*}}(%rip), %rax
; X64-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    retq
entry:
	%0 = load i16, i16* @g16, align 2		; <i16> [#uses=1]
	%1 = insertelement <8 x i16> undef, i16 %0, i32 0		; <<8 x i16>> [#uses=1]
	%2 = bitcast <8 x i16> %1 to <16 x i8>		; <<16 x i8>> [#uses=1]
	%3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone		; <<2 x i64>> [#uses=1]
	ret <2 x i64> %3
}

; Intrinsic used by @pmovzxbq_1.
declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone

; Extracting lane 3 of a <4 x float> and bitcasting the scalar to i32 should
; select extractps with a GPR destination on both targets.
define i32 @extractps_1(<4 x float> %v) nounwind {
; X32-LABEL: extractps_1:
; X32:       ## BB#0:
; X32-NEXT:    extractps $3, %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: extractps_1:
; X64:       ## BB#0:
; X64-NEXT:    extractps $3, %xmm0, %eax
; X64-NEXT:    retq
  %s = extractelement <4 x float> %v, i32 3
  %i = bitcast float %s to i32
  ret i32 %i
}
; Same result as extractps_1 reached in the opposite order: the vector is
; bitcast to <4 x i32> first and lane 3 is extracted afterwards. extractps is
; still the expected instruction.
define i32 @extractps_2(<4 x float> %v) nounwind {
; X32-LABEL: extractps_2:
; X32:       ## BB#0:
; X32-NEXT:    extractps $3, %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: extractps_2:
; X64:       ## BB#0:
; X64-NEXT:    extractps $3, %xmm0, %eax
; X64-NEXT:    retq
  %t = bitcast <4 x float> %v to <4 x i32>
  %s = extractelement <4 x i32> %t, i32 3
  ret i32 %s
}


; The non-store form of extractps puts its result into a GPR.
; This makes it suitable for an extract from a <4 x float> that
; is bitcast to i32, but unsuitable for much of anything else.

; Here the extracted float feeds an fadd, so the expected code moves lane 3
; into lane 0 with shufps and stays in the SSE domain instead of using
; extractps. On i686 the float result is returned via the x87 stack.
define float @ext_1(<4 x float> %v) nounwind {
; X32-LABEL: ext_1:
; X32:       ## BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X32-NEXT:    addss LCPI5_0, %xmm0
; X32-NEXT:    movss %xmm0, (%esp)
; X32-NEXT:    flds (%esp)
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: ext_1:
; X64:       ## BB#0:
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X64-NEXT:    addss {{.*}}(%rip), %xmm0
; X64-NEXT:    retq
  %s = extractelement <4 x float> %v, i32 3
  %t = fadd float %s, 1.0
  ret float %t
}
; Returning the extracted float directly (no arithmetic) is also expected to
; use shufps rather than extractps. On i686 the value is spilled with movss and
; reloaded with flds for the x87 return.
define float @ext_2(<4 x float> %v) nounwind {
; X32-LABEL: ext_2:
; X32:       ## BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X32-NEXT:    movss %xmm0, (%esp)
; X32-NEXT:    flds (%esp)
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: ext_2:
; X64:       ## BB#0:
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X64-NEXT:    retq
  %s = extractelement <4 x float> %v, i32 3
  ret float %s
}
; Integer counterpart: extracting lane 3 of a <4 x i32> should select pextrd
; with a GPR destination.
define i32 @ext_3(<4 x i32> %v) nounwind {
; X32-LABEL: ext_3:
; X32:       ## BB#0:
; X32-NEXT:    pextrd $3, %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: ext_3:
; X64:       ## BB#0:
; X64-NEXT:    pextrd $3, %xmm0, %eax
; X64-NEXT:    retq
  %i = extractelement <4 x i32> %v, i32 3
  ret i32 %i
}

; Direct use of the insertps intrinsic with immediate 21 (0b00010101):
; CountS=0, CountD=1, zero mask 0b0101. %t2[0] lands in lane 1 and lanes 0 and
; 2 are zeroed, which is what the decoded shuffle in the expected output shows.
define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
; X32-LABEL: insertps_1:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm1[0],zero,xmm0[3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_1:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm1[0],zero,xmm0[3]
; X64-NEXT:    retq
  %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 21) nounwind readnone
  ret <4 x float> %tmp1
}

; insertps intrinsic: (destination vector, source vector, 8-bit control immediate).
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone

; When optimizing for speed, prefer blendps over insertps even if it means we have to
; generate a separate movss to load the scalar operand.
; On i686 the float argument is loaded from memory with movss first; on x86-64
; it is already in xmm1, so only the blendps is expected.
define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind {
; X32-LABEL: blendps_not_insertps_1:
; X32:       ## BB#0:
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: blendps_not_insertps_1:
; X64:       ## BB#0:
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
  ret <4 x float> %tmp1
}

; When optimizing for size, generate an insertps if there's a load fold opportunity.
; The difference between i386 and x86-64 ABIs for the float operand means we should
; generate an insertps for X32 but not for X64!
; Same IR as blendps_not_insertps_1 except for the minsize attribute.
define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize nounwind {
; X32-LABEL: insertps_or_blendps:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_or_blendps:
; X64:       ## BB#0:
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
  ret <4 x float> %tmp1
}

; An insert into the low 32-bits of a vector from the low 32-bits of another vector
; is always just a blendps because blendps is never more expensive than insertps.
; Both operands are vector registers here, so the expected output is identical
; on both targets.
define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
; X32-LABEL: blendps_not_insertps_2:
; X32:       ## BB#0:
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: blendps_not_insertps_2:
; X64:       ## BB#0:
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %tmp2 = extractelement <4 x float> %t2, i32 0
  %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
  ret <4 x float> %tmp1
}

; ptestz intrinsic: expects ptest followed by sete into a pre-zeroed %eax.
define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_1:
; X32:       ## BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    ptest %xmm1, %xmm0
; X32-NEXT:    sete %al
; X32-NEXT:    retl
;
; X64-LABEL: ptestz_1:
; X64:       ## BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    ptest %xmm1, %xmm0
; X64-NEXT:    sete %al
; X64-NEXT:    retq
  %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}

; Despite the function name, this exercises the ptestc intrinsic. The expected
; code reads the carry flag after ptest via sbbl and masks it to 0/1 with andl.
define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_2:
; X32:       ## BB#0:
; X32-NEXT:    ptest %xmm1, %xmm0
; X32-NEXT:    sbbl %eax, %eax
; X32-NEXT:    andl $1, %eax
; X32-NEXT:    retl
;
; X64-LABEL: ptestz_2:
; X64:       ## BB#0:
; X64-NEXT:    ptest %xmm1, %xmm0
; X64-NEXT:    sbbl %eax, %eax
; X64-NEXT:    andl $1, %eax
; X64-NEXT:    retq
  %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}

; Despite the function name, this exercises the ptestnzc intrinsic. The
; expected code is ptest followed by seta into a pre-zeroed %eax.
define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_3:
; X32:       ## BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    ptest %xmm1, %xmm0
; X32-NEXT:    seta %al
; X32-NEXT:    retl
;
; X64-LABEL: ptestz_3:
; X64:       ## BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    ptest %xmm1, %xmm0
; X64-NEXT:    seta %al
; X64-NEXT:    retq
  %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}


; SSE4.1 ptest intrinsics called by ptestz_1, ptestz_2 and ptestz_3.
declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone

; This used to compile to insertps $0  + insertps $16.  insertps $0 is always
; pointless.
; Element-wise fadd of two <2 x float> values, rebuilt with insertelement. Only
; one insertps (placing the second sum in lane 1) is expected.
define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind  {
; X32-LABEL: buildvector:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X32-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; X32-NEXT:    addss %xmm1, %xmm0
; X32-NEXT:    addss %xmm2, %xmm3
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: buildvector:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X64-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; X64-NEXT:    addss %xmm1, %xmm0
; X64-NEXT:    addss %xmm2, %xmm3
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
; X64-NEXT:    retq
entry:
  %tmp7 = extractelement <2 x float> %A, i32 0
  %tmp5 = extractelement <2 x float> %A, i32 1
  %tmp3 = extractelement <2 x float> %B, i32 0
  %tmp1 = extractelement <2 x float> %B, i32 1
  %add.r = fadd float %tmp7, %tmp3
  %add.i = fadd float %tmp5, %tmp1
  %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0
  %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1
  ret <2 x float> %tmp9
}

; shufflevector keeping lanes 0-2 of %a and taking lane 3 from lane 0 of a
; loaded vector. Expects a single insertps with the load folded as its memory
; operand (i686 first loads the pointer argument from the stack).
define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X32-LABEL: insertps_from_shufflevector_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_shufflevector_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT:    retq
entry:
  %0 = load <4 x float>, <4 x float>* %pb, align 16
  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit6
}

; Register-register variant: lane 2 of the result comes from %b[1], all other
; lanes from %a. Expects a single insertps.
define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
; X32-LABEL: insertps_from_shufflevector_2:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_shufflevector_2:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X64-NEXT:    retq
entry:
  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
  ret <4 x float> %vecinit6
}

; For loading an i32 from memory into an xmm register we use pinsrd
; instead of insertps
; NOTE(review): the expected output below is currently pshufd (with the load
; folded) followed by pblendw, not pinsrd; the sentence above looks stale and
; should be confirmed against the intended lowering.
define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
; X32-LABEL: pinsrd_from_shufflevector_i32:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    pshufd {{.*#+}} xmm1 = mem[0,1,2,0]
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: pinsrd_from_shufflevector_i32:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    pshufd {{.*#+}} xmm1 = mem[0,1,2,0]
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X64-NEXT:    retq
entry:
  %0 = load <4 x i32>, <4 x i32>* %pb, align 16
  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit6
}

; Integer single-lane replacement: lane 1 of the result comes from %b[3]. The
; expected lowering is pshufd (to move %b[3] into position) plus pblendw.
define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
; X32-LABEL: insertps_from_shufflevector_i32_2:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_shufflevector_i32_2:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X64-NEXT:    retq
entry:
  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
  ret <4 x i32> %vecinit6
}

; A scalar float load is inserted into lane 0 of an undef vector and then
; shuffled into lane 1 of %a. Expects one insertps with the scalar load folded.
define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
; X32-LABEL: insertps_from_load_ins_elt_undef:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_load_ins_elt_undef:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X64-NEXT:    retq
  %1 = load float, float* %b, align 4
  %2 = insertelement <4 x float> undef, float %1, i32 0
  %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
  ret <4 x float> %result
}

; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
; i32 variant: the loaded scalar replaces lane 2 of %a. The expected lowering
; is currently movd + pshufd + pblendw.
define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
; X32-LABEL: insertps_from_load_ins_elt_undef_i32:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_load_ins_elt_undef_i32:
; X64:       ## BB#0:
; X64-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; X64-NEXT:    retq
  %1 = load i32, i32* %b, align 4
  %2 = insertelement <4 x i32> undef, i32 %1, i32 0
  %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
  ret <4 x i32> %result
}

;;;;;; Shuffles optimizable with a single insertps or blend instruction
; Function names spell the result lanes: X/Y/Z/W are lanes 0..3 of %x, A/C are
; lanes 0/2 of %a, and 0 is a zero constant.

; XYZ0: lanes 0-2 of %x with lane 3 zeroed; expects xorps + blendps.
define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XYZ0:
; X32:       ## BB#0:
; X32-NEXT:    xorps %xmm1, %xmm1
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; X32-NEXT:    retl
;
; X64-LABEL: shuf_XYZ0:
; X64:       ## BB#0:
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecext3 = extractelement <4 x float> %x, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
  ret <4 x float> %vecinit5
}

; XY00: lanes 0-1 of %x with the upper two lanes zeroed; expects a single movq
; that keeps the low 64 bits and zeroes the rest.
define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XY00:
; X32:       ## BB#0:
; X32-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; X32-NEXT:    retl
;
; X64-LABEL: shuf_XY00:
; X64:       ## BB#0:
; X64-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
  ret <4 x float> %vecinit4
}

; XYY0: lane 1 of %x duplicated into lane 2 and lane 3 zeroed; expects a single
; insertps that does both the copy and the zeroing.
define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XYY0:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
; X32-NEXT:    retl
;
; X64-LABEL: shuf_XYY0:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext1, i32 2
  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
  ret <4 x float> %vecinit5
}

; XYW0: lane 3 of %x moved into lane 2 and lane 3 zeroed; expects a single
; insertps.
define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XYW0:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
; X32-NEXT:    retl
;
; X64-LABEL: shuf_XYW0:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecext2 = extractelement <4 x float> %x, i32 3
  %vecinit3 = insertelement <4 x float> %vecinit2, float %vecext2, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
  ret <4 x float> %vecinit4
}

; W00W: lane 3 of %x in lanes 0 and 3 with the middle lanes zeroed; expects a
; single insertps.
define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_W00W:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
; X32-NEXT:    retl
;
; X64-LABEL: shuf_W00W:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 3
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float %vecext, i32 3
  ret <4 x float> %vecinit4
}

; X00A: lane 0 of %x, two zero lanes, then lane 0 of %a in lane 3; expects a
; single insertps that inserts %a[0] and zeroes lanes 1-2.
define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_X00A:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
; X32-NEXT:    retl
;
; X64-LABEL: shuf_X00A:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
  %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit4
}

; X00X: like shuf_X00A, but lane 3 is also taken from lane 0 of %x; expects a
; single insertps with xmm0 as both source and destination.
define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_X00X:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; X32-NEXT:    retl
;
; X64-LABEL: shuf_X00X:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
  %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit4
}

; X0YC: lane 0 of %x, zero, lane 1 of %x, lane 2 of %a. Two sources plus a zero
; lane need two insertps instructions in the expected output.
define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_X0YC:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
; X32-NEXT:    retl
;
; X64-LABEL: shuf_X0YC:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
  ret <4 x float> %vecinit5
}

; Integer version of shuf_XYZ0: lanes 0-2 of %x with lane 3 zeroed; expects
; pxor + pblendw.
define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XYZ0:
; X32:       ## BB#0:
; X32-NEXT:    pxor %xmm1, %xmm1
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_XYZ0:
; X64:       ## BB#0:
; X64-NEXT:    pxor %xmm1, %xmm1
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecext3 = extractelement <4 x i32> %x, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2
  %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
  ret <4 x i32> %vecinit5
}

; Integer version of shuf_XY00: lanes 0-1 of %x with the upper lanes zeroed;
; expects a single movq.
define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XY00:
; X32:       ## BB#0:
; X32-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_XY00:
; X64:       ## BB#0:
; X64-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
  ret <4 x i32> %vecinit4
}

; Integer version of shuf_XYY0: lane 1 duplicated into lane 2, lane 3 zeroed.
; Expects pshufd for the duplication, then pxor + pblendw for the zero lane.
define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XYY0:
; X32:       ## BB#0:
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
; X32-NEXT:    pxor %xmm0, %xmm0
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_XYY0:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
; X64-NEXT:    pxor %xmm0, %xmm0
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext1, i32 2
  %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
  ret <4 x i32> %vecinit5
}

; Integer version of shuf_XYW0: lane 3 moved into lane 2, lane 3 zeroed.
; Expects pshufd, then pxor + pblendw.
define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XYW0:
; X32:       ## BB#0:
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,3,3]
; X32-NEXT:    pxor %xmm0, %xmm0
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_XYW0:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,3,3]
; X64-NEXT:    pxor %xmm0, %xmm0
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecext2 = extractelement <4 x i32> %x, i32 3
  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %vecext2, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
  ret <4 x i32> %vecinit4
}

; Integer version of shuf_W00W: lane 3 of %x in lanes 0 and 3, middle lanes
; zeroed. Expects pshufd, then pxor + pblendw.
define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_W00W:
; X32:       ## BB#0:
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; X32-NEXT:    pxor %xmm0, %xmm0
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_W00W:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; X64-NEXT:    pxor %xmm0, %xmm0
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 3
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 %vecext, i32 3
  ret <4 x i32> %vecinit4
}

; Integer version of shuf_X00A: lane 0 of %x, two zero lanes, lane 0 of %a in
; lane 3. Expects a zeroing blend (pxor + pblendw) followed by pshufd + pblendw
; to bring in %a[0].
define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_X00A:
; X32:       ## BB#0:
; X32-NEXT:    pxor %xmm2, %xmm2
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_X00A:
; X64:       ## BB#0:
; X64-NEXT:    pxor %xmm2, %xmm2
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
  %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit4
}

; Integer version of shuf_X00X: lane 0 of %x in lanes 0 and 3, middle lanes
; zeroed. Expects pxor, pshufd and one pblendw.
define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_X00X:
; X32:       ## BB#0:
; X32-NEXT:    pxor %xmm1, %xmm1
; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_X00X:
; X64:       ## BB#0:
; X64-NEXT:    pxor %xmm1, %xmm1
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
  %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit4
}

; Integer version of shuf_X0YC: lane 0 of %x, zero, lane 1 of %x, lane 2 of %a.
; The X,0,Y,0 part is expected as pmovzxdq; %a[2] is then moved into lane 3
; with pshufd + pblendw.
define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_X0YC:
; X32:       ## BB#0:
; X32-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_X0YC:
; X64:       ## BB#0:
; X64-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
  ret <4 x i32> %vecinit5
}

;; Test for a bug in the first implementation of LowerBuildVectorv4x32
; Builds X,Y,Z,0 from %x and then selects, per lane, %x where the built vector
; is less than %x and the built vector otherwise. %x is still needed after the
; build, so the blend result goes to xmm1 and the fcmp/select pair is expected
; to become maxps.
define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
; X32-LABEL: test_insertps_no_undef:
; X32:       ## BB#0:
; X32-NEXT:    xorps %xmm1, %xmm1
; X32-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
; X32-NEXT:    maxps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_insertps_no_undef:
; X64:       ## BB#0:
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
; X64-NEXT:    maxps %xmm1, %xmm0
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecext3 = extractelement <4 x float> %x, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
  %mask = fcmp olt <4 x float> %vecinit5, %x
  %res = select  <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5
  ret <4 x float> %res
}

; Vector select on an <8 x i1> mask over <8 x i16> operands. The mask arrives
; in xmm0 and is widened to full-lane masks with psllw/psraw by 15, after which
; pblendvb does the selection.
define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
; X32-LABEL: blendvb_fallback:
; X32:       ## BB#0:
; X32-NEXT:    psllw $15, %xmm0
; X32-NEXT:    psraw $15, %xmm0
; X32-NEXT:    pblendvb %xmm1, %xmm2
; X32-NEXT:    movdqa %xmm2, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: blendvb_fallback:
; X64:       ## BB#0:
; X64-NEXT:    psllw $15, %xmm0
; X64-NEXT:    psraw $15, %xmm0
; X64-NEXT:    pblendvb %xmm1, %xmm2
; X64-NEXT:    movdqa %xmm2, %xmm0
; X64-NEXT:    retq
  %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
  ret <8 x i16> %ret
}

; On X32, account for the argument's move to registers
; insertps intrinsic with a loaded source vector and immediate 48 (CountS=0,
; CountD=3): the load should fold into insertps with the immediate unchanged.
define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X32-LABEL: insertps_from_vector_load:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps    $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_vector_load:
; X64:       ## BB#0:
; X64-NEXT:    insertps    $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT:    retq
  %1 = load <4 x float>, <4 x float>* %pb, align 16
  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
  ret <4 x float> %2
}

;; Use a non-zero CountS for insertps
;; Try to match a bit more of the instr, since we need the load's offset.
; Immediate 96 encodes CountS=1, CountD=2. When the vector load is folded, the
; source-lane selection becomes a 4-byte displacement on the memory operand and
; the emitted immediate keeps only CountD (32).
define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X32-LABEL: insertps_from_vector_load_offset:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps    $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_vector_load_offset:
; X64:       ## BB#0:
; X64-NEXT:    insertps    $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X64-NEXT:    retq
  %1 = load <4 x float>, <4 x float>* %pb, align 16
  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
  ret <4 x float> %2
}

823;; Try to match a bit more of the instr, since we need the load's offset.
; Same pattern as the fixed-address case, but the vector is loaded from
; %pb[%index].  The IR immediate is 192; the expected instruction carries
; immediate $0 with a 12-byte displacement on a base+index address and writes
; mem[0] into lane 0.  The index is scaled beforehand with a shift left by 4
; (shll on X32, shlq on X64).
824define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
825; X32-LABEL: insertps_from_vector_load_offset_2:
826; X32:       ## BB#0:
827; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
828; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
829; X32-NEXT:    shll $4, %ecx
830; X32-NEXT:    insertps    $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
831; X32-NEXT:    retl
832;
833; X64-LABEL: insertps_from_vector_load_offset_2:
834; X64:       ## BB#0:
835; X64-NEXT:    shlq $4, %rsi
836; X64-NEXT:    insertps    $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
837; X64-NEXT:    retq
838  %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
839  %2 = load <4 x float>, <4 x float>* %1, align 16
840  %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
841  ret <4 x float> %3
842}
843
; A scalar float loaded from %fb[%index] is splatted into all four lanes with
; insertelement and then fed to the insertps intrinsic (immediate 48).
; Expected code: no separate splat, just one insertps with a memory operand
; that writes mem[0] into lane 3 of xmm0.
844define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
845; X32-LABEL: insertps_from_broadcast_loadf32:
846; X32:       ## BB#0:
847; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
848; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
849; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
850; X32-NEXT:    retl
851;
852; X64-LABEL: insertps_from_broadcast_loadf32:
853; X64:       ## BB#0:
854; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
855; X64-NEXT:    retq
856  %1 = getelementptr inbounds float, float* %fb, i64 %index
857  %2 = load float, float* %1, align 4
858  %3 = insertelement <4 x float> undef, float %2, i32 0
859  %4 = insertelement <4 x float> %3, float %2, i32 1
860  %5 = insertelement <4 x float> %4, float %2, i32 2
861  %6 = insertelement <4 x float> %5, float %2, i32 3
862  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
863  ret <4 x float> %7
864}
865
; Variant of the broadcast test where the splatted scalar is element 0 of a
; <4 x float> load (align 4) instead of a scalar load.
; Expected code: a single insertps with a memory operand that writes mem[0]
; into lane 3 of xmm0.
866define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
867; X32-LABEL: insertps_from_broadcast_loadv4f32:
868; X32:       ## BB#0:
869; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
870; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
871; X32-NEXT:    retl
872;
873; X64-LABEL: insertps_from_broadcast_loadv4f32:
874; X64:       ## BB#0:
875; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
876; X64-NEXT:    retq
877  %1 = load <4 x float>, <4 x float>* %b, align 4
878  %2 = extractelement <4 x float> %1, i32 0
879  %3 = insertelement <4 x float> undef, float %2, i32 0
880  %4 = insertelement <4 x float> %3, float %2, i32 1
881  %5 = insertelement <4 x float> %4, float %2, i32 2
882  %6 = insertelement <4 x float> %5, float %2, i32 3
883  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
884  ret <4 x float> %7
885}
886
; The splat of one loaded float is consumed by four insertps calls (one per
; vector argument %a..%d), whose results are then summed with three fadds.
; Expected code: the scalar is loaded once with movss into xmm4 and each
; insertps uses the register form (xmm4[0] into lane 3) rather than a memory
; operand, followed by three addps.
887define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
888; X32-LABEL: insertps_from_broadcast_multiple_use:
889; X32:       ## BB#0:
890; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
891; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
892; X32-NEXT:    movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
893; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
894; X32-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
895; X32-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
896; X32-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
897; X32-NEXT:    addps %xmm1, %xmm0
898; X32-NEXT:    addps %xmm2, %xmm3
899; X32-NEXT:    addps %xmm3, %xmm0
900; X32-NEXT:    retl
901;
902; X64-LABEL: insertps_from_broadcast_multiple_use:
903; X64:       ## BB#0:
904; X64-NEXT:    movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
905; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
906; X64-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
907; X64-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
908; X64-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
909; X64-NEXT:    addps %xmm1, %xmm0
910; X64-NEXT:    addps %xmm2, %xmm3
911; X64-NEXT:    addps %xmm3, %xmm0
912; X64-NEXT:    retq
913  %1 = getelementptr inbounds float, float* %fb, i64 %index
914  %2 = load float, float* %1, align 4
915  %3 = insertelement <4 x float> undef, float %2, i32 0
916  %4 = insertelement <4 x float> %3, float %2, i32 1
917  %5 = insertelement <4 x float> %4, float %2, i32 2
918  %6 = insertelement <4 x float> %5, float %2, i32 3
919  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
920  %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
921  %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
922  %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
923  %11 = fadd <4 x float> %7, %8
924  %12 = fadd <4 x float> %9, %10
925  %13 = fadd <4 x float> %11, %12
926  ret <4 x float> %13
927}
928
; Shuffles %a with a vector whose only defined lane is a loaded float, using
; mask <4, undef, 0, 7> (lanes 1 and 3 of the result are undefined).
; Expected code: movss load into xmm1, unpcklpd combining it with xmm0, then a
; movapd back into xmm0; no insertps is expected here.
929define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) {
930; X32-LABEL: insertps_with_undefs:
931; X32:       ## BB#0:
932; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
933; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
934; X32-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
935; X32-NEXT:    movapd %xmm1, %xmm0
936; X32-NEXT:    retl
937;
938; X64-LABEL: insertps_with_undefs:
939; X64:       ## BB#0:
940; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
941; X64-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
942; X64-NEXT:    movapd %xmm1, %xmm0
943; X64-NEXT:    retq
944  %1 = load float, float* %b, align 4
945  %2 = insertelement <4 x float> undef, float %1, i32 0
946  %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 4, i32 undef, i32 0, i32 7>
947  ret <4 x float> %result
948}
949
950; Test for a bug in X86ISelLowering.cpp:getINSERTPS where we were using
951; the destination index to change the load, instead of the source index.
; The shuffle mask <4, undef, 6, 2> takes lanes 0 and 2 from %a and places
; element 2 of the loaded vector into lane 3.  Expected code: one insertps with
; a folded memory operand that keeps xmm0[0] and xmm0[2], zeroes lane 1, and
; writes the memory element into lane 3.
952define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
953; X32-LABEL: pr20087:
954; X32:       ## BB#0:
955; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
956; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
957; X32-NEXT:    retl
958;
959; X64-LABEL: pr20087:
960; X64:       ## BB#0:
961; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
962; X64-NEXT:    retq
963  %load = load <4 x float> , <4 x float> *%ptr
964  %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2>
965  ret <4 x float> %ret
966}
967
968; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1>
; The shuffled <4 x i32> is stored through %RET with align 4, so the expected
; store is the unaligned movdqu.  Expected code: pshufd on xmm1, a pblendw
; merging xmm0 and xmm1, then the store; no insertps is emitted.
; The function uses attribute group #1, which is defined outside this part of
; the file.
969define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32* noalias nocapture %RET) #1 {
970; X32-LABEL: insertps_pr20411:
971; X32:       ## BB#0:
972; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
973; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
974; X32-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
975; X32-NEXT:    movdqu %xmm1, (%eax)
976; X32-NEXT:    retl
977;
978; X64-LABEL: insertps_pr20411:
979; X64:       ## BB#0:
980; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
981; X64-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
982; X64-NEXT:    movdqu %xmm1, (%rdi)
983; X64-NEXT:    retq
984  %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 0, i32 7, i32 undef, i32 undef>
985  %ptrcast = bitcast i32* %RET to <4 x i32>*
986  store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4
987  ret void
988}
989
; Builds the vector <A[0], 0.0, B[2], 0.0> with extractelement/insertelement.
; Expected code: a single insertps producing xmm0[0],zero,xmm1[2],zero.
990define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) {
991; X32-LABEL: insertps_4:
992; X32:       ## BB#0: ## %entry
993; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
994; X32-NEXT:    retl
995;
996; X64-LABEL: insertps_4:
997; X64:       ## BB#0: ## %entry
998; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
999; X64-NEXT:    retq
1000entry:
1001  %vecext = extractelement <4 x float> %A, i32 0
1002  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
1003  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
1004  %vecext2 = extractelement <4 x float> %B, i32 2
1005  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
1006  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
1007  ret <4 x float> %vecinit4
1008}
1009
; Builds the vector <A[0], B[1], 0.0, 0.0> with extractelement/insertelement.
; Expected code: a single insertps producing xmm0[0],xmm1[1],zero,zero.
1010define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) {
1011; X32-LABEL: insertps_5:
1012; X32:       ## BB#0: ## %entry
1013; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
1014; X32-NEXT:    retl
1015;
1016; X64-LABEL: insertps_5:
1017; X64:       ## BB#0: ## %entry
1018; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
1019; X64-NEXT:    retq
1020entry:
1021  %vecext = extractelement <4 x float> %A, i32 0
1022  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
1023  %vecext1 = extractelement <4 x float> %B, i32 1
1024  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
1025  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
1026  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
1027  ret <4 x float> %vecinit4
1028}
1029
; Builds the vector <0.0, A[1], B[2], 0.0>, starting from a constant whose
; lane 0 is zero and whose other lanes are undef.
; Expected code: a single insertps producing zero,xmm0[1],xmm1[2],zero.
1030define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) {
1031; X32-LABEL: insertps_6:
1032; X32:       ## BB#0: ## %entry
1033; X32-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
1034; X32-NEXT:    retl
1035;
1036; X64-LABEL: insertps_6:
1037; X64:       ## BB#0: ## %entry
1038; X64-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
1039; X64-NEXT:    retq
1040entry:
1041  %vecext = extractelement <4 x float> %A, i32 1
1042  %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
1043  %vecext1 = extractelement <4 x float> %B, i32 2
1044  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
1045  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
1046  ret <4 x float> %vecinit3
1047}
1048
; Builds the vector <A[0], 0.0, B[1], 0.0>; unlike insertps_4 the element taken
; from %B (index 1) lands in a different lane (index 2).
; Expected code: a single insertps producing xmm0[0],zero,xmm1[1],zero.
1049define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) {
1050; X32-LABEL: insertps_7:
1051; X32:       ## BB#0: ## %entry
1052; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
1053; X32-NEXT:    retl
1054;
1055; X64-LABEL: insertps_7:
1056; X64:       ## BB#0: ## %entry
1057; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
1058; X64-NEXT:    retq
1059entry:
1060  %vecext = extractelement <4 x float> %A, i32 0
1061  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
1062  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
1063  %vecext2 = extractelement <4 x float> %B, i32 1
1064  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
1065  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
1066  ret <4 x float> %vecinit4
1067}
1068
; Builds the vector <A[0], B[0], 0.0, 0.0> with extractelement/insertelement.
; Expected code: a single insertps producing xmm0[0],xmm1[0],zero,zero.
1069define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) {
1070; X32-LABEL: insertps_8:
1071; X32:       ## BB#0: ## %entry
1072; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
1073; X32-NEXT:    retl
1074;
1075; X64-LABEL: insertps_8:
1076; X64:       ## BB#0: ## %entry
1077; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
1078; X64-NEXT:    retq
1079entry:
1080  %vecext = extractelement <4 x float> %A, i32 0
1081  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
1082  %vecext1 = extractelement <4 x float> %B, i32 0
1083  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
1084  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
1085  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
1086  ret <4 x float> %vecinit4
1087}
1088
; Builds the vector <0.0, A[0], B[2], 0.0>.
; Expected code: the insertps targets xmm1 (which holds %B) and produces
; zero,xmm0[0],xmm1[2],zero; a movaps then copies the result into xmm0.
1089define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) {
1090; X32-LABEL: insertps_9:
1091; X32:       ## BB#0: ## %entry
1092; X32-NEXT:    insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
1093; X32-NEXT:    movaps %xmm1, %xmm0
1094; X32-NEXT:    retl
1095;
1096; X64-LABEL: insertps_9:
1097; X64:       ## BB#0: ## %entry
1098; X64-NEXT:    insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
1099; X64-NEXT:    movaps %xmm1, %xmm0
1100; X64-NEXT:    retq
1101entry:
1102  %vecext = extractelement <4 x float> %A, i32 0
1103  %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
1104  %vecext1 = extractelement <4 x float> %B, i32 2
1105  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
1106  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
1107  ret <4 x float> %vecinit3
1108}
1109
; Inserts A[0] into lanes 0 and 2 of an all-zero vector, giving
; <A[0], 0.0, A[0], 0.0> from a single source register.
; Expected code: a single insertps producing xmm0[0],zero,xmm0[0],zero.
; Note: the opening brace of the body sits after the CHECK lines.
1110define <4 x float> @insertps_10(<4 x float> %A)
1111; X32-LABEL: insertps_10:
1112; X32:       ## BB#0:
1113; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero
1114; X32-NEXT:    retl
1115;
1116; X64-LABEL: insertps_10:
1117; X64:       ## BB#0:
1118; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero
1119; X64-NEXT:    retq
1120{
1121  %vecext = extractelement <4 x float> %A, i32 0
1122  %vecbuild1 = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %vecext, i32 0
1123  %vecbuild2 = insertelement <4 x float> %vecbuild1, float %vecext, i32 2
1124  ret <4 x float> %vecbuild2
1125}
1126
; Builds <0.0, A[1], 0.0, A[3]>: A[1] is inserted into a zero vector, lane 2 is
; set to 0.0 again, and a shufflevector (mask <0, 1, 2, 7>) pulls lane 3 from %A.
; Expected code: xorps to materialize zero in xmm1, then one blendps producing
; xmm1[0],xmm0[1],xmm1[2],xmm0[3].
1127define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) {
1128; X32-LABEL: build_vector_to_shuffle_1:
1129; X32:       ## BB#0: ## %entry
1130; X32-NEXT:    xorps %xmm1, %xmm1
1131; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
1132; X32-NEXT:    retl
1133;
1134; X64-LABEL: build_vector_to_shuffle_1:
1135; X64:       ## BB#0: ## %entry
1136; X64-NEXT:    xorps %xmm1, %xmm1
1137; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
1138; X64-NEXT:    retq
1139entry:
1140  %vecext = extractelement <4 x float> %A, i32 1
1141  %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
1142  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
1143  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1144  ret <4 x float> %vecinit3
1145}
1146
; Builds <0.0, A[1], 0.0, 0.0>: A[1] is inserted into a zero vector and lane 2
; is set to 0.0 again.
; Expected code: xorps to materialize zero in xmm1, then one blendps producing
; xmm1[0],xmm0[1],xmm1[2,3].
1147define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) {
1148; X32-LABEL: build_vector_to_shuffle_2:
1149; X32:       ## BB#0: ## %entry
1150; X32-NEXT:    xorps %xmm1, %xmm1
1151; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1152; X32-NEXT:    retl
1153;
1154; X64-LABEL: build_vector_to_shuffle_2:
1155; X64:       ## BB#0: ## %entry
1156; X64-NEXT:    xorps %xmm1, %xmm1
1157; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1158; X64-NEXT:    retq
1159entry:
1160  %vecext = extractelement <4 x float> %A, i32 1
1161  %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
1162  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
1163  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
1164}
1165