1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
3; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
4; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
5; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
6; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
7;
8; Verify that the DAG combiner correctly folds bitwise operations across
9; shuffles, nested shuffles with undef, pairs of nested shuffles, and other
10; basic and always-safe patterns. Also test that the DAG combiner will combine
11; target-specific shuffle instructions where reasonable.
12
13target triple = "x86_64-unknown-unknown"
14
15declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
16declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
17declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
18
; Two pshufd with imm 27 (0x1B = reverse the four dwords): applying the
; reversal twice is the identity, so both shuffles must fold away entirely.
define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
; ALL-LABEL: combine_pshufd1:
; ALL:       # BB#0: # %entry
; ALL-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27)
  ret <4 x i32> %c
}

; pshufd(27) / pshuflw(-28 = 0xE4, the identity word order) / pshufd(27):
; the word shuffle is a no-op and the two dword reversals cancel, so the
; whole chain (including the bitcasts) must fold to nothing.
define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
; ALL-LABEL: combine_pshufd2:
; ALL:       # BB#0: # %entry
; ALL-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
  ret <4 x i32> %d
}

; Same as combine_pshufd2 but with an identity pshufhw in the middle;
; everything must still fold away.
define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
; ALL-LABEL: combine_pshufd3:
; ALL:       # BB#0: # %entry
; ALL-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
  ret <4 x i32> %d
}

; pshufd(-31 = 0xE1 = swap dwords 0 and 1) sandwiching a pshufhw(27): the
; two dword swaps only touch the low half and cancel each other, leaving a
; single pshufhw that reverses the high four words.
define <4 x i32> @combine_pshufd4(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd4:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufd4:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31)
  ret <4 x i32> %d
}

; pshufd(-76 = 0xB4 = swap dwords 2 and 3) sandwiching a pshuflw(27): the
; high-dword swaps cancel, leaving a single pshuflw that reverses the low
; four words.
define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd5:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufd5:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76)
  ret <4 x i32> %d
}

; pshufd(0) splats dword 0; any further pshufd of a splat is still the same
; splat, so the pair folds to a single splat shuffle (a broadcast on AVX2).
define <4 x i32> @combine_pshufd6(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd6:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_pshufd6:
; AVX1:       # BB#0: # %entry
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_pshufd6:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
; AVX2-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0)
  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8)
  ret <4 x i32> %c
}
113
; pshuflw(27) reverses the low four words; applying it twice is the
; identity, so both shuffles must fold away.
define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
; ALL-LABEL: combine_pshuflw1:
; ALL:       # BB#0: # %entry
; ALL-NEXT:    retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
  ret <8 x i16> %c
}

; pshuflw(27) / pshufhw(-28 = identity) / pshuflw(27): the identity high
; shuffle contributes nothing and the low reversals cancel, so the whole
; chain folds away.
define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
; ALL-LABEL: combine_pshuflw2:
; ALL:       # BB#0: # %entry
; ALL-NEXT:    retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
  ret <8 x i16> %d
}

; The two pshuflw(27) reversals cancel; only the pshufhw reversing the high
; four words should remain.
define <8 x i16> @combine_pshuflw3(<8 x i16> %a) {
; SSE-LABEL: combine_pshuflw3:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshuflw3:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX-NEXT:    retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27)
  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
  ret <8 x i16> %d
}

; Mirror of combine_pshuflw3: the two pshufhw(27) reversals cancel; only the
; pshuflw reversing the low four words should remain.
define <8 x i16> @combine_pshufhw1(<8 x i16> %a) {
; SSE-LABEL: combine_pshufhw1:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufhw1:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX-NEXT:    retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
  %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27)
  ret <8 x i16> %d
}
168
; (and (shuffle %a, M), (shuffle %b, M)) with identical single-input swizzle
; masks: the AND should be hoisted above a single shuffle, i.e.
; shuffle (and %a, %b), M. The unused %c operand must drop out entirely.
define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test1:
; SSE:       # BB#0:
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test1:
; AVX:       # BB#0:
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

; Same fold as test1 but for OR.
define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test2:
; SSE:       # BB#0:
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test2:
; AVX:       # BB#0:
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

; Same fold as test1 but for XOR.
define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test3:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test3:
; AVX:       # BB#0:
; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}

; As test1, but %a/%b appear as the SECOND shuffle operand (indices 4-7);
; the same hoist must still apply and %c must still drop out.
define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test4:
; SSE:       # BB#0:
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test4:
; AVX:       # BB#0:
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

; As test4 but for OR.
define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test5:
; SSE:       # BB#0:
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test5:
; AVX:       # BB#0:
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

; As test4 but for XOR.
define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test6:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test6:
; AVX:       # BB#0:
; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}
276
277
; Verify that DAGCombiner moves the shuffle after the xor/and/or even if the
; shuffles are not performing swizzle operations.
280
; Two-input blend masks (<0,5,2,7> takes even lanes from the first operand
; and odd lanes from %c): since the %c lanes are common to both shuffles,
; the AND still sinks to a single "and %a, %b" followed by one blend.
define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test1b:
; SSE2:       # BB#0:
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test1b:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test1b:
; SSE41:       # BB#0:
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_bitwise_ops_test1b:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_bitwise_ops_test1b:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

; As test1b but for OR.
define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test2b:
; SSE2:       # BB#0:
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test2b:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    por %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test2b:
; SSE41:       # BB#0:
; SSE41-NEXT:    por %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_bitwise_ops_test2b:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_bitwise_ops_test2b:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

; XOR variant: the shared %c lanes xor with themselves to zero, so the
; result is (xor %a, %b) with the odd lanes forced to zero (constant-mask
; andps on SSE2/SSSE3, blend with a zero register on SSE4.1/AVX).
define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test3b:
; SSE2:       # BB#0:
; SSE2-NEXT:    xorps %xmm1, %xmm0
; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test3b:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    xorps %xmm1, %xmm0
; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test3b:
; SSE41:       # BB#0:
; SSE41-NEXT:    pxor %xmm1, %xmm0
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_bitwise_ops_test3b:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_bitwise_ops_test3b:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}

; As test1b but with the operand roles reversed (%c supplies the even lanes,
; %a/%b the odd ones); the AND still sinks below a single blend.
define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test4b:
; SSE2:       # BB#0:
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test4b:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test4b:
; SSE41:       # BB#0:
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_bitwise_ops_test4b:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_bitwise_ops_test4b:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX2-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

; As test4b but for OR.
define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test5b:
; SSE2:       # BB#0:
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test5b:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    por %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test5b:
; SSE41:       # BB#0:
; SSE41-NEXT:    por %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_bitwise_ops_test5b:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_bitwise_ops_test5b:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX2-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

; XOR variant of test4b: the shared %c lanes become zero, so the even lanes
; are zeroed (constant-mask andps on SSE2/SSSE3, blend with zero otherwise).
define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test6b:
; SSE2:       # BB#0:
; SSE2-NEXT:    xorps %xmm1, %xmm0
; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test6b:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    xorps %xmm1, %xmm0
; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test6b:
; SSE41:       # BB#0:
; SSE41-NEXT:    pxor %xmm1, %xmm0
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_bitwise_ops_test6b:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_bitwise_ops_test6b:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX2-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}
518
; Mask <0,2,5,7> (low half from the first operand, odd lanes of %c in the
; high half): the AND still sinks to a single "and %a, %b", followed by the
; lane rearrangement (unpack/shuffle on SSE2/SSSE3, blend+pshufd on newer).
define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test1c:
; SSE2:       # BB#0:
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test1c:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test1c:
; SSE41:       # BB#0:
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_bitwise_ops_test1c:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_bitwise_ops_test1c:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

; As test1c but for OR.
define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test2c:
; SSE2:       # BB#0:
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test2c:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    por %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test2c:
; SSE41:       # BB#0:
; SSE41-NEXT:    por %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_bitwise_ops_test2c:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_bitwise_ops_test2c:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

; XOR variant of test1c: the shared %c lanes collapse to zero, so the upper
; qword of the result is zero (zero-extending movq on SSE4.1/AVX).
define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test3c:
; SSE2:       # BB#0:
; SSE2-NEXT:    pxor %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test3c:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pxor %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
; SSSE3-NEXT:    pxor %xmm1, %xmm1
; SSSE3-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test3c:
; SSE41:       # BB#0:
; SSE41-NEXT:    pxor %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE41-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test3c:
; AVX:       # BB#0:
; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}

; As test1c but with the operand roles reversed (%c supplies the low half,
; %a/%b the high half); the AND still sinks below the shuffle.
define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test4c:
; SSE2:       # BB#0:
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test4c:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test4c:
; SSE41:       # BB#0:
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_bitwise_ops_test4c:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_bitwise_ops_test4c:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

; As test4c but for OR.
define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test5c:
; SSE2:       # BB#0:
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test5c:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    por %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test5c:
; SSE41:       # BB#0:
; SSE41-NEXT:    por %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_bitwise_ops_test5c:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_bitwise_ops_test5c:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

; XOR variant of test4c: the shared %c lanes collapse to zero, so the low
; qword of the result is zero and the fold uses a zero register plus a
; qword-level combine/blend.
define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test6c:
; SSE2:       # BB#0:
; SSE2-NEXT:    pxor %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test6c:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pxor %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSSE3-NEXT:    pxor %xmm0, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test6c:
; SSE41:       # BB#0:
; SSE41-NEXT:    pxor %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_bitwise_ops_test6c:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_bitwise_ops_test6c:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}
772
; Nested shuffles where the outer mask only reads %A elements of the inner
; result (the outer index 4 refers to the undef operand): the pair must fold
; to a single pshufd of %A, and %B must drop out entirely.
define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test1:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test1:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}

; As test1 with a different inner mask (%B's lane 5 in position 1, which the
; outer mask turns into undef): still folds to one pshufd of %A.
define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test2:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test2:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}

; Same shape as test2, using %B's lane 6 instead; the fold is identical.
define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test3:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test3:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}

; Only %A lanes 0 and 1 survive the outer shuffle, so the pair folds to a
; low-qword splat: pshufd [0,1,0,1], or vpbroadcastq on AVX2.
define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test4:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test4:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test4:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}
837
; Combines into a single pshufd of %A.
define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test5:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test5:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
  ret <4 x i32> %2
}

; Combines into a single pshufd of %A.
define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test6:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test6:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
  ret <4 x i32> %2
}

; Combines into a single pshufd of %A.
define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test7:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test7:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
  ret <4 x i32> %2
}

; Combines into a single pshufd of %A.
define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test8:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test8:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
  ret <4 x i32> %2
}
897
; Combines into a single pshufd of %A.
define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test9:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test9:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
  ret <4 x i32> %2
}

; Combines into a single pshufd of %A.
define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test10:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test10:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
  ret <4 x i32> %2
}

; Combines into a single pshufd of %A.
define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test11:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test11:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
  ret <4 x i32> %2
}

; Combines into a splat of the low 64 bits of %A (pshufd on SSE/AVX1,
; vpbroadcastq on AVX2).
define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test12:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test12:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test12:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
  ret <4 x i32> %2
}
962
; The following pair of shuffles is folded into vector %A.
; No instructions are expected besides the return.
define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) {
; ALL-LABEL: combine_nested_undef_test13:
; ALL:       # BB#0:
; ALL-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
  ret <4 x i32> %2
}

; The following pair of shuffles is folded into vector %B.
; Only a register-to-register move is expected.
define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test14:
; SSE:       # BB#0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test14:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
  ret <4 x i32> %2
}
988
989
990; Verify that we don't optimize the following cases. We expect more than one shuffle.
991;
; FIXME: Many of these already don't make sense, and the rest should stop
; making sense with the new vector shuffle lowering. Revisit whether it is
; still worth testing for them.
995
; Not folded into a single shuffle: both inputs stay live, so codegen needs
; two shuffles (SSE2/SSSE3) or a shuffle-plus-blend (SSE4.1/AVX).
define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test15:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test15:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test15:
; SSE41:       # BB#0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test15:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test15:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

; Not folded into a single shuffle: both inputs stay live.
define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test16:
; SSE2:       # BB#0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test16:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test16:
; SSE41:       # BB#0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

; Not folded into a single shuffle: both inputs stay live.
define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test17:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test17:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test17:
; SSE41:       # BB#0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test17:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test17:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}
1107
; Here the surviving lanes all come from %B, so this does fold to a single
; pshufd of %B.
define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test18:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test18:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

; Not folded into a single shuffle: both inputs stay live.
define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test19:
; SSE2:       # BB#0:
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test19:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test19:
; SSE41:       # BB#0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test19:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test19:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
  ret <4 x i32> %2
}

; Not folded into a single shuffle: both inputs stay live.
define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test20:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test20:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test20:
; SSE41:       # BB#0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test20:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test20:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

; Not folded into a single shuffle: both inputs stay live.
define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test21:
; SSE2:       # BB#0:
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test21:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test21:
; SSE41:       # BB#0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test21:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test21:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}
1229
1230
; Test that we correctly combine shuffles according to rule
;  shuffle(shuffle(x, y), undef) -> shuffle(y, undef)

; Folds to a single pshufd of %B.
define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test22:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test22:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3>
  ret <4 x i32> %2
}

; Folds to a single pshufd of %B.
define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test23:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test23:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

; Folds to a single pshufd of %B.
define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test24:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test24:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4>
  ret <4 x i32> %2
}

; %1 takes (%B, %A); the surviving lanes come from %A (second operand), and
; the pair folds to a 64-bit splat of %A (vpbroadcastq on AVX2).
define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test25:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test25:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test25:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1>
  ret <4 x i32> %2
}

; Folds to a single pshufd of %A (the second operand of %1).
define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test26:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test26:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
  ret <4 x i32> %2
}

; Folds to a 64-bit splat of %A (vpbroadcastq on AVX2).
define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test27:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test27:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test27:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
  ret <4 x i32> %2
}

; Folds to a single pshufd of %A.
define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test28:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test28:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2>
  ret <4 x i32> %2
}
1348
; The two shuffles select exactly the lanes of %b; only a register move is
; emitted.
define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test1:
; SSE:       # BB#0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test1:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %2
}

; Combines to a single movss (pre-SSE4.1) or blendps: lane 0 from %a,
; lanes 1-3 from %b.
define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test2:
; SSE2:       # BB#0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test2:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test2:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test2:
; AVX:       # BB#0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x float> %2
}

; Combines to a single unpcklpd of %a and %b.
define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test3:
; SSE:       # BB#0:
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test3:
; AVX:       # BB#0:
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x float> %2
}

; Combines to a single unpckhpd with the operands swapped.
define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test4:
; SSE:       # BB#0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test4:
; AVX:       # BB#0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x float> %2
}

; Combines to a single blendps (SSE4.1/AVX) or a shufps pair (SSE2/SSSE3).
define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test5:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test5:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test5:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test5:
; AVX:       # BB#0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x float> %2
}

; Integer version of combine_test1: folds to a register move of %b.
define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test6:
; SSE:       # BB#0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test6:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x i32> %2
}
1463
; Integer version of combine_test2: folds to a single movss or blend.
define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test7:
; SSE2:       # BB#0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test7:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test7:
; SSE41:       # BB#0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_test7:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_test7:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x i32> %2
}

; Integer version of combine_test3: folds to a single punpcklqdq.
define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test8:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test8:
; AVX:       # BB#0:
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x i32> %2
}

; Integer version of combine_test4: folds to a single punpckhqdq with the
; operands swapped.
define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test9:
; SSE:       # BB#0:
; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test9:
; AVX:       # BB#0:
; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i32> %2
}

; Integer version of combine_test5: folds to a single blend (SSE4.1/AVX) or
; a shufps pair (SSE2/SSSE3).
define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test10:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test10:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test10:
; SSE41:       # BB#0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_test10:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_test10:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x i32> %2
}
1558
1559define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) {
1560; ALL-LABEL: combine_test11:
1561; ALL:       # BB#0:
1562; ALL-NEXT:    retq
1563  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1564  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1565  ret <4 x float> %2
1566}
1567
1568define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) {
1569; SSE2-LABEL: combine_test12:
1570; SSE2:       # BB#0:
1571; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1572; SSE2-NEXT:    movaps %xmm1, %xmm0
1573; SSE2-NEXT:    retq
1574;
1575; SSSE3-LABEL: combine_test12:
1576; SSSE3:       # BB#0:
1577; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1578; SSSE3-NEXT:    movaps %xmm1, %xmm0
1579; SSSE3-NEXT:    retq
1580;
1581; SSE41-LABEL: combine_test12:
1582; SSE41:       # BB#0:
1583; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1584; SSE41-NEXT:    retq
1585;
1586; AVX-LABEL: combine_test12:
1587; AVX:       # BB#0:
1588; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1589; AVX-NEXT:    retq
1590  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1591  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
1592  ret <4 x float> %2
1593}
1594
; The shuffle pair composes to <a0,a1,b0,b1>: a single unpcklpd.
define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test13:
; SSE:       # BB#0:
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test13:
; AVX:       # BB#0:
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x float> %2
}
1609
; The shuffle pair composes to <a2,a3,b2,b3>: a single unpckhpd.
define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test14:
; SSE:       # BB#0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test14:
; AVX:       # BB#0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x float> %2
}
1624
; The shuffle pair composes to <b0,a1,b2,b3>: a single blendps with SSE4.1+,
; or two shufps on plain SSE2/SSSE3 (no single-instruction lowering there).
define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test15:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test15:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test15:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test15:
; AVX:       # BB#0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  ret <4 x float> %2
}
1651
; Integer version of combine_test11: the two shuffles compose to the identity
; on %a, so no instruction is emitted.
define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) {
; ALL-LABEL: combine_test16:
; ALL:       # BB#0:
; ALL-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  ret <4 x i32> %2
}
1660
; Integer version of combine_test12: composes to <a0,b1,b2,b3>, a single
; integer blend (pblendw/vpblendd) with SSE4.1+, movss otherwise.
define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test17:
; SSE2:       # BB#0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test17:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test17:
; SSE41:       # BB#0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_test17:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_test17:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x i32> %2
}
1692
; Integer version of combine_test13: composes to <a0,a1,b0,b1>, a single punpcklqdq.
define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test18:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test18:
; AVX:       # BB#0:
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x i32> %2
}
1707
; Integer version of combine_test14: composes to <a2,a3,b2,b3>, a single punpckhqdq.
define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test19:
; SSE:       # BB#0:
; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test19:
; AVX:       # BB#0:
; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i32> %2
}
1722
; Integer version of combine_test15: composes to <b0,a1,b2,b3>, a single
; integer blend with SSE4.1+, two shufps otherwise.
define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test20:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test20:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test20:
; SSE41:       # BB#0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_test20:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_test20:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  ret <4 x i32> %2
}
1754
; Both half-width shuffles of the <8 x i32> pick 64-bit element pairs
; (<0,1,4,5> and <2,3,6,7>), so each lowers to one punpcklqdq/punpckhqdq
; of the vector's two 128-bit halves; one result is stored, one returned.
define <4 x i32> @combine_test21(<8 x i32> %a, <4 x i32>* %ptr) {
; SSE-LABEL: combine_test21:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    movdqa %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_test21:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_test21:
; AVX2:       # BB#0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT:    vmovdqa %xmm2, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  store <4 x i32> %1, <4 x i32>* %ptr, align 16
  ret <4 x i32> %2
}
1786
; Concatenating two loaded <2 x float> values (upper four lanes undef)
; should fold to a 64-bit load plus a movhpd merging the second pair
; straight from memory.
define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) {
; SSE-LABEL: combine_test22:
; SSE:       # BB#0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test22:
; AVX:       # BB#0:
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    retq
; Current AVX2 lowering of this is still awful, not adding a test case.
  %1 = load <2 x float>, <2 x float>* %a, align 8
  %2 = load <2 x float>, <2 x float>* %b, align 8
  %3 = shufflevector <2 x float> %1, <2 x float> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x float> %3
}
1805
; PR22359
; The two adjacent <2 x float> extract+store pairs cover the low 128 bits of
; %v contiguously, so they should merge into one 16-byte store. SSE achieves
; a single movups; the AVX lowering still goes through an insertps sequence.
define void @combine_test23(<8 x float> %v, <2 x float>* %ptr) {
; SSE-LABEL: combine_test23:
; SSE:       # BB#0:
; SSE-NEXT:    movups %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test23:
; AVX:       # BB#0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX-NEXT:    vmovups %xmm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %idx2 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 1
  %shuffle0 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 0, i32 1>
  %shuffle1 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 2, i32 3>
  store <2 x float> %shuffle0, <2 x float>* %ptr, align 8
  store <2 x float> %shuffle1, <2 x float>* %idx2, align 8
  ret void
}
1828
1829; Check some negative cases.
1830; FIXME: Do any of these really make sense? Are they redundant with the above tests?
1831
; The second mask re-selects only %b lanes, so the pair composes to
; <b0,b1,b2,b0>: one permute of %b, %a is dead.
define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test1b:
; SSE:       # BB#0:
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test1b:
; AVX:       # BB#0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0>
  ret <4 x float> %2
}
1847
; Composes to <b0,b1,b0,b1>: a movddup of %b (movlhps on SSE2, which
; lacks movddup for this pattern).
define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test2b:
; SSE2:       # BB#0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test2b:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test2b:
; SSE41:       # BB#0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test2b:
; AVX:       # BB#0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm1[0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5>
  ret <4 x float> %2
}
1873
; Composes to <a0,b3,b2,b3>, which no single x86 shuffle covers, so two
; instructions remain (blend + permute on SSE4.1+).
define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test3b:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test3b:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test3b:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test3b:
; AVX:       # BB#0:
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7>
  ret <4 x float> %2
}
1902
; Composes to <b1,b1,b2,b3>: one permute of %b, %a is dead.
define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test4b:
; SSE:       # BB#0:
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test4b:
; AVX:       # BB#0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7>
  ret <4 x float> %2
}
1918
1919
1920; Verify that we correctly fold shuffles even when we use illegal vector types.
1921
; Illegal <4 x i8> type: after legalization (each element widened to i32)
; the shuffle pair still composes to <A0,B1,B2,B3>, a single blend.
define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test1c:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test1c:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test1c:
; SSE41:       # BB#0:
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_test1c:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_test1c:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT:    retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x i8> %2
}
1971
; Illegal <4 x i8> type: composes to <A0,A1,B0,B1>, a single punpcklqdq
; after the widening loads.
define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test2c:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test2c:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test2c:
; SSE41:       # BB#0:
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test2c:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x i8> %2
}
2014
; Illegal <4 x i8> type: composes to <B2,B3,A2,A3>, a single punpckhqdq
; after the widening loads.
define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test3c:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test3c:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test3c:
; SSE41:       # BB#0:
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test3c:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i8> %2
}
2057
; Illegal <4 x i8> type: composes to <B0,A1,B2,B3>, a single blend with
; SSE4.1+, two shufps otherwise (mirrors combine_test20).
define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test4c:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test4c:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test4c:
; SSE41:       # BB#0:
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_test4c:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_test4c:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2-NEXT:    retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x i8> %2
}
2109
2110
2111; The following test cases are generated from this C++ code
2112;
2113;__m128 blend_01(__m128 a, __m128 b)
2114;{
2115;  __m128 s = a;
2116;  s = _mm_blend_ps( s, b, 1<<0 );
2117;  s = _mm_blend_ps( s, b, 1<<1 );
2118;  return s;
2119;}
2120;
2121;__m128 blend_02(__m128 a, __m128 b)
2122;{
2123;  __m128 s = a;
2124;  s = _mm_blend_ps( s, b, 1<<0 );
2125;  s = _mm_blend_ps( s, b, 1<<2 );
2126;  return s;
2127;}
2128;
2129;__m128 blend_123(__m128 a, __m128 b)
2130;{
2131;  __m128 s = a;
2132;  s = _mm_blend_ps( s, b, 1<<1 );
2133;  s = _mm_blend_ps( s, b, 1<<2 );
2134;  s = _mm_blend_ps( s, b, 1<<3 );
2135;  return s;
2136;}
2137
2138; Ideally, we should collapse the following shuffles into a single one.
2139
; Blend mask bits 0 and 1 combine to <b0,b1,a2,a3>: a single blendpd
; (movsd on pre-SSE4.1).
define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_01:
; SSE2:       # BB#0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_blend_01:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_blend_01:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_blend_01:
; AVX:       # BB#0:
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  ret <4 x float> %shuffle6
}
2164
; Blend mask bits 0 and 2 combine to <b0,a1,b2,a3>: a single blendps with
; SSE4.1+, two shufps otherwise.
define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_02:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_blend_02:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_blend_02:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_blend_02:
; AVX:       # BB#0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3>
  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x float> %shuffle6
}
2193
; Three single-bit blends (bits 1, 2, 3) combine to <a0,b1,b2,b3>: a single
; blendps (movss on pre-SSE4.1).
define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_123:
; SSE2:       # BB#0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_blend_123:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_blend_123:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_blend_123:
; AVX:       # BB#0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
  %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x float> %shuffle12
}
2221
; Composes to <b2,b3,a2,a3>: a single punpckhqdq (the movhlps pattern).
define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_1:
; SSE:       # BB#0:
; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test_movhl_1:
; AVX:       # BB#0:
; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}
2237
; Different intermediate masks, same composition as movhl_1: <b2,b3,a2,a3>,
; a single punpckhqdq.
define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_2:
; SSE:       # BB#0:
; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test_movhl_2:
; AVX:       # BB#0:
; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2>
  ret <4 x i32> %2
}
2253
; Yet another mask pair composing to <b2,b3,a2,a3>: a single punpckhqdq.
define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_3:
; SSE:       # BB#0:
; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test_movhl_3:
; AVX:       # BB#0:
; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2>
  ret <4 x i32> %2
}
2269
2270
2271; Verify that we fold shuffles according to rule:
2272;  (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2)
2273
; The single-input shuffle (second operand undef) folds into the two-input
; one: result is <b0,b1,a2,a3>, a single blendpd (movsd on pre-SSE4.1).
define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test1:
; SSE2:       # BB#0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test1:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test1:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test1:
; AVX:       # BB#0:
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
  ret <4 x float> %2
}
2298
; Folds to <a0,a1,b0,b1>: a single unpcklpd.
define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test2:
; SSE:       # BB#0:
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test2:
; AVX:       # BB#0:
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
  ret <4 x float> %2
}
2313
; Folds to <a0,a1,b0,undef>: still a single unpcklpd (lane 3 is don't-care).
define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test3:
; SSE:       # BB#0:
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test3:
; AVX:       # BB#0:
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x float> %2
}
2328
define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test4:
; SSE:       # BB#0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test4:
; AVX:       # BB#0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  ; The result is <b2, b3, a2, a3>: a high-double unpack with b first.
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x float> %2
}
2344
define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test5:
; SSE2:       # BB#0:
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test5:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSSE3-NEXT:    movapd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test5:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test5:
; AVX:       # BB#0:
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    retq
  ; The result is <a0, a1, b2, b3>: a double-width blend of a's low half
  ; with b's high half.
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
  ret <4 x float> %2
}
2371
2372
2373; Verify that we fold shuffles according to rule:
2374;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
2375
define <4 x float> @combine_undef_input_test6(<4 x float> %a) {
; ALL-LABEL: combine_undef_input_test6:
; ALL:       # BB#0:
; ALL-NEXT:    retq
  ; Both operands of the second shuffle trace back to %a and the combined
  ; mask is the identity, so no instruction should be emitted.
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
  ret <4 x float> %2
}
2384
define <4 x float> @combine_undef_input_test7(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test7:
; SSE2:       # BB#0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test7:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test7:
; SSE41:       # BB#0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test7:
; AVX:       # BB#0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
  ; The result is <a0, a1, a0, a1>: a broadcast of a's low double.
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
  ret <4 x float> %2
}
2409
define <4 x float> @combine_undef_input_test8(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test8:
; SSE2:       # BB#0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test8:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test8:
; SSE41:       # BB#0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test8:
; AVX:       # BB#0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
  ; The result is <a0, a1, a0, undef>, which still matches a low-double
  ; broadcast of a.
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x float> %2
}
2434
define <4 x float> @combine_undef_input_test9(<4 x float> %a) {
; SSE-LABEL: combine_undef_input_test9:
; SSE:       # BB#0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test9:
; AVX:       # BB#0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT:    retq
  ; The result is <a2, a3, a2, a3>: a broadcast of a's high double.
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x float> %2
}
2449
define <4 x float> @combine_undef_input_test10(<4 x float> %a) {
; ALL-LABEL: combine_undef_input_test10:
; ALL:       # BB#0:
; ALL-NEXT:    retq
  ; The combined mask selects <a0, a1, a2, a3>: the identity, so no
  ; instruction should be emitted.
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
  ret <4 x float> %2
}
2458
define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test11:
; SSE2:       # BB#0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test11:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test11:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test11:
; AVX:       # BB#0:
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT:    retq
  ; Same as test1 but with the inner shuffle as the second operand of the
  ; outer shuffle; the result is still <b0, b1, a2, a3>.
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6>
  ret <4 x float> %2
}
2483
define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test12:
; SSE:       # BB#0:
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test12:
; AVX:       # BB#0:
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  ; Commuted form of test2; the result is still <a0, a1, b0, b1>.
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
  ret <4 x float> %2
}
2498
define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test13:
; SSE:       # BB#0:
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test13:
; AVX:       # BB#0:
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  ; Commuted form of test3; the result <a0, a1, b0, undef> still matches a
  ; low-double unpack.
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5>
  ret <4 x float> %2
}
2513
define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test14:
; SSE:       # BB#0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test14:
; AVX:       # BB#0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  ; Commuted form of test4; the result is still <b2, b3, a2, a3>.
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x float> %2
}
2529
define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test15:
; SSE2:       # BB#0:
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test15:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSSE3-NEXT:    movapd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test15:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test15:
; AVX:       # BB#0:
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    retq
  ; Commuted form of test5; the result is still <a0, a1, b2, b3>.
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
  ret <4 x float> %2
}
2556
2557
2558; Verify that shuffles are canonicalized according to rules:
2559;  shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
2560;
; This allows us to trigger the following combine rule:
2562;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
2563;
2564; As a result, all the shuffle pairs in each function below should be
2565; combined into a single legal shuffle operation.
2566
define <4 x float> @combine_undef_input_test16(<4 x float> %a) {
; ALL-LABEL: combine_undef_input_test16:
; ALL:       # BB#0:
; ALL-NEXT:    retq
  ; Canonicalized form of test6; folds away to the identity of %a.
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
  ret <4 x float> %2
}
2575
define <4 x float> @combine_undef_input_test17(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test17:
; SSE2:       # BB#0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test17:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test17:
; SSE41:       # BB#0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test17:
; AVX:       # BB#0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
  ; Canonicalized form of test7; folds to a broadcast of a's low double.
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
  ret <4 x float> %2
}
2600
define <4 x float> @combine_undef_input_test18(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test18:
; SSE2:       # BB#0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test18:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test18:
; SSE41:       # BB#0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test18:
; AVX:       # BB#0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
  ; Canonicalized form of test8; folds to a broadcast of a's low double.
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
  ret <4 x float> %2
}
2625
define <4 x float> @combine_undef_input_test19(<4 x float> %a) {
; SSE-LABEL: combine_undef_input_test19:
; SSE:       # BB#0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test19:
; AVX:       # BB#0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT:    retq
  ; Canonicalized form of test9; folds to a broadcast of a's high double.
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x float> %2
}
2640
define <4 x float> @combine_undef_input_test20(<4 x float> %a) {
; ALL-LABEL: combine_undef_input_test20:
; ALL:       # BB#0:
; ALL-NEXT:    retq
  ; Canonicalized form of test10; folds away to the identity of %a.
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
  ret <4 x float> %2
}
2649
2650; These tests are designed to test the ability to combine away unnecessary
2651; operations feeding into a shuffle. The AVX cases are the important ones as
2652; they leverage operations which cannot be done naturally on the entire vector
2653; and thus are decomposed into multiple smaller operations.
2654
define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) {
; SSE-LABEL: combine_unneeded_subvector1:
; SSE:       # BB#0:
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0]
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_unneeded_subvector1:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_unneeded_subvector1:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT:    retq
  ; The shuffle mask only reads elements 4-7 of the add result, so the
  ; low 128-bit half of the add is dead (the SSE lowering only touches xmm1).
  %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i32> %c
}
2682
define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: combine_unneeded_subvector2:
; SSE:       # BB#0:
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_unneeded_subvector2:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_unneeded_subvector2:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    retq
  ; The mask reads only the high halves of %b (7..4) and of the add result
  ; (15..12), so the low 128-bit half of the add is dead.
  %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
  ret <8 x i32> %d
}
2710
define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps1:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps1:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps1:
; SSE41:       # BB#0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps1:
; AVX:       # BB#0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
; AVX-NEXT:    retq

  ; The shuffle pair selects <b2, a1, a2, a3>: only b2 is inserted into %a,
  ; so with SSE4.1+ this should become a single insertps.
  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 6, i32 2, i32 4>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
  ret <4 x float> %d
}
2740
define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps2:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps2:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps2:
; SSE41:       # BB#0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps2:
; AVX:       # BB#0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
; AVX-NEXT:    retq

  ; The shuffle pair selects <a0, b2, a2, a3>: only b2 is inserted into %a,
  ; so with SSE4.1+ this should become a single insertps.
  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 1, i32 6, i32 7>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
  ret <4 x float> %d
}
2770
define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps3:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps3:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps3:
; SSE41:       # BB#0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps3:
; AVX:       # BB#0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    retq

  ; The shuffle pair selects <a0, a1, b0, a3>: only b0 is inserted into %a,
  ; so with SSE4.1+ this should become a single insertps.
  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3>
  ret <4 x float> %d
}
2798
define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps4:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps4:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps4:
; SSE41:       # BB#0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps4:
; AVX:       # BB#0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT:    retq

  ; The shuffle pair selects <a0, a1, a2, b0>: only b0 is inserted into %a,
  ; so with SSE4.1+ this should become a single insertps.
  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 6, i32 5>
  ret <4 x float> %d
}
2826
; FIXME: Failed to recognize that the VMOVSD has already zeroed the upper element
define void @combine_scalar_load_with_blend_with_zero(double* %a0, <4 x float>* %a1) {
; SSE2-LABEL: combine_scalar_load_with_blend_with_zero:
; SSE2:       # BB#0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT:    movaps %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_scalar_load_with_blend_with_zero:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT:    xorps %xmm1, %xmm1
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT:    movaps %xmm0, (%rsi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_scalar_load_with_blend_with_zero:
; SSE41:       # BB#0:
; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE41-NEXT:    xorpd %xmm1, %xmm1
; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE41-NEXT:    movapd %xmm1, (%rsi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_scalar_load_with_blend_with_zero:
; AVX:       # BB#0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    vmovapd %xmm0, (%rsi)
; AVX-NEXT:    retq
  ; Build <load, 0.0> as a <2 x double>, bitcast to <4 x float>, then blend
  ; lane 2 with an explicit zero; the final blend is redundant because the
  ; scalar load already zeroed the upper lanes (see FIXME above).
  %1 = load double, double* %a0, align 8
  %2 = insertelement <2 x double> undef, double %1, i32 0
  %3 = insertelement <2 x double> %2, double 0.000000e+00, i32 1
  %4 = bitcast <2 x double> %3 to <4 x float>
  %5 = shufflevector <4 x float> %4, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
  store <4 x float> %5, <4 x float>* %a1, align 16
  ret void
}
2870
; Regression test for PR22377: a shuffle whose operands are themselves
; shuffles of the same source must not be miscombined.
define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: PR22377:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3,1,3]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; SSE-NEXT:    addps %xmm0, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: PR22377:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    retq
entry:
  %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
  %s2 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
  %r2 = fadd <4 x float> %s1, %s2
  %s3 = shufflevector <4 x float> %s2, <4 x float> %r2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x float> %s3
}
2895
; Regression test for PR22390: the inner shuffle feeds both operands of the
; fadd, so it must not be folded into the blend that consumes it.
define <4 x float> @PR22390(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: PR22390:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT:    addps %xmm0, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR22390:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSSE3-NEXT:    movaps %xmm0, %xmm2
; SSSE3-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSSE3-NEXT:    addps %xmm0, %xmm2
; SSSE3-NEXT:    movaps %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR22390:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: PR22390:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
entry:
  %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
  %s2 = shufflevector <4 x float> %s1, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  %r2 = fadd <4 x float> %s1, %s2
  ret <4 x float> %r2
}
2934
; Regression test for PR22412: a 256-bit two-input shuffle followed by a
; cross-lane single-input shuffle must be combined correctly.
define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
; SSE2-LABEL: PR22412:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT:    movapd %xmm2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2]
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR22412:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSSE3-NEXT:    movapd %xmm2, %xmm0
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
; SSSE3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2]
; SSSE3-NEXT:    movaps %xmm3, %xmm1
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR22412:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
; SSE41-NEXT:    movapd %xmm0, %xmm1
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[3,2]
; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[3,2]
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm3, %xmm1
; SSE41-NEXT:    retq
;
; AVX1-LABEL: PR22412:
; AVX1:       # BB#0: # %entry
; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR22412:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1]
; AVX2-NEXT:    retq
entry:
  %s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2>
  ret <8 x float> %s2
}
2982