1; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
2; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
3; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
4; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
5; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
6;
7; Verify that the DAG combiner correctly folds bitwise operations across
8; shuffles, nested shuffles with undef, pairs of nested shuffles, and other
9; basic and always-safe patterns. Also test that the DAG combiner will combine
10; target-specific shuffle instructions where reasonable.
11
12target triple = "x86_64-unknown-unknown"
13
; SSE2 shuffle intrinsics called directly by the combine_pshuf* tests in this
; file. Each takes the vector to permute and an i8 immediate selecting lanes.
14declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
15declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
16declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
17
; The same pshufd (immediate 27, i.e. dword order [3,2,1,0]) is applied twice.
; Reversing twice is the identity, so the checks expect no shuffle instruction
; at all, only the return.
18define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
19; ALL-LABEL: combine_pshufd1:
20; ALL:       # BB#0: # %entry
21; ALL-NEXT:    retq
22entry:
23  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
24  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27)
25  ret <4 x i32> %c
26}
27
; Two dword reversals (pshufd, immediate 27) wrapped around a pshuflw whose
; immediate -28 (0xE4) is the identity permutation [0,1,2,3]. The bitcasts only
; reinterpret the vector between <4 x i32> and <8 x i16>. The whole chain is an
; identity, so the checks expect no shuffle instruction.
28define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
29; ALL-LABEL: combine_pshufd2:
30; ALL:       # BB#0: # %entry
31; ALL-NEXT:    retq
32entry:
33  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
34  %b.cast = bitcast <4 x i32> %b to <8 x i16>
35  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28)
36  %c.cast = bitcast <8 x i16> %c to <4 x i32>
37  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
38  ret <4 x i32> %d
39}
40
; Same shape as combine_pshufd2, but the identity shuffle in the middle is a
; pshufhw (immediate -28 = 0xE4) instead of a pshuflw. The two surrounding
; dword reversals cancel, so the checks expect no shuffle instruction.
41define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
42; ALL-LABEL: combine_pshufd3:
43; ALL:       # BB#0: # %entry
44; ALL-NEXT:    retq
45entry:
46  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
47  %b.cast = bitcast <4 x i32> %b to <8 x i16>
48  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28)
49  %c.cast = bitcast <8 x i16> %c to <4 x i32>
50  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
51  ret <4 x i32> %d
52}
53
; pshufd with immediate -31 (0xE1, dword order [1,0,2,3]) only swaps the two
; low dwords, while the pshufhw in between (immediate 27) only reverses the
; four high words. The two pshufd calls therefore cancel around it and the
; checks expect a single pshufhw / vpshufhw with mask [0,1,2,3,7,6,5,4].
54define <4 x i32> @combine_pshufd4(<4 x i32> %a) {
55; SSE-LABEL: combine_pshufd4:
56; SSE:       # BB#0: # %entry
57; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
58; SSE-NEXT:    retq
59;
60; AVX-LABEL: combine_pshufd4:
61; AVX:       # BB#0: # %entry
62; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
63; AVX-NEXT:    retq
64entry:
65  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31)
66  %b.cast = bitcast <4 x i32> %b to <8 x i16>
67  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27)
68  %c.cast = bitcast <8 x i16> %c to <4 x i32>
69  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31)
70  ret <4 x i32> %d
71}
72
; Mirror image of combine_pshufd4: pshufd with immediate -76 (0xB4, dword order
; [0,1,3,2]) only swaps the two high dwords, while the pshuflw in between
; (immediate 27) only reverses the four low words. The pshufd pair cancels and
; the checks expect a single pshuflw / vpshuflw with mask [3,2,1,0,4,5,6,7].
73define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
74; SSE-LABEL: combine_pshufd5:
75; SSE:       # BB#0: # %entry
76; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
77; SSE-NEXT:    retq
78;
79; AVX-LABEL: combine_pshufd5:
80; AVX:       # BB#0: # %entry
81; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
82; AVX-NEXT:    retq
83entry:
84  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76)
85  %b.cast = bitcast <4 x i32> %b to <8 x i16>
86  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27)
87  %c.cast = bitcast <8 x i16> %c to <4 x i32>
88  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76)
89  ret <4 x i32> %d
90}
91
; The first pshufd (immediate 0) splats dword 0 to every lane. The second
; (immediate 8, dword order [0,2,0,0]) permutes a vector whose lanes are all
; equal, so the pair should fold into one splat. The checks expect a single
; pshufd / vpshufd with mask [0,0,0,0].
92define <4 x i32> @combine_pshufd6(<4 x i32> %a) {
93; SSE-LABEL: combine_pshufd6:
94; SSE:       # BB#0: # %entry
95; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
96; SSE-NEXT:    retq
97;
98; AVX-LABEL: combine_pshufd6:
99; AVX:       # BB#0: # %entry
100; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
101; AVX-NEXT:    retq
102entry:
103  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0)
104  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8)
105  ret <4 x i32> %c
106}
107
; pshuflw with immediate 27 reverses the four low words. Applying it twice is
; the identity, so the checks expect no shuffle instruction.
108define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
109; ALL-LABEL: combine_pshuflw1:
110; ALL:       # BB#0: # %entry
111; ALL-NEXT:    retq
112entry:
113  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
114  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
115  ret <8 x i16> %c
116}
117
; Two low-word reversals (pshuflw, immediate 27) around a pshufhw whose
; immediate -28 (0xE4) is the identity permutation. The whole chain is an
; identity, so the checks expect no shuffle instruction.
118define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
119; ALL-LABEL: combine_pshuflw2:
120; ALL:       # BB#0: # %entry
121; ALL-NEXT:    retq
122entry:
123  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
124  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
125  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
126  ret <8 x i16> %d
127}
128
; Two low-word reversals (pshuflw, immediate 27) around a high-word reversal
; (pshufhw, immediate 27). The low-half shuffles cancel each other across the
; high-half shuffle, so the checks expect only a pshufhw / vpshufhw with mask
; [0,1,2,3,7,6,5,4].
129define <8 x i16> @combine_pshuflw3(<8 x i16> %a) {
130; SSE-LABEL: combine_pshuflw3:
131; SSE:       # BB#0: # %entry
132; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
133; SSE-NEXT:    retq
134;
135; AVX-LABEL: combine_pshuflw3:
136; AVX:       # BB#0: # %entry
137; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
138; AVX-NEXT:    retq
139entry:
140  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
141  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27)
142  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
143  ret <8 x i16> %d
144}
145
; Counterpart of combine_pshuflw3: two high-word reversals (pshufhw, immediate
; 27) around a low-word reversal (pshuflw, immediate 27). The high-half
; shuffles cancel, so the checks expect only a pshuflw / vpshuflw with mask
; [3,2,1,0,4,5,6,7].
146define <8 x i16> @combine_pshufhw1(<8 x i16> %a) {
147; SSE-LABEL: combine_pshufhw1:
148; SSE:       # BB#0: # %entry
149; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
150; SSE-NEXT:    retq
151;
152; AVX-LABEL: combine_pshufhw1:
153; AVX:       # BB#0: # %entry
154; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
155; AVX-NEXT:    retq
156entry:
157  %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27)
158  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
159  %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27)
160  ret <8 x i16> %d
161}
162
; AND of two shuffles that use the same mask <0,2,1,3>. Every mask index is
; below 4, so only the first operand of each shuffle (%a and %b) is read and
; %c never contributes. The checks expect the AND to be done first on the
; unshuffled inputs, followed by a single pshufd with mask [0,2,1,3].
163define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
164; SSE-LABEL: combine_bitwise_ops_test1:
165; SSE:       # BB#0:
166; SSE-NEXT:    pand %xmm1, %xmm0
167; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
168; SSE-NEXT:    retq
169;
170; AVX-LABEL: combine_bitwise_ops_test1:
171; AVX:       # BB#0:
172; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
173; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
174; AVX-NEXT:    retq
175  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
176  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
177  %and = and <4 x i32> %shuf1, %shuf2
178  ret <4 x i32> %and
179}
180
; OR variant of combine_bitwise_ops_test1: both shuffles use mask <0,2,1,3>
; and read only %a / %b. The checks expect one por / vpor on the unshuffled
; inputs followed by a single pshufd with mask [0,2,1,3].
181define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
182; SSE-LABEL: combine_bitwise_ops_test2:
183; SSE:       # BB#0:
184; SSE-NEXT:    por %xmm1, %xmm0
185; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
186; SSE-NEXT:    retq
187;
188; AVX-LABEL: combine_bitwise_ops_test2:
189; AVX:       # BB#0:
190; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
191; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
192; AVX-NEXT:    retq
193  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
194  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
195  %or = or <4 x i32> %shuf1, %shuf2
196  ret <4 x i32> %or
197}
198
; XOR variant of combine_bitwise_ops_test1: both shuffles use mask <0,2,1,3>
; and read only %a / %b. The checks expect one pxor / vpxor on the unshuffled
; inputs followed by a single pshufd with mask [0,2,1,3].
199define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
200; SSE-LABEL: combine_bitwise_ops_test3:
201; SSE:       # BB#0:
202; SSE-NEXT:    pxor %xmm1, %xmm0
203; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
204; SSE-NEXT:    retq
205;
206; AVX-LABEL: combine_bitwise_ops_test3:
207; AVX:       # BB#0:
208; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
209; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
210; AVX-NEXT:    retq
211  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
212  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
213  %xor = xor <4 x i32> %shuf1, %shuf2
214  ret <4 x i32> %xor
215}
216
; Like combine_bitwise_ops_test1, but the shared operand %c comes first and
; the mask <4,6,5,7> selects only from the second operand (%a and %b). The
; expected code is the same: one pand / vpand on the unshuffled inputs followed
; by a single pshufd with mask [0,2,1,3].
217define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
218; SSE-LABEL: combine_bitwise_ops_test4:
219; SSE:       # BB#0:
220; SSE-NEXT:    pand %xmm1, %xmm0
221; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
222; SSE-NEXT:    retq
223;
224; AVX-LABEL: combine_bitwise_ops_test4:
225; AVX:       # BB#0:
226; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
227; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
228; AVX-NEXT:    retq
229  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
230  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
231  %and = and <4 x i32> %shuf1, %shuf2
232  ret <4 x i32> %and
233}
234
; OR variant of combine_bitwise_ops_test4: mask <4,6,5,7> selects only from
; the second shuffle operand (%a and %b). The checks expect one por / vpor on
; the unshuffled inputs followed by a single pshufd with mask [0,2,1,3].
235define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
236; SSE-LABEL: combine_bitwise_ops_test5:
237; SSE:       # BB#0:
238; SSE-NEXT:    por %xmm1, %xmm0
239; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
240; SSE-NEXT:    retq
241;
242; AVX-LABEL: combine_bitwise_ops_test5:
243; AVX:       # BB#0:
244; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
245; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
246; AVX-NEXT:    retq
247  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
248  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
249  %or = or <4 x i32> %shuf1, %shuf2
250  ret <4 x i32> %or
251}
252
; XOR variant of combine_bitwise_ops_test4: mask <4,6,5,7> selects only from
; the second shuffle operand (%a and %b). The checks expect one pxor / vpxor on
; the unshuffled inputs followed by a single pshufd with mask [0,2,1,3].
253define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
254; SSE-LABEL: combine_bitwise_ops_test6:
255; SSE:       # BB#0:
256; SSE-NEXT:    pxor %xmm1, %xmm0
257; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
258; SSE-NEXT:    retq
259;
260; AVX-LABEL: combine_bitwise_ops_test6:
261; AVX:       # BB#0:
262; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
263; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
264; AVX-NEXT:    retq
265  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
266  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
267  %xor = xor <4 x i32> %shuf1, %shuf2
268  ret <4 x i32> %xor
269}
270
271
272; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles
273; are not performing a swizzle operation.
274
; AND of two shuffles sharing mask <0,5,2,7>: lanes 0 and 2 come from %a / %b,
; lanes 1 and 3 come from the common operand %c. The checks expect a single
; pand / vpand of %a and %b, then one merge with %c: a blend (pblendw or
; vpblendd) on the SSE4.1 and AVX runs, and a pshufd + pshufd + punpckldq
; sequence on the SSE2 and SSSE3 runs.
275define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
276; SSE2-LABEL: combine_bitwise_ops_test1b:
277; SSE2:       # BB#0:
278; SSE2-NEXT:    pand %xmm1, %xmm0
279; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
280; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
281; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
282; SSE2-NEXT:    retq
283;
284; SSSE3-LABEL: combine_bitwise_ops_test1b:
285; SSSE3:       # BB#0:
286; SSSE3-NEXT:    pand %xmm1, %xmm0
287; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
288; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
289; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
290; SSSE3-NEXT:    retq
291;
292; SSE41-LABEL: combine_bitwise_ops_test1b:
293; SSE41:       # BB#0:
294; SSE41-NEXT:    pand %xmm1, %xmm0
295; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
296; SSE41-NEXT:    retq
297;
298; AVX1-LABEL: combine_bitwise_ops_test1b:
299; AVX1:       # BB#0:
300; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
301; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
302; AVX1-NEXT:    retq
303;
304; AVX2-LABEL: combine_bitwise_ops_test1b:
305; AVX2:       # BB#0:
306; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
307; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
308; AVX2-NEXT:    retq
309  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
310  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
311  %and = and <4 x i32> %shuf1, %shuf2
312  ret <4 x i32> %and
313}
314
; OR variant of combine_bitwise_ops_test1b: mask <0,5,2,7> takes lanes 0 and 2
; from %a / %b and lanes 1 and 3 from the common operand %c. The checks expect
; a single por / vpor of %a and %b followed by one merge with %c (a blend on
; the SSE4.1 and AVX runs, pshufd + pshufd + punpckldq otherwise).
315define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
316; SSE2-LABEL: combine_bitwise_ops_test2b:
317; SSE2:       # BB#0:
318; SSE2-NEXT:    por %xmm1, %xmm0
319; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
320; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
321; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
322; SSE2-NEXT:    retq
323;
324; SSSE3-LABEL: combine_bitwise_ops_test2b:
325; SSSE3:       # BB#0:
326; SSSE3-NEXT:    por %xmm1, %xmm0
327; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
328; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
329; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
330; SSSE3-NEXT:    retq
331;
332; SSE41-LABEL: combine_bitwise_ops_test2b:
333; SSE41:       # BB#0:
334; SSE41-NEXT:    por %xmm1, %xmm0
335; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
336; SSE41-NEXT:    retq
337;
338; AVX1-LABEL: combine_bitwise_ops_test2b:
339; AVX1:       # BB#0:
340; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
341; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
342; AVX1-NEXT:    retq
343;
344; AVX2-LABEL: combine_bitwise_ops_test2b:
345; AVX2:       # BB#0:
346; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
347; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
348; AVX2-NEXT:    retq
349  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
350  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
351  %or = or <4 x i32> %shuf1, %shuf2
352  ret <4 x i32> %or
353}
354
; XOR variant of combine_bitwise_ops_test1b. With mask <0,5,2,7>, lanes 1 and 3
; of both shuffles hold the same %c elements, so their XOR is zero and %c drops
; out of the result. The checks expect one xor of %a and %b with lanes 1 and 3
; then cleared: a blend against a zeroed register on the SSE4.1 and AVX runs,
; and an andps with a RIP-relative memory operand (presumably a constant lane
; mask) on the SSE2 and SSSE3 runs.
355define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
356; SSE2-LABEL: combine_bitwise_ops_test3b:
357; SSE2:       # BB#0:
358; SSE2-NEXT:    xorps %xmm1, %xmm0
359; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
360; SSE2-NEXT:    retq
361;
362; SSSE3-LABEL: combine_bitwise_ops_test3b:
363; SSSE3:       # BB#0:
364; SSSE3-NEXT:    xorps %xmm1, %xmm0
365; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
366; SSSE3-NEXT:    retq
367;
368; SSE41-LABEL: combine_bitwise_ops_test3b:
369; SSE41:       # BB#0:
370; SSE41-NEXT:    pxor %xmm1, %xmm0
371; SSE41-NEXT:    pxor %xmm1, %xmm1
372; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
373; SSE41-NEXT:    retq
374;
375; AVX1-LABEL: combine_bitwise_ops_test3b:
376; AVX1:       # BB#0:
377; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
378; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
379; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
380; AVX1-NEXT:    retq
381;
382; AVX2-LABEL: combine_bitwise_ops_test3b:
383; AVX2:       # BB#0:
384; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
385; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
386; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
387; AVX2-NEXT:    retq
388  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
389  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
390  %xor = xor <4 x i32> %shuf1, %shuf2
391  ret <4 x i32> %xor
392}
393
; Operand-swapped version of combine_bitwise_ops_test1b: the common operand %c
; is the first shuffle input, so mask <0,5,2,7> takes lanes 0 and 2 from %c and
; lanes 1 and 3 from %a / %b. The checks expect a single pand / vpand of %a and
; %b followed by one merge with %c (a blend on the SSE4.1 and AVX runs,
; pshufd + pshufd + punpckldq otherwise).
394define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
395; SSE2-LABEL: combine_bitwise_ops_test4b:
396; SSE2:       # BB#0:
397; SSE2-NEXT:    pand %xmm1, %xmm0
398; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
399; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
400; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
401; SSE2-NEXT:    retq
402;
403; SSSE3-LABEL: combine_bitwise_ops_test4b:
404; SSSE3:       # BB#0:
405; SSSE3-NEXT:    pand %xmm1, %xmm0
406; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
407; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
408; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
409; SSSE3-NEXT:    retq
410;
411; SSE41-LABEL: combine_bitwise_ops_test4b:
412; SSE41:       # BB#0:
413; SSE41-NEXT:    pand %xmm1, %xmm0
414; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
415; SSE41-NEXT:    retq
416;
417; AVX1-LABEL: combine_bitwise_ops_test4b:
418; AVX1:       # BB#0:
419; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
420; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
421; AVX1-NEXT:    retq
422;
423; AVX2-LABEL: combine_bitwise_ops_test4b:
424; AVX2:       # BB#0:
425; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
426; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
427; AVX2-NEXT:    retq
428  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
429  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
430  %and = and <4 x i32> %shuf1, %shuf2
431  ret <4 x i32> %and
432}
433
; OR variant of combine_bitwise_ops_test4b: mask <0,5,2,7> takes lanes 0 and 2
; from the common first operand %c and lanes 1 and 3 from %a / %b. The checks
; expect a single por / vpor of %a and %b followed by one merge with %c (a
; blend on the SSE4.1 and AVX runs, pshufd + pshufd + punpckldq otherwise).
434define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
435; SSE2-LABEL: combine_bitwise_ops_test5b:
436; SSE2:       # BB#0:
437; SSE2-NEXT:    por %xmm1, %xmm0
438; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
439; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
440; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
441; SSE2-NEXT:    retq
442;
443; SSSE3-LABEL: combine_bitwise_ops_test5b:
444; SSSE3:       # BB#0:
445; SSSE3-NEXT:    por %xmm1, %xmm0
446; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
447; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
448; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
449; SSSE3-NEXT:    retq
450;
451; SSE41-LABEL: combine_bitwise_ops_test5b:
452; SSE41:       # BB#0:
453; SSE41-NEXT:    por %xmm1, %xmm0
454; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
455; SSE41-NEXT:    retq
456;
457; AVX1-LABEL: combine_bitwise_ops_test5b:
458; AVX1:       # BB#0:
459; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
460; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
461; AVX1-NEXT:    retq
462;
463; AVX2-LABEL: combine_bitwise_ops_test5b:
464; AVX2:       # BB#0:
465; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
466; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
467; AVX2-NEXT:    retq
468  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
469  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
470  %or = or <4 x i32> %shuf1, %shuf2
471  ret <4 x i32> %or
472}
473
; XOR variant of combine_bitwise_ops_test4b. With %c as the first shuffle
; operand and mask <0,5,2,7>, lanes 0 and 2 of both shuffles hold the same %c
; elements, so their XOR is zero and %c drops out. The checks expect one xor
; of %a and %b with lanes 0 and 2 then cleared: a blend against a zeroed
; register on the SSE4.1 and AVX runs, and an andps with a RIP-relative memory
; operand (presumably a constant lane mask) on the SSE2 and SSSE3 runs.
474define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
475; SSE2-LABEL: combine_bitwise_ops_test6b:
476; SSE2:       # BB#0:
477; SSE2-NEXT:    xorps %xmm1, %xmm0
478; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
479; SSE2-NEXT:    retq
480;
481; SSSE3-LABEL: combine_bitwise_ops_test6b:
482; SSSE3:       # BB#0:
483; SSSE3-NEXT:    xorps %xmm1, %xmm0
484; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
485; SSSE3-NEXT:    retq
486;
487; SSE41-LABEL: combine_bitwise_ops_test6b:
488; SSE41:       # BB#0:
489; SSE41-NEXT:    pxor %xmm1, %xmm0
490; SSE41-NEXT:    pxor %xmm1, %xmm1
491; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
492; SSE41-NEXT:    retq
493;
494; AVX1-LABEL: combine_bitwise_ops_test6b:
495; AVX1:       # BB#0:
496; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
497; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
498; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
499; AVX1-NEXT:    retq
500;
501; AVX2-LABEL: combine_bitwise_ops_test6b:
502; AVX2:       # BB#0:
503; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
504; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
505; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
506; AVX2-NEXT:    retq
507  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
508  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
509  %xor = xor <4 x i32> %shuf1, %shuf2
510  ret <4 x i32> %xor
511}
512
; AND of two shuffles sharing mask <0,2,5,7>: result lanes 0 and 1 are elements
; 0 and 2 of %a / %b, result lanes 2 and 3 are elements 1 and 3 of the common
; operand %c. The checks expect a single pand / vpand of %a and %b, then the
; merge with %c: blend + pshufd [0,2,1,3] on the SSE4.1 and AVX runs, and
; pshufd + pshufd + punpcklqdq on the SSE2 and SSSE3 runs.
513define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
514; SSE2-LABEL: combine_bitwise_ops_test1c:
515; SSE2:       # BB#0:
516; SSE2-NEXT:    pand %xmm1, %xmm0
517; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
518; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
519; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
520; SSE2-NEXT:    retq
521;
522; SSSE3-LABEL: combine_bitwise_ops_test1c:
523; SSSE3:       # BB#0:
524; SSSE3-NEXT:    pand %xmm1, %xmm0
525; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
526; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
527; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
528; SSSE3-NEXT:    retq
529;
530; SSE41-LABEL: combine_bitwise_ops_test1c:
531; SSE41:       # BB#0:
532; SSE41-NEXT:    pand %xmm1, %xmm0
533; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
534; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
535; SSE41-NEXT:    retq
536;
537; AVX1-LABEL: combine_bitwise_ops_test1c:
538; AVX1:       # BB#0:
539; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
540; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
541; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
542; AVX1-NEXT:    retq
543;
544; AVX2-LABEL: combine_bitwise_ops_test1c:
545; AVX2:       # BB#0:
546; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
547; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
548; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
549; AVX2-NEXT:    retq
550  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
551  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
552  %and = and <4 x i32> %shuf1, %shuf2
553  ret <4 x i32> %and
554}
555
; OR variant of combine_bitwise_ops_test1c: mask <0,2,5,7> puts elements 0 and
; 2 of %a / %b in the low lanes and elements 1 and 3 of the common operand %c
; in the high lanes. The checks expect a single por / vpor of %a and %b, then
; the merge with %c (blend + pshufd [0,2,1,3] on the SSE4.1 and AVX runs,
; pshufd + pshufd + punpcklqdq otherwise).
556define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
557; SSE2-LABEL: combine_bitwise_ops_test2c:
558; SSE2:       # BB#0:
559; SSE2-NEXT:    por %xmm1, %xmm0
560; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
561; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
562; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
563; SSE2-NEXT:    retq
564;
565; SSSE3-LABEL: combine_bitwise_ops_test2c:
566; SSSE3:       # BB#0:
567; SSSE3-NEXT:    por %xmm1, %xmm0
568; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
569; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
570; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
571; SSSE3-NEXT:    retq
572;
573; SSE41-LABEL: combine_bitwise_ops_test2c:
574; SSE41:       # BB#0:
575; SSE41-NEXT:    por %xmm1, %xmm0
576; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
577; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
578; SSE41-NEXT:    retq
579;
580; AVX1-LABEL: combine_bitwise_ops_test2c:
581; AVX1:       # BB#0:
582; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
583; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
584; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
585; AVX1-NEXT:    retq
586;
587; AVX2-LABEL: combine_bitwise_ops_test2c:
588; AVX2:       # BB#0:
589; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
590; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
591; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
592; AVX2-NEXT:    retq
593  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
594  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
595  %or = or <4 x i32> %shuf1, %shuf2
596  ret <4 x i32> %or
597}
598
; XOR variant of combine_bitwise_ops_test1c. With mask <0,2,5,7>, the two high
; result lanes hold the same %c elements in both shuffles, so their XOR is zero
; and %c drops out. The result is elements 0 and 2 of (%a xor %b) in the low
; half and zero in the high half. The checks expect one pxor / vpxor, one
; pshufd, and then the high half zeroed: punpckhqdq against a zeroed register
; on the SSE2 and SSSE3 runs, movq / vmovq on the SSE4.1 and AVX runs.
599define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
600; SSE2-LABEL: combine_bitwise_ops_test3c:
601; SSE2:       # BB#0:
602; SSE2-NEXT:    pxor %xmm1, %xmm0
603; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
604; SSE2-NEXT:    pxor %xmm1, %xmm1
605; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
606; SSE2-NEXT:    retq
607;
608; SSSE3-LABEL: combine_bitwise_ops_test3c:
609; SSSE3:       # BB#0:
610; SSSE3-NEXT:    pxor %xmm1, %xmm0
611; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
612; SSSE3-NEXT:    pxor %xmm1, %xmm1
613; SSSE3-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
614; SSSE3-NEXT:    retq
615;
616; SSE41-LABEL: combine_bitwise_ops_test3c:
617; SSE41:       # BB#0:
618; SSE41-NEXT:    pxor %xmm1, %xmm0
619; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
620; SSE41-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
621; SSE41-NEXT:    retq
622;
623; AVX-LABEL: combine_bitwise_ops_test3c:
624; AVX:       # BB#0:
625; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
626; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
627; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
628; AVX-NEXT:    retq
629  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
630  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
631  %xor = xor <4 x i32> %shuf1, %shuf2
632  ret <4 x i32> %xor
633}
634
; Operand-swapped version of combine_bitwise_ops_test1c: the common operand %c
; is the first shuffle input, so mask <0,2,5,7> puts elements 0 and 2 of %c in
; the low lanes and elements 1 and 3 of %a / %b in the high lanes. The checks
; expect a single pand / vpand of %a and %b, then the merge with %c (blend +
; pshufd [0,2,1,3] on the SSE4.1 and AVX runs, pshufd + pshufd + punpcklqdq
; otherwise).
635define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
636; SSE2-LABEL: combine_bitwise_ops_test4c:
637; SSE2:       # BB#0:
638; SSE2-NEXT:    pand %xmm1, %xmm0
639; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
640; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
641; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
642; SSE2-NEXT:    retq
643;
644; SSSE3-LABEL: combine_bitwise_ops_test4c:
645; SSSE3:       # BB#0:
646; SSSE3-NEXT:    pand %xmm1, %xmm0
647; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
648; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
649; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
650; SSSE3-NEXT:    retq
651;
652; SSE41-LABEL: combine_bitwise_ops_test4c:
653; SSE41:       # BB#0:
654; SSE41-NEXT:    pand %xmm1, %xmm0
655; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
656; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
657; SSE41-NEXT:    retq
658;
659; AVX1-LABEL: combine_bitwise_ops_test4c:
660; AVX1:       # BB#0:
661; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
662; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
663; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
664; AVX1-NEXT:    retq
665;
666; AVX2-LABEL: combine_bitwise_ops_test4c:
667; AVX2:       # BB#0:
668; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
669; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
670; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
671; AVX2-NEXT:    retq
672  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
673  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
674  %and = and <4 x i32> %shuf1, %shuf2
675  ret <4 x i32> %and
676}
677
; OR variant of combine_bitwise_ops_test4c: mask <0,2,5,7> puts elements 0 and
; 2 of the common first operand %c in the low lanes and elements 1 and 3 of
; %a / %b in the high lanes. The checks expect a single por / vpor of %a and
; %b, then the merge with %c (blend + pshufd [0,2,1,3] on the SSE4.1 and AVX
; runs, pshufd + pshufd + punpcklqdq otherwise).
678define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
679; SSE2-LABEL: combine_bitwise_ops_test5c:
680; SSE2:       # BB#0:
681; SSE2-NEXT:    por %xmm1, %xmm0
682; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
683; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
684; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
685; SSE2-NEXT:    retq
686;
687; SSSE3-LABEL: combine_bitwise_ops_test5c:
688; SSSE3:       # BB#0:
689; SSSE3-NEXT:    por %xmm1, %xmm0
690; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
691; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
692; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
693; SSSE3-NEXT:    retq
694;
695; SSE41-LABEL: combine_bitwise_ops_test5c:
696; SSE41:       # BB#0:
697; SSE41-NEXT:    por %xmm1, %xmm0
698; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
699; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
700; SSE41-NEXT:    retq
701;
702; AVX1-LABEL: combine_bitwise_ops_test5c:
703; AVX1:       # BB#0:
704; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
705; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
706; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
707; AVX1-NEXT:    retq
708;
709; AVX2-LABEL: combine_bitwise_ops_test5c:
710; AVX2:       # BB#0:
711; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
712; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
713; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
714; AVX2-NEXT:    retq
715  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
716  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
717  %or = or <4 x i32> %shuf1, %shuf2
718  ret <4 x i32> %or
719}
720
; XOR variant of combine_bitwise_ops_test4c. With %c as the first shuffle
; operand and mask <0,2,5,7>, the two low result lanes hold the same %c
; elements in both shuffles, so their XOR is zero and %c drops out. The result
; is zero in the low half and elements 1 and 3 of (%a xor %b) in the high half.
; The checks expect one pxor / vpxor, one pshufd, a zeroed register, and a
; final merge: punpcklqdq on the SSE2 and SSSE3 runs, a blend on the SSE4.1 and
; AVX runs.
721define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
722; SSE2-LABEL: combine_bitwise_ops_test6c:
723; SSE2:       # BB#0:
724; SSE2-NEXT:    pxor %xmm1, %xmm0
725; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
726; SSE2-NEXT:    pxor %xmm0, %xmm0
727; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
728; SSE2-NEXT:    retq
729;
730; SSSE3-LABEL: combine_bitwise_ops_test6c:
731; SSSE3:       # BB#0:
732; SSSE3-NEXT:    pxor %xmm1, %xmm0
733; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
734; SSSE3-NEXT:    pxor %xmm0, %xmm0
735; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
736; SSSE3-NEXT:    retq
737;
738; SSE41-LABEL: combine_bitwise_ops_test6c:
739; SSE41:       # BB#0:
740; SSE41-NEXT:    pxor %xmm1, %xmm0
741; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
742; SSE41-NEXT:    pxor %xmm0, %xmm0
743; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
744; SSE41-NEXT:    retq
745;
746; AVX1-LABEL: combine_bitwise_ops_test6c:
747; AVX1:       # BB#0:
748; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
749; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
750; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
751; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
752; AVX1-NEXT:    retq
753;
754; AVX2-LABEL: combine_bitwise_ops_test6c:
755; AVX2:       # BB#0:
756; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
757; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
758; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
759; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
760; AVX2-NEXT:    retq
761  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
762  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
763  %xor = xor <4 x i32> %shuf1, %shuf2
764  ret <4 x i32> %xor
765}
766
; Nested shuffles where the outer one has an undef second operand.
; %1 = [A0, B0, A3, A1]; %2 picks lanes <2,4,0,3> of (%1, undef), giving
; [A3, undef, A0, A1]. No element of %B survives, so the checks expect a
; single pshufd / vpshufd of %A with mask [3,1,0,1].
767define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) {
768; SSE-LABEL: combine_nested_undef_test1:
769; SSE:       # BB#0:
770; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
771; SSE-NEXT:    retq
772;
773; AVX-LABEL: combine_nested_undef_test1:
774; AVX:       # BB#0:
775; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
776; AVX-NEXT:    retq
777  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
778  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
779  ret <4 x i32> %2
780}
781
; %1 = [A0, B1, A2, A3]; %2 picks lanes <2,4,0,3> of (%1, undef), giving
; [A2, undef, A0, A3]. The only %B element (lane 1 of %1) is never selected,
; so the checks expect a single pshufd / vpshufd of %A with mask [2,1,0,3].
782define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) {
783; SSE-LABEL: combine_nested_undef_test2:
784; SSE:       # BB#0:
785; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
786; SSE-NEXT:    retq
787;
788; AVX-LABEL: combine_nested_undef_test2:
789; AVX:       # BB#0:
790; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
791; AVX-NEXT:    retq
792  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
793  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
794  ret <4 x i32> %2
795}
796
; %1 = <A0,B2,A2,A3> and %2 = <A2,undef,A0,A3>: same outer mask as the previous
; test, with a different %B lane (B2) in the discarded position. The checks
; expect a single shuffle of %A.
797define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) {
798; SSE-LABEL: combine_nested_undef_test3:
799; SSE:       # BB#0:
800; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
801; SSE-NEXT:    retq
802;
803; AVX-LABEL: combine_nested_undef_test3:
804; AVX:       # BB#0:
805; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
806; AVX-NEXT:    retq
807  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
808  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
809  ret <4 x i32> %2
810}
811
; %1 = <A0,B0,B3,A1> and %2 = <undef,undef,A0,A1>: both %B lanes are discarded
; and the low two result lanes are undef. The checks accept duplicating the low
; 64 bits of %A (a [0,1,0,1] shuffle, or a 64-bit broadcast with AVX2).
812define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) {
813; SSE-LABEL: combine_nested_undef_test4:
814; SSE:       # BB#0:
815; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
816; SSE-NEXT:    retq
817;
818; AVX1-LABEL: combine_nested_undef_test4:
819; AVX1:       # BB#0:
820; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
821; AVX1-NEXT:    retq
822;
823; AVX2-LABEL: combine_nested_undef_test4:
824; AVX2:       # BB#0:
825; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
826; AVX2-NEXT:    retq
827  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
828  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3>
829  ret <4 x i32> %2
830}
831
; %1 = <B1,B1,A2,A3> and %2 = <A2,undef,undef,A3>: neither B1 lane is read by
; the outer shuffle, so the checks expect a single shuffle of %A.
832define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) {
833; SSE-LABEL: combine_nested_undef_test5:
834; SSE:       # BB#0:
835; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
836; SSE-NEXT:    retq
837;
838; AVX-LABEL: combine_nested_undef_test5:
839; AVX:       # BB#0:
840; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
841; AVX-NEXT:    retq
842  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
843  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
844  ret <4 x i32> %2
845}
846
; %1 = <A0,B2,A2,B0> and %2 = <A2,undef,A0,undef>: only the %A lanes of %1 are
; read, so the checks expect a single shuffle of %A.
847define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) {
848; SSE-LABEL: combine_nested_undef_test6:
849; SSE:       # BB#0:
850; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
851; SSE-NEXT:    retq
852;
853; AVX-LABEL: combine_nested_undef_test6:
854; AVX:       # BB#0:
855; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
856; AVX-NEXT:    retq
857  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
858  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
859  ret <4 x i32> %2
860}
861
; %1 = <A0,B1,A2,B3> and %2 = <A0,A2,A0,A2>: the outer mask has no undef lanes
; but reads only the even lanes of %1, which all come from %A. The checks
; expect a single shuffle of %A.
862define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) {
863; SSE-LABEL: combine_nested_undef_test7:
864; SSE:       # BB#0:
865; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
866; SSE-NEXT:    retq
867;
868; AVX-LABEL: combine_nested_undef_test7:
869; AVX:       # BB#0:
870; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
871; AVX-NEXT:    retq
872  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
873  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
874  ret <4 x i32> %2
875}
876
; %1 = <B0,A1,B2,A3> and %2 = <A1,undef,A3,undef>: only the odd (%A) lanes of
; %1 are read, so the checks expect a single shuffle of %A.
877define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) {
878; SSE-LABEL: combine_nested_undef_test8:
879; SSE:       # BB#0:
880; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
881; SSE-NEXT:    retq
882;
883; AVX-LABEL: combine_nested_undef_test8:
884; AVX:       # BB#0:
885; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
886; AVX-NEXT:    retq
887  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
888  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
889  ret <4 x i32> %2
890}
891
; %1 = <A1,A3,A2,B1> and %2 = <A1,A3,undef,A2>: lane 3 of %1 (B1) is never
; read, so the checks expect a single shuffle of %A.
892define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) {
893; SSE-LABEL: combine_nested_undef_test9:
894; SSE:       # BB#0:
895; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
896; SSE-NEXT:    retq
897;
898; AVX-LABEL: combine_nested_undef_test9:
899; AVX:       # BB#0:
900; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
901; AVX-NEXT:    retq
902  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
903  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
904  ret <4 x i32> %2
905}
906
; %1 = <A1,A1,B1,B1> and %2 = <A1,undef,A1,undef>: the two B1 lanes are never
; read, so the checks expect a single shuffle of %A.
907define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) {
908; SSE-LABEL: combine_nested_undef_test10:
909; SSE:       # BB#0:
910; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
911; SSE-NEXT:    retq
912;
913; AVX-LABEL: combine_nested_undef_test10:
914; AVX:       # BB#0:
915; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
916; AVX-NEXT:    retq
917  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
918  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
919  ret <4 x i32> %2
920}
921
; %1 = <A1,A2,B1,B0> and %2 = <A1,undef,A2,A1>: the high two lanes of %1 (from
; %B) are never read, so the checks expect a single shuffle of %A.
922define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) {
923; SSE-LABEL: combine_nested_undef_test11:
924; SSE:       # BB#0:
925; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
926; SSE-NEXT:    retq
927;
928; AVX-LABEL: combine_nested_undef_test11:
929; AVX:       # BB#0:
930; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
931; AVX-NEXT:    retq
932  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
933  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
934  ret <4 x i32> %2
935}
936
; %1 = <A0,A0,A2,B0> and %2 = <A0,undef,A0,undef>: lane 3 of %1 (B0) is never
; read. The checks accept duplicating the low 64 bits of %A (a [0,1,0,1]
; shuffle, or a 64-bit broadcast with AVX2).
937define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) {
938; SSE-LABEL: combine_nested_undef_test12:
939; SSE:       # BB#0:
940; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
941; SSE-NEXT:    retq
942;
943; AVX1-LABEL: combine_nested_undef_test12:
944; AVX1:       # BB#0:
945; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
946; AVX1-NEXT:    retq
947;
948; AVX2-LABEL: combine_nested_undef_test12:
949; AVX2:       # BB#0:
950; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
951; AVX2-NEXT:    retq
952  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
953  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
954  ret <4 x i32> %2
955}
956
957; The following pair of shuffles is folded into vector %A.
; %1 = <A1,B0,A2,B2> and %2 = <undef,A1,A2,undef>: every defined result lane
; already sits at its original position in %A, so the checks expect no shuffle
; instruction at all (just the return).
958define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) {
959; ALL-LABEL: combine_nested_undef_test13:
960; ALL:       # BB#0:
961; ALL-NEXT:    retq
962  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
963  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
964  ret <4 x i32> %2
965}
966
967; The following pair of shuffles is folded into vector %B.
; %1 = <A0,B2,A2,B0> and %2 = <B0,undef,B2,undef>: the defined result lanes
; (0 and 2) already sit at their original positions in %B, so the checks expect
; only a register move and no shuffle.
968define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) {
969; SSE-LABEL: combine_nested_undef_test14:
970; SSE:       # BB#0:
971; SSE-NEXT:    movaps %xmm1, %xmm0
972; SSE-NEXT:    retq
973;
974; AVX-LABEL: combine_nested_undef_test14:
975; AVX:       # BB#0:
976; AVX-NEXT:    vmovaps %xmm1, %xmm0
977; AVX-NEXT:    retq
978  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
979  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
980  ret <4 x i32> %2
981}
982
983
984; Verify that we don't optimize the following cases. We expect more than one shuffle.
985;
986; FIXME: Many of these already don't make sense, and the rest should stop
987; making sense with the new vector shuffle lowering. Revisit at least testing for
988; it.
989
; %1 = <A0,B0,A3,A1> and %2 = <A3,B0,A0,A1>: the outer mask reads no undef
; lanes and the result mixes %A and %B, so the checks expect more than one
; shuffle/blend instruction on every subtarget.
990define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
991; SSE2-LABEL: combine_nested_undef_test15:
992; SSE2:       # BB#0:
993; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
994; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
995; SSE2-NEXT:    movaps %xmm1, %xmm0
996; SSE2-NEXT:    retq
997;
998; SSSE3-LABEL: combine_nested_undef_test15:
999; SSSE3:       # BB#0:
1000; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
1001; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
1002; SSSE3-NEXT:    movaps %xmm1, %xmm0
1003; SSSE3-NEXT:    retq
1004;
1005; SSE41-LABEL: combine_nested_undef_test15:
1006; SSE41:       # BB#0:
1007; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
1008; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
1009; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1010; SSE41-NEXT:    retq
1011;
1012; AVX1-LABEL: combine_nested_undef_test15:
1013; AVX1:       # BB#0:
1014; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
1015; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
1016; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1017; AVX1-NEXT:    retq
1018;
1019; AVX2-LABEL: combine_nested_undef_test15:
1020; AVX2:       # BB#0:
1021; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
1022; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
1023; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1024; AVX2-NEXT:    retq
1025  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
1026  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
1027  ret <4 x i32> %2
1028}
1029
; %1 = <A0,B1,A2,B3> and %2 = <A2,B1,A0,B3>: the result mixes lanes from %A and
; %B, so the checks expect more than one instruction (a shuffle of %A plus a
; blend or unpack with %B).
1030define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
1031; SSE2-LABEL: combine_nested_undef_test16:
1032; SSE2:       # BB#0:
1033; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
1034; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
1035; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1036; SSE2-NEXT:    retq
1037;
1038; SSSE3-LABEL: combine_nested_undef_test16:
1039; SSSE3:       # BB#0:
1040; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
1041; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
1042; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1043; SSSE3-NEXT:    retq
1044;
1045; SSE41-LABEL: combine_nested_undef_test16:
1046; SSE41:       # BB#0:
1047; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1048; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
1049; SSE41-NEXT:    retq
1050;
1051; AVX1-LABEL: combine_nested_undef_test16:
1052; AVX1:       # BB#0:
1053; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1054; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
1055; AVX1-NEXT:    retq
1056;
1057; AVX2-LABEL: combine_nested_undef_test16:
1058; AVX2:       # BB#0:
1059; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1060; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
1061; AVX2-NEXT:    retq
1062  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1063  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
1064  ret <4 x i32> %2
1065}
1066
; %1 = <B0,A1,A3,A1> and %2 = <A3,A1,B0,A1>: one %B lane (B0) survives in the
; result, so the checks expect two instructions on every subtarget.
1067define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
1068; SSE2-LABEL: combine_nested_undef_test17:
1069; SSE2:       # BB#0:
1070; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
1071; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
1072; SSE2-NEXT:    retq
1073;
1074; SSSE3-LABEL: combine_nested_undef_test17:
1075; SSSE3:       # BB#0:
1076; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
1077; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
1078; SSSE3-NEXT:    retq
1079;
1080; SSE41-LABEL: combine_nested_undef_test17:
1081; SSE41:       # BB#0:
1082; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
1083; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
1084; SSE41-NEXT:    retq
1085;
1086; AVX1-LABEL: combine_nested_undef_test17:
1087; AVX1:       # BB#0:
1088; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
1089; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
1090; AVX1-NEXT:    retq
1091;
1092; AVX2-LABEL: combine_nested_undef_test17:
1093; AVX2:       # BB#0:
1094; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1095; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
1096; AVX2-NEXT:    retq
1097  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
1098  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
1099  ret <4 x i32> %2
1100}
1101
; %1 = <B0,B1,A2,B3> and %2 = <B1,B1,B0,B3>: lane 2 of %1 (the only %A lane) is
; never read, so the result depends on %B alone.
; NOTE(review): this test sits in the "don't optimize / expect more than one
; shuffle" group, yet the checks below expect a single shuffle of %B.
1102define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) {
1103; SSE-LABEL: combine_nested_undef_test18:
1104; SSE:       # BB#0:
1105; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
1106; SSE-NEXT:    retq
1107;
1108; AVX-LABEL: combine_nested_undef_test18:
1109; AVX:       # BB#0:
1110; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
1111; AVX-NEXT:    retq
1112  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
1113  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
1114  ret <4 x i32> %2
1115}
1116
; %1 = <A0,B0,B1,B2> and %2 = <B1,A0,A0,A0>: the result needs B1 in lane 0 and
; A0 in the other three lanes, so the checks expect a blend/unpack followed by
; a shuffle.
1117define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) {
1118; SSE2-LABEL: combine_nested_undef_test19:
1119; SSE2:       # BB#0:
1120; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1121; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
1122; SSE2-NEXT:    retq
1123;
1124; SSSE3-LABEL: combine_nested_undef_test19:
1125; SSSE3:       # BB#0:
1126; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1127; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
1128; SSSE3-NEXT:    retq
1129;
1130; SSE41-LABEL: combine_nested_undef_test19:
1131; SSE41:       # BB#0:
1132; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1133; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
1134; SSE41-NEXT:    retq
1135;
1136; AVX1-LABEL: combine_nested_undef_test19:
1137; AVX1:       # BB#0:
1138; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1139; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
1140; AVX1-NEXT:    retq
1141;
1142; AVX2-LABEL: combine_nested_undef_test19:
1143; AVX2:       # BB#0:
1144; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1145; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
1146; AVX2-NEXT:    retq
1147  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
1148  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
1149  ret <4 x i32> %2
1150}
1151
; %1 = <A3,A2,B0,B0> and %2 = <B0,A2,A3,B0>: the result mixes B0 with the high
; lanes of %A, so the checks expect two shuffle/blend instructions.
1152define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) {
1153; SSE2-LABEL: combine_nested_undef_test20:
1154; SSE2:       # BB#0:
1155; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
1156; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1157; SSE2-NEXT:    movaps %xmm1, %xmm0
1158; SSE2-NEXT:    retq
1159;
1160; SSSE3-LABEL: combine_nested_undef_test20:
1161; SSSE3:       # BB#0:
1162; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
1163; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1164; SSSE3-NEXT:    movaps %xmm1, %xmm0
1165; SSSE3-NEXT:    retq
1166;
1167; SSE41-LABEL: combine_nested_undef_test20:
1168; SSE41:       # BB#0:
1169; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
1170; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
1171; SSE41-NEXT:    retq
1172;
1173; AVX1-LABEL: combine_nested_undef_test20:
1174; AVX1:       # BB#0:
1175; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
1176; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
1177; AVX1-NEXT:    retq
1178;
1179; AVX2-LABEL: combine_nested_undef_test20:
1180; AVX2:       # BB#0:
1181; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
1182; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
1183; AVX2-NEXT:    retq
1184  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4>
1185  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
1186  ret <4 x i32> %2
1187}
1188
; %1 = <B0,A1,A3,A1> and %2 = <B0,A1,B0,A1>: the result repeats the pair
; (B0, A1) in both halves, so the checks expect a blend/unpack of %A and %B
; followed by a duplicating shuffle (or a 64-bit broadcast with AVX2).
1189define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) {
1190; SSE2-LABEL: combine_nested_undef_test21:
1191; SSE2:       # BB#0:
1192; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1193; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
1194; SSE2-NEXT:    retq
1195;
1196; SSSE3-LABEL: combine_nested_undef_test21:
1197; SSSE3:       # BB#0:
1198; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1199; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
1200; SSSE3-NEXT:    retq
1201;
1202; SSE41-LABEL: combine_nested_undef_test21:
1203; SSE41:       # BB#0:
1204; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1205; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1206; SSE41-NEXT:    retq
1207;
1208; AVX1-LABEL: combine_nested_undef_test21:
1209; AVX1:       # BB#0:
1210; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1211; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1212; AVX1-NEXT:    retq
1213;
1214; AVX2-LABEL: combine_nested_undef_test21:
1215; AVX2:       # BB#0:
1216; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1217; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
1218; AVX2-NEXT:    retq
1219  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
1220  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
1221  ret <4 x i32> %2
1222}
1223
1224
1225; Test that we correctly combine shuffles according to rule
1226;  shuffle(shuffle(x, y), undef) -> shuffle(y, undef)
1227
; %1 = <B0,B1,A2,B3> and %2 = <B1,B1,B1,B3>: lane 2 of %1 (the only %A lane) is
; never read, so the checks expect a single shuffle of %B.
1228define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) {
1229; SSE-LABEL: combine_nested_undef_test22:
1230; SSE:       # BB#0:
1231; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
1232; SSE-NEXT:    retq
1233;
1234; AVX-LABEL: combine_nested_undef_test22:
1235; AVX:       # BB#0:
1236; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
1237; AVX-NEXT:    retq
1238  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
1239  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3>
1240  ret <4 x i32> %2
1241}
1242
; %1 = <B0,B1,A2,B3> and %2 = <B0,B1,B0,B3>: lane 2 of %1 (the only %A lane) is
; never read, so the checks expect a single shuffle of %B.
1243define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) {
1244; SSE-LABEL: combine_nested_undef_test23:
1245; SSE:       # BB#0:
1246; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
1247; SSE-NEXT:    retq
1248;
1249; AVX-LABEL: combine_nested_undef_test23:
1250; AVX:       # BB#0:
1251; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
1252; AVX-NEXT:    retq
1253  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
1254  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
1255  ret <4 x i32> %2
1256}
1257
; %1 = <B0,A1,B2,B3> and %2 = <B0,B3,B2,undef>: lane 1 of %1 (the only %A lane)
; is never read, so the checks expect a single shuffle of %B.
1258define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) {
1259; SSE-LABEL: combine_nested_undef_test24:
1260; SSE:       # BB#0:
1261; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
1262; SSE-NEXT:    retq
1263;
1264; AVX-LABEL: combine_nested_undef_test24:
1265; AVX:       # BB#0:
1266; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
1267; AVX-NEXT:    retq
1268  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1269  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4>
1270  ret <4 x i32> %2
1271}
1272
; The inner shuffle takes its operands in swapped order (%B first, %A second).
; %1 = <B1,A1,B2,A0> and %2 = <A0,A1,A0,A1>: only %A lanes are read, so the
; checks expect duplicating the low 64 bits of %A (a [0,1,0,1] shuffle, or a
; 64-bit broadcast with AVX2).
1273define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) {
1274; SSE-LABEL: combine_nested_undef_test25:
1275; SSE:       # BB#0:
1276; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1277; SSE-NEXT:    retq
1278;
1279; AVX1-LABEL: combine_nested_undef_test25:
1280; AVX1:       # BB#0:
1281; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1282; AVX1-NEXT:    retq
1283;
1284; AVX2-LABEL: combine_nested_undef_test25:
1285; AVX2:       # BB#0:
1286; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
1287; AVX2-NEXT:    retq
1288  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4>
1289  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1>
1290  ret <4 x i32> %2
1291}
1292
; Inner shuffle operands are swapped (%B first, %A second).
; %1 = <B1,B2,A2,A3> and %2 = <A2,A3,A2,A3>: only the high half of %1 (all %A
; lanes) is read, so the checks expect a single shuffle of %A.
1293define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) {
1294; SSE-LABEL: combine_nested_undef_test26:
1295; SSE:       # BB#0:
1296; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1297; SSE-NEXT:    retq
1298;
1299; AVX-LABEL: combine_nested_undef_test26:
1300; AVX:       # BB#0:
1301; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1302; AVX-NEXT:    retq
1303  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7>
1304  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
1305  ret <4 x i32> %2
1306}
1307
; Inner shuffle operands are swapped (%B first, %A second).
; %1 = <B2,B1,A1,A0> and %2 = <A0,A1,A0,A1>: only the high half of %1 (all %A
; lanes) is read, so the checks expect duplicating the low 64 bits of %A (a
; [0,1,0,1] shuffle, or a 64-bit broadcast with AVX2).
1308define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) {
1309; SSE-LABEL: combine_nested_undef_test27:
1310; SSE:       # BB#0:
1311; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1312; SSE-NEXT:    retq
1313;
1314; AVX1-LABEL: combine_nested_undef_test27:
1315; AVX1:       # BB#0:
1316; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1317; AVX1-NEXT:    retq
1318;
1319; AVX2-LABEL: combine_nested_undef_test27:
1320; AVX2:       # BB#0:
1321; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
1322; AVX2-NEXT:    retq
1323  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4>
1324  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
1325  ret <4 x i32> %2
1326}
1327
; Inner shuffle operands are swapped (%B first, %A second).
; %1 = <B1,B2,A0,A1> and %2 = <A0,A1,A1,A0>: only the high half of %1 (all %A
; lanes) is read, so the checks expect a single shuffle of %A.
1328define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) {
1329; SSE-LABEL: combine_nested_undef_test28:
1330; SSE:       # BB#0:
1331; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
1332; SSE-NEXT:    retq
1333;
1334; AVX-LABEL: combine_nested_undef_test28:
1335; AVX:       # BB#0:
1336; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
1337; AVX-NEXT:    retq
1338  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
1339  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2>
1340  ret <4 x i32> %2
1341}
1342
; Two chained two-input shuffles that both use %b as the second operand.
; %1 = <b0,a1,b2,a3> and %2 = <b0,b1,b2,b3>: the result is exactly %b, so the
; checks expect only a register move.
1343define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) {
1344; SSE-LABEL: combine_test1:
1345; SSE:       # BB#0:
1346; SSE-NEXT:    movaps %xmm1, %xmm0
1347; SSE-NEXT:    retq
1348;
1349; AVX-LABEL: combine_test1:
1350; AVX:       # BB#0:
1351; AVX-NEXT:    vmovaps %xmm1, %xmm0
1352; AVX-NEXT:    retq
1353  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1354  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1355  ret <4 x float> %2
1356}
1357
; %1 = <a0,b1,a2,b3> and %2 (second operand %b) = <a0,b1,b2,b3>: lane 0 comes
; from %a and the rest from %b, so the checks expect a single movss or blend.
1358define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) {
1359; SSE2-LABEL: combine_test2:
1360; SSE2:       # BB#0:
1361; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1362; SSE2-NEXT:    movaps %xmm1, %xmm0
1363; SSE2-NEXT:    retq
1364;
1365; SSSE3-LABEL: combine_test2:
1366; SSSE3:       # BB#0:
1367; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1368; SSSE3-NEXT:    movaps %xmm1, %xmm0
1369; SSSE3-NEXT:    retq
1370;
1371; SSE41-LABEL: combine_test2:
1372; SSE41:       # BB#0:
1373; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1374; SSE41-NEXT:    retq
1375;
1376; AVX-LABEL: combine_test2:
1377; AVX:       # BB#0:
1378; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1379; AVX-NEXT:    retq
1380  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1381  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1382  ret <4 x float> %2
1383}
1384
; %1 = <a0,b1,a1,b3> and %2 (second operand %b) = <a0,a1,b0,b1>: the low halves
; of %a and %b concatenated, so the checks expect a single unpcklpd.
1385define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) {
1386; SSE-LABEL: combine_test3:
1387; SSE:       # BB#0:
1388; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1389; SSE-NEXT:    retq
1390;
1391; AVX-LABEL: combine_test3:
1392; AVX:       # BB#0:
1393; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1394; AVX-NEXT:    retq
1395  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
1396  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
1397  ret <4 x float> %2
1398}
1399
; %1 = <a2,a3,b1,b1> and %2 (second operand %b) = <b2,b3,a2,a3>: the high half
; of %b followed by the high half of %a, so the checks expect a single unpckhpd
; with %b as the first source (plus a register move in the non-AVX runs).
1400define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) {
1401; SSE-LABEL: combine_test4:
1402; SSE:       # BB#0:
1403; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1404; SSE-NEXT:    movapd %xmm1, %xmm0
1405; SSE-NEXT:    retq
1406;
1407; AVX-LABEL: combine_test4:
1408; AVX:       # BB#0:
1409; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1410; AVX-NEXT:    retq
1411  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
1412  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1413  ret <4 x float> %2
1414}
1415
; %1 = <b0,a1,b2,a3> and %2 (second operand %b) = <b0,a1,b2,b3>: only lane 1
; comes from %a. The SSE4.1 and AVX checks expect a single blend; the SSE2 and
; SSSE3 checks expect two shufps.
1416define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) {
1417; SSE2-LABEL: combine_test5:
1418; SSE2:       # BB#0:
1419; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1420; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1421; SSE2-NEXT:    retq
1422;
1423; SSSE3-LABEL: combine_test5:
1424; SSSE3:       # BB#0:
1425; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1426; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1427; SSSE3-NEXT:    retq
1428;
1429; SSE41-LABEL: combine_test5:
1430; SSE41:       # BB#0:
1431; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1432; SSE41-NEXT:    retq
1433;
1434; AVX-LABEL: combine_test5:
1435; AVX:       # BB#0:
1436; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1437; AVX-NEXT:    retq
1438  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1439  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1440  ret <4 x float> %2
1441}
1442
; Integer (<4 x i32>) variant using the same masks as combine_test1.
; %1 = <b0,a1,b2,a3> and %2 (second operand %b) = <b0,b1,b2,b3>: the result is
; exactly %b, so the checks expect only a register move.
1443define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) {
1444; SSE-LABEL: combine_test6:
1445; SSE:       # BB#0:
1446; SSE-NEXT:    movaps %xmm1, %xmm0
1447; SSE-NEXT:    retq
1448;
1449; AVX-LABEL: combine_test6:
1450; AVX:       # BB#0:
1451; AVX-NEXT:    vmovaps %xmm1, %xmm0
1452; AVX-NEXT:    retq
1453  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1454  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1455  ret <4 x i32> %2
1456}
1457
; Integer (<4 x i32>) variant using the same masks as combine_test2.
; %1 = <a0,b1,a2,b3> and %2 (second operand %b) = <a0,b1,b2,b3>: lane 0 comes
; from %a and the rest from %b, so the checks expect a single movss or blend.
1458define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) {
1459; SSE2-LABEL: combine_test7:
1460; SSE2:       # BB#0:
1461; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1462; SSE2-NEXT:    movaps %xmm1, %xmm0
1463; SSE2-NEXT:    retq
1464;
1465; SSSE3-LABEL: combine_test7:
1466; SSSE3:       # BB#0:
1467; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1468; SSSE3-NEXT:    movaps %xmm1, %xmm0
1469; SSSE3-NEXT:    retq
1470;
1471; SSE41-LABEL: combine_test7:
1472; SSE41:       # BB#0:
1473; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1474; SSE41-NEXT:    retq
1475;
1476; AVX1-LABEL: combine_test7:
1477; AVX1:       # BB#0:
1478; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1479; AVX1-NEXT:    retq
1480;
1481; AVX2-LABEL: combine_test7:
1482; AVX2:       # BB#0:
1483; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1484; AVX2-NEXT:    retq
1485  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1486  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1487  ret <4 x i32> %2
1488}
1489
; Integer (<4 x i32>) variant using the same masks as combine_test3.
; %1 = <a0,b1,a1,b3> and %2 (second operand %b) = <a0,a1,b0,b1>: the low halves
; of %a and %b concatenated, so the checks expect a single punpcklqdq.
1490define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) {
1491; SSE-LABEL: combine_test8:
1492; SSE:       # BB#0:
1493; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1494; SSE-NEXT:    retq
1495;
1496; AVX-LABEL: combine_test8:
1497; AVX:       # BB#0:
1498; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1499; AVX-NEXT:    retq
1500  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
1501  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
1502  ret <4 x i32> %2
1503}
1504
; Integer (<4 x i32>) variant using the same masks as combine_test4.
; %1 = <a2,a3,b1,b1> and %2 (second operand %b) = <b2,b3,a2,a3>: the high half
; of %b followed by the high half of %a, so the checks expect a single
; punpckhqdq with %b as the first source (plus a move in the non-AVX runs).
1505define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) {
1506; SSE-LABEL: combine_test9:
1507; SSE:       # BB#0:
1508; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1509; SSE-NEXT:    movdqa %xmm1, %xmm0
1510; SSE-NEXT:    retq
1511;
1512; AVX-LABEL: combine_test9:
1513; AVX:       # BB#0:
1514; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1515; AVX-NEXT:    retq
1516  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
1517  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1518  ret <4 x i32> %2
1519}
1520
; Integer (<4 x i32>) variant using the same masks as combine_test5.
; %1 = <b0,a1,b2,a3> and %2 (second operand %b) = <b0,a1,b2,b3>: only lane 1
; comes from %a. The SSE4.1 and AVX checks expect a single blend; the SSE2 and
; SSSE3 checks expect two shufps.
1521define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) {
1522; SSE2-LABEL: combine_test10:
1523; SSE2:       # BB#0:
1524; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1525; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1526; SSE2-NEXT:    retq
1527;
1528; SSSE3-LABEL: combine_test10:
1529; SSSE3:       # BB#0:
1530; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1531; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1532; SSSE3-NEXT:    retq
1533;
1534; SSE41-LABEL: combine_test10:
1535; SSE41:       # BB#0:
1536; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1537; SSE41-NEXT:    retq
1538;
1539; AVX1-LABEL: combine_test10:
1540; AVX1:       # BB#0:
1541; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1542; AVX1-NEXT:    retq
1543;
1544; AVX2-LABEL: combine_test10:
1545; AVX2:       # BB#0:
1546; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1547; AVX2-NEXT:    retq
1548  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1549  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1550  ret <4 x i32> %2
1551}
1552
; Two chained shuffles where the outer one uses %a as its second operand.
; %1 = <b0,a1,b2,a3> and %2 = <a0,a1,a2,a3>: the result is exactly %a, so the
; checks expect no instruction other than the return.
1553define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) {
1554; ALL-LABEL: combine_test11:
1555; ALL:       # BB#0:
1556; ALL-NEXT:    retq
1557  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1558  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1559  ret <4 x float> %2
1560}
1561
; %1 = <a0,b1,b2,b3> and %2 (second operand %a) = <a0,b1,b2,b3>: the outer
; shuffle re-selects a0 for lane 0, leaving the value unchanged, so the checks
; expect a single movss or blend.
1562define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) {
1563; SSE2-LABEL: combine_test12:
1564; SSE2:       # BB#0:
1565; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1566; SSE2-NEXT:    movaps %xmm1, %xmm0
1567; SSE2-NEXT:    retq
1568;
1569; SSSE3-LABEL: combine_test12:
1570; SSSE3:       # BB#0:
1571; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1572; SSSE3-NEXT:    movaps %xmm1, %xmm0
1573; SSSE3-NEXT:    retq
1574;
1575; SSE41-LABEL: combine_test12:
1576; SSE41:       # BB#0:
1577; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1578; SSE41-NEXT:    retq
1579;
1580; AVX-LABEL: combine_test12:
1581; AVX:       # BB#0:
1582; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1583; AVX-NEXT:    retq
1584  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1585  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
1586  ret <4 x float> %2
1587}
1588
; %1 = <a0,a1,b0,b1> and %2 (second operand %a) = <a0,a1,b0,b1>: the low halves
; of %a and %b concatenated, so the checks expect a single unpcklpd.
1589define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) {
1590; SSE-LABEL: combine_test13:
1591; SSE:       # BB#0:
1592; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1593; SSE-NEXT:    retq
1594;
1595; AVX-LABEL: combine_test13:
1596; AVX:       # BB#0:
1597; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1598; AVX-NEXT:    retq
1599  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1600  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
1601  ret <4 x float> %2
1602}
1603
; The inner shuffle builds <b2,b3,b1,b1>; the outer one places a2,a3 in lanes
; 0-1 and lanes 0-1 of the inner result (b2,b3) in lanes 2-3. The composition
; <a2,a3,b2,b3> is an interleave of the high 64-bit halves, so a single
; unpckhpd/vunpckhpd is expected.
define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test14:
; SSE:       # BB#0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test14:
; AVX:       # BB#0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x float> %2
}
1618
; The inner shuffle builds <b0,a1,b2,b3>; the outer one re-reads lane 1 from %a
; and keeps the other lanes, so the pair composes to <b0,a1,b2,b3>. With
; SSE4.1/AVX this is one blendps; without a blend instruction the checks
; expect a two-shufps sequence.
define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test15:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test15:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test15:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test15:
; AVX:       # BB#0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  ret <4 x float> %2
}
1645
; Integer counterpart of combine_test11: the inner shuffle yields
; <b0,a1,b2,a3> and the outer one overwrites lanes 0 and 2 with a0 and a2, so
; the result is %a unchanged and only a return is expected.
define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) {
; ALL-LABEL: combine_test16:
; ALL:       # BB#0:
; ALL-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  ret <4 x i32> %2
}
1654
; Integer counterpart of combine_test12: both shuffles compose to
; <a0,b1,b2,b3>. The expected single instruction differs per feature level:
; movss (plus a copy) before SSE4.1, pblendw/vpblendw on 16-bit lanes for
; SSE4.1 and AVX1, and vpblendd on 32-bit lanes for AVX2.
define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test17:
; SSE2:       # BB#0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test17:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test17:
; SSE41:       # BB#0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_test17:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_test17:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x i32> %2
}
1686
; Integer counterpart of combine_test13: the two shuffles compose to
; <a0,a1,b0,b1>, an interleave of the low 64-bit halves, so a single
; punpcklqdq/vpunpcklqdq is expected.
define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test18:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test18:
; AVX:       # BB#0:
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x i32> %2
}
1701
; Integer counterpart of combine_test14: the inner shuffle builds
; <b2,b3,b1,b1> and the outer one yields <a2,a3,b2,b3>, an interleave of the
; high 64-bit halves, so a single punpckhqdq/vpunpckhqdq is expected.
define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test19:
; SSE:       # BB#0:
; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test19:
; AVX:       # BB#0:
; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i32> %2
}
1716
; Integer counterpart of combine_test15: both shuffles compose to
; <b0,a1,b2,b3>. The checks expect two shufps before SSE4.1, one
; pblendw/vpblendw for SSE4.1 and AVX1, and one vpblendd for AVX2.
define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test20:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test20:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test20:
; SSE41:       # BB#0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_test20:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_test20:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  ret <4 x i32> %2
}
1748
; Splits one <8 x i32> input into two <4 x i32> results: elements 0,1,4,5 are
; stored through %ptr and elements 2,3,6,7 are returned. Each result is an
; interleave of 64-bit halves of the two 128-bit parts of %a, so the checks
; expect one low-unpack and one high-unpack. On AVX the upper 128 bits are
; first extracted with vextractf128/vextracti128, and vzeroupper is expected
; before the return.
define <4 x i32> @combine_test21(<8 x i32> %a, <4 x i32>* %ptr) {
; SSE-LABEL: combine_test21:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    movdqa %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_test21:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_test21:
; AVX2:       # BB#0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT:    vmovdqa %xmm2, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  store <4 x i32> %1, <4 x i32>* %ptr, align 16
  ret <4 x i32> %2
}
1780
; Loads two <2 x float> values and concatenates them into the low four lanes of
; an <8 x float> result; the upper four lanes are undef in the shuffle mask.
; The checks expect a 64-bit load of the first operand into the low half of
; xmm0, followed by a movhpd/vmovhpd that loads the second operand into the
; high half.
define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) {
; SSE-LABEL: combine_test22:
; SSE:       # BB#0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movhpd (%rsi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test22:
; AVX:       # BB#0:
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovhpd (%rsi), %xmm0, %xmm0
; AVX-NEXT:    retq
; NOTE(review): an earlier remark here said the AVX2 lowering was still awful
; and that no AVX2 test case was being added. The shared AVX check prefix above
; is enabled by both the +avx and the +avx2 RUN lines, so AVX2 output is in
; fact checked; that remark appears to be stale.
  %1 = load <2 x float>, <2 x float>* %a, align 8
  %2 = load <2 x float>, <2 x float>* %b, align 8
  %3 = shufflevector <2 x float> %1, <2 x float> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x float> %3
}
1799
; Check some negative cases.
; FIXME: Do any of these really make sense? Are they redundant with the above tests?
1802
; The inner shuffle yields <b0,a1,b2,a3>; the outer one picks lanes 0 and 2 of
; it (b0, b2), lane 1 of %b, and lane 0 of the inner result again. Every
; surviving element comes from %b: the composition is <b0,b1,b2,b0>, so a
; single one-input permute of %b is expected.
define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test1b:
; SSE:       # BB#0:
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test1b:
; AVX:       # BB#0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0>
  ret <4 x float> %2
}
1818
; The inner shuffle yields <b0,a1,b2,a3>; the outer one selects its lane 0 (b0)
; and lane 1 of %b twice, giving <b0,b1,b0,b1>. That duplicates the low 64 bits
; of %b: movlhps (plus a copy) is expected on plain SSE2, and movddup/vmovddup
; in the other configurations.
define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test2b:
; SSE2:       # BB#0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test2b:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test2b:
; SSE41:       # BB#0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test2b:
; AVX:       # BB#0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm1[0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5>
  ret <4 x float> %2
}
1844
; The inner shuffle yields <a0,a0,b2,a3>; the outer one keeps its lanes 0 and 2
; and takes lane 3 of %b for lanes 1 and 3, composing to <a0,b3,b2,b3>. The
; expected code still needs two instructions in every configuration: two
; shufps before SSE4.1, or a blendpd followed by a one-input permute.
define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test3b:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test3b:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test3b:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test3b:
; AVX:       # BB#0:
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7>
  ret <4 x float> %2
}
1873
; The inner shuffle yields <b0,a1,b2,a3>; the outer one takes lane 1 of %b
; twice, lane 2 of the inner result (b2) and lane 3 of %b, so the composition
; is <b1,b1,b2,b3>. Only %b contributes, and a single one-input permute of %b
; is expected.
define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test4b:
; SSE:       # BB#0:
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test4b:
; AVX:       # BB#0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7>
  ret <4 x float> %2
}
1889
1890
; Verify that we correctly fold shuffles even when we use illegal vector types.
1892
; <4 x i8> version of the nested-blend fold, with both operands loaded from
; memory. With A and B the loaded vectors, the inner shuffle yields
; <A0,B1,A2,B3> and the outer one replaces lane 2 with B2, composing to
; <A0,B1,B2,B3>. The checks expect each load to be widened to 32-bit lanes
; (movd plus two unpacks before SSE4.1, pmovzxbd afterwards) and then one
; lane-0 insert: movss, pblendw/vpblendw, or vpblendd on AVX2.
define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test1c:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test1c:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test1c:
; SSE41:       # BB#0:
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_test1c:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_test1c:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT:    retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x i8> %2
}
1942
; <4 x i8> operands loaded from memory. With A and B the loaded vectors, the
; inner shuffle yields <A0,B1,A1,B1>; the outer one picks its lanes 0 and 2
; (A0, A1), then B0, then its lane 1 (B1), composing to <A0,A1,B0,B1>. After
; both loads are widened to 32-bit lanes, the checks expect a single
; punpcklqdq/vpunpcklqdq.
define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test2c:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test2c:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test2c:
; SSE41:       # BB#0:
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test2c:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x i8> %2
}
1985
; <4 x i8> operands loaded from memory. With A and B the loaded vectors, the
; inner shuffle yields <A2,A3,B1,B1>; the outer one places B2,B3 in lanes 0-1
; and lanes 0-1 of the inner result (A2,A3) in lanes 2-3, composing to
; <B2,B3,A2,A3>. After both loads are widened to 32-bit lanes, the checks
; expect a single punpckhqdq/vpunpckhqdq.
define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test3c:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test3c:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test3c:
; SSE41:       # BB#0:
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test3c:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i8> %2
}
2028
; <4 x i8> operands loaded from memory. With A and B the loaded vectors, the
; inner shuffle yields <B0,A1,B2,A3> and the outer one replaces lane 3 with B3,
; composing to <B0,A1,B2,B3>. After both loads are widened to 32-bit lanes the
; checks expect two shufps before SSE4.1, one pblendw/vpblendw for SSE4.1 and
; AVX1, and one vpblendd for AVX2.
define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test4c:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test4c:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test4c:
; SSE41:       # BB#0:
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_test4c:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_test4c:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2-NEXT:    retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x i8> %2
}
2080
2081
; The following test cases are generated from this C++ code:
;
;__m128 blend_01(__m128 a, __m128 b)
;{
;  __m128 s = a;
;  s = _mm_blend_ps( s, b, 1<<0 );
;  s = _mm_blend_ps( s, b, 1<<1 );
;  return s;
;}
;
;__m128 blend_02(__m128 a, __m128 b)
;{
;  __m128 s = a;
;  s = _mm_blend_ps( s, b, 1<<0 );
;  s = _mm_blend_ps( s, b, 1<<2 );
;  return s;
;}
;
;__m128 blend_123(__m128 a, __m128 b)
;{
;  __m128 s = a;
;  s = _mm_blend_ps( s, b, 1<<1 );
;  s = _mm_blend_ps( s, b, 1<<2 );
;  s = _mm_blend_ps( s, b, 1<<3 );
;  return s;
;}

; Ideally, we should collapse the following shuffles into a single one.
2110
; Two chained single-lane blends: the first puts b0 in lane 0 (lane 1 is left
; undef in the mask), the second puts b1 in lane 1. The composition is
; <b0,b1,a2,a3>, i.e. the low 64 bits come from %b, so one movsd or
; blendpd/vblendpd is expected.
define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_01:
; SSE2:       # BB#0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_blend_01:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_blend_01:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_blend_01:
; AVX:       # BB#0:
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  ret <4 x float> %shuffle6
}
2135
; Two chained single-lane blends: the first puts b0 in lane 0 (lane 2 is left
; undef in the mask), the second puts b2 in lane 2. The composition is
; <b0,a1,b2,a3>. With SSE4.1/AVX one blendps is expected; otherwise the checks
; expect two shufps plus a register copy.
define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_02:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_blend_02:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_blend_02:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_blend_02:
; AVX:       # BB#0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3>
  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x float> %shuffle6
}
2164
; Three chained single-lane blends that put b1, b2 and b3 in lanes 1, 2 and 3
; in turn (lanes not yet written are undef in the intermediate masks). The
; composition is <a0,b1,b2,b3>: one movss (plus a copy) before SSE4.1, one
; blendps/vblendps otherwise.
define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_123:
; SSE2:       # BB#0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_blend_123:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_blend_123:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_blend_123:
; AVX:       # BB#0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
  %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x float> %shuffle12
}
2192
; The inner shuffle yields <a2,b3,b1,a3>; the outer one selects b2, then lanes
; 1, 0 and 3 of the inner result (b3, a2, a3). The composition <b2,b3,a2,a3>
; is an interleave of the high 64-bit halves with %b first, so one
; punpckhqdq/vpunpckhqdq is expected.
define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_1:
; SSE:       # BB#0:
; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test_movhl_1:
; AVX:       # BB#0:
; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}
2208
; The inner shuffle yields <a2,a0,a3,b2>; the outer one selects its lane 3
; (b2), then b3, then its lanes 0 and 2 (a2, a3). The composition is again
; <b2,b3,a2,a3>, so one punpckhqdq/vpunpckhqdq with %b first is expected.
define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_2:
; SSE:       # BB#0:
; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test_movhl_2:
; AVX:       # BB#0:
; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2>
  ret <4 x i32> %2
}
2224
; The inner shuffle yields <b3,b2,a3,a2>; the outer one selects b2, then lanes
; 0, 3 and 2 of the inner result (b3, a2, a3). The composition is again
; <b2,b3,a2,a3>, so one punpckhqdq/vpunpckhqdq with %b first is expected.
define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_3:
; SSE:       # BB#0:
; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test_movhl_3:
; AVX:       # BB#0:
; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2>
  ret <4 x i32> %2
}
2240
2241
; Verify that we fold shuffles according to rule:
;  (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2)
2244
; The inner shuffle has an undef second operand and yields <undef,a2,a3,a1>;
; the outer one takes b0,b1 and then lanes 1 and 2 of the inner result
; (a2, a3). The composition <b0,b1,a2,a3> replaces the low 64 bits of %a with
; those of %b, so one movsd or blendpd/vblendpd is expected.
define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test1:
; SSE2:       # BB#0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test1:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test1:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test1:
; AVX:       # BB#0:
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
  ret <4 x float> %2
}
2269
; The inner shuffle (undef second operand) yields <undef,a0,a1,undef>; the
; outer one takes its lanes 1 and 2 (a0, a1) followed by b0,b1. The
; composition <a0,a1,b0,b1> is a low-half interleave, so one
; unpcklpd/vunpcklpd is expected.
define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test2:
; SSE:       # BB#0:
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test2:
; AVX:       # BB#0:
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
  ret <4 x float> %2
}
2284
; The inner shuffle (undef second operand) yields <a0,undef,a1,undef>; the
; outer one takes its lanes 0 and 2 (a0, a1), then b0, then its undef lane 1.
; The composition <a0,a1,b0,undef> is satisfied by a low-half interleave, so
; one unpcklpd/vunpcklpd is expected.
define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test3:
; SSE:       # BB#0:
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test3:
; AVX:       # BB#0:
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x float> %2
}
2299
; The inner shuffle (undef second operand) yields <a2,a3,undef,undef>; the
; outer one places b2,b3 in lanes 0-1 and lanes 0-1 of the inner result
; (a2, a3) in lanes 2-3. The composition <b2,b3,a2,a3> is a high-half
; interleave with %b first, so one unpckhpd/vunpckhpd is expected.
define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test4:
; SSE:       # BB#0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test4:
; AVX:       # BB#0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x float> %2
}
2315
; The inner shuffle yields <a0, undef, a1, a3>; the outer shuffle then selects
; <a0, a1, b2, b3>, i.e. the low 64 bits of %a and the high 64 bits of %b.
; The SSE4.1 and AVX runs expect a single 64-bit-lane blend; the SSE2 and
; SSSE3 runs expect movsd followed by a register copy.
define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test5:
; SSE2:       # BB#0:
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test5:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSSE3-NEXT:    movapd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test5:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test5:
; AVX:       # BB#0:
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
  ret <4 x float> %2
}
2342
2343
; Verify that we fold shuffles according to the rule:
2345;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
2346
; The inner shuffle yields <undef, a2, a3, a1>; the outer shuffle then selects
; <a0, a1, a2, a3>, which is %a unchanged. Both shuffles are expected to
; disappear, leaving only the return.
define <4 x float> @combine_undef_input_test6(<4 x float> %a) {
; ALL-LABEL: combine_undef_input_test6:
; ALL:       # BB#0:
; ALL-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
  ret <4 x float> %2
}
2355
; The inner shuffle yields <undef, a0, a1, undef>; the outer shuffle then
; selects <a0, a1, a0, a1>, a splat of the low 64 bits of %a. The expected
; lowering is movlhps for the SSE2 run and (v)movddup for every other run.
define <4 x float> @combine_undef_input_test7(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test7:
; SSE2:       # BB#0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test7:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test7:
; SSE41:       # BB#0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test7:
; AVX:       # BB#0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
  ret <4 x float> %2
}
2380
; The inner shuffle yields <a0, undef, a1, undef>; the outer shuffle then
; selects <a0, a1, a0, undef>. With the last lane undef, a splat of the low
; 64 bits of %a is an acceptable lowering: movlhps for the SSE2 run and
; (v)movddup for every other run.
define <4 x float> @combine_undef_input_test8(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test8:
; SSE2:       # BB#0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test8:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test8:
; SSE41:       # BB#0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test8:
; AVX:       # BB#0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x float> %2
}
2405
; The inner shuffle yields <a2, a3, undef, undef>; the outer shuffle then
; selects <a2, a3, a2, a3>, a splat of the high 64 bits of %a. The expected
; lowering is a single (v)movhlps.
define <4 x float> @combine_undef_input_test9(<4 x float> %a) {
; SSE-LABEL: combine_undef_input_test9:
; SSE:       # BB#0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test9:
; AVX:       # BB#0:
; AVX-NEXT:    vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x float> %2
}
2420
; The inner shuffle yields <a0, undef, a1, a3>; the outer shuffle then selects
; <a0, a1, a2, a3>, which is %a unchanged. Both shuffles are expected to
; disappear, leaving only the return.
define <4 x float> @combine_undef_input_test10(<4 x float> %a) {
; ALL-LABEL: combine_undef_input_test10:
; ALL:       # BB#0:
; ALL-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
  ret <4 x float> %2
}
2429
; Here the shuffle-with-undef is the SECOND operand of the outer shuffle.
; The inner shuffle yields <undef, a2, a3, a1>; the outer shuffle then selects
; <b0, b1, a2, a3>, i.e. the low 64 bits of %b and the high 64 bits of %a.
; The SSE4.1 and AVX runs expect a single 64-bit-lane blend; the SSE2 and
; SSSE3 runs expect a single movsd.
define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test11:
; SSE2:       # BB#0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test11:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test11:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test11:
; AVX:       # BB#0:
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6>
  ret <4 x float> %2
}
2454
; Here the shuffle-with-undef is the SECOND operand of the outer shuffle.
; The inner shuffle yields <undef, a0, a1, undef>; the outer shuffle then
; selects <a0, a1, b0, b1>. The pair is expected to fold into a single unpack
; of the low 64-bit halves of %a and %b.
define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test12:
; SSE:       # BB#0:
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test12:
; AVX:       # BB#0:
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
  ret <4 x float> %2
}
2469
; Here the shuffle-with-undef is the SECOND operand of the outer shuffle.
; The inner shuffle yields <a0, undef, a1, undef>; the outer shuffle then
; selects <a0, undef, b0, undef>. Lanes 1 and 3 are undef, so a single unpack
; of the low 64-bit halves of %a and %b is an acceptable lowering.
define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test13:
; SSE:       # BB#0:
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test13:
; AVX:       # BB#0:
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5>
  ret <4 x float> %2
}
2484
; Here the shuffle-with-undef is the SECOND operand of the outer shuffle.
; The inner shuffle yields <a2, a3, undef, undef>; the outer shuffle then
; selects <b2, b3, a2, a3>. The pair is expected to fold into a single unpack
; of the high 64-bit halves, with %b supplying the low half of the result.
; The non-VEX expectation needs an extra register copy because the unpack
; result is produced in xmm1.
define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test14:
; SSE:       # BB#0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test14:
; AVX:       # BB#0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x float> %2
}
2500
; Here the shuffle-with-undef is the SECOND operand of the outer shuffle.
; The inner shuffle yields <a0, undef, a1, a3>; the outer shuffle then selects
; <a0, a1, b2, b3>, i.e. the low 64 bits of %a and the high 64 bits of %b.
; The SSE4.1 and AVX runs expect a single 64-bit-lane blend; the SSE2 and
; SSSE3 runs expect movsd followed by a register copy.
define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test15:
; SSE2:       # BB#0:
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test15:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSSE3-NEXT:    movapd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test15:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test15:
; AVX:       # BB#0:
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
  ret <4 x float> %2
}
2527
2528
; Verify that shuffles are canonicalized according to the rule:
;  shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
;
; This allows the following combine rule to trigger:
;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
2534;
2535; As a result, all the shuffle pairs in each function below should be
2536; combined into a single legal shuffle operation.
2537
; The inner shuffle (second operand of the outer one) yields
; <undef, a2, a3, a1>; the outer shuffle then selects <a0, a1, a2, a3>, which
; is %a unchanged. Both shuffles are expected to disappear, leaving only the
; return.
define <4 x float> @combine_undef_input_test16(<4 x float> %a) {
; ALL-LABEL: combine_undef_input_test16:
; ALL:       # BB#0:
; ALL-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
  ret <4 x float> %2
}
2546
; The inner shuffle (second operand of the outer one) yields
; <undef, a0, a1, undef>; the outer shuffle then selects <a0, a1, a0, a1>, a
; splat of the low 64 bits of %a. The expected lowering is movlhps for the
; SSE2 run and (v)movddup for every other run.
define <4 x float> @combine_undef_input_test17(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test17:
; SSE2:       # BB#0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test17:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test17:
; SSE41:       # BB#0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test17:
; AVX:       # BB#0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
  ret <4 x float> %2
}
2571
; The inner shuffle (second operand of the outer one) yields
; <a0, undef, a1, undef>; the outer shuffle then selects
; <a0, a1, a0, undef>. With the last lane undef, a splat of the low 64 bits
; of %a is an acceptable lowering: movlhps for the SSE2 run and (v)movddup
; for every other run.
define <4 x float> @combine_undef_input_test18(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test18:
; SSE2:       # BB#0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test18:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test18:
; SSE41:       # BB#0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test18:
; AVX:       # BB#0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
  ret <4 x float> %2
}
2596
; The inner shuffle (second operand of the outer one) yields
; <a2, a3, undef, undef>; the outer shuffle then selects <a2, a3, a2, a3>, a
; splat of the high 64 bits of %a. The expected lowering is a single
; (v)movhlps.
define <4 x float> @combine_undef_input_test19(<4 x float> %a) {
; SSE-LABEL: combine_undef_input_test19:
; SSE:       # BB#0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test19:
; AVX:       # BB#0:
; AVX-NEXT:    vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x float> %2
}
2611
; The inner shuffle (second operand of the outer one) yields
; <a0, undef, a1, a3>; the outer shuffle then selects <a0, a1, a2, a3>, which
; is %a unchanged. Both shuffles are expected to disappear, leaving only the
; return.
define <4 x float> @combine_undef_input_test20(<4 x float> %a) {
; ALL-LABEL: combine_undef_input_test20:
; ALL:       # BB#0:
; ALL-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
  ret <4 x float> %2
}
2620
2621; These tests are designed to test the ability to combine away unnecessary
2622; operations feeding into a shuffle. The AVX cases are the important ones as
2623; they leverage operations which cannot be done naturally on the entire vector
2624; and thus are decomposed into multiple smaller operations.
2625
; The shuffle reads only elements 4-7 of the add result (reversed, and
; repeated in both 128-bit halves of the output), so elements 0-3 of the add
; are never needed. The non-VEX and AVX1 expectations perform the add on a
; single 128-bit register; the AVX2 expectation still adds the full 256-bit
; register and then applies one cross-lane permute.
define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) {
; SSE-LABEL: combine_unneeded_subvector1:
; SSE:       # BB#0:
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0]
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_unneeded_subvector1:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_unneeded_subvector1:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
  %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i32> %c
}
2652
; The shuffle produces <b7, b6, b5, b4, c7, c6, c5, c4>, where %c is the add
; result: only elements 4-7 of each input are read, so elements 0-3 of the add
; are never needed. The non-VEX and AVX1 expectations perform the add on a
; single 128-bit register; the AVX2 expectation adds the full 256-bit register
; before the 128-bit lane permute and in-lane reversal.
define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: combine_unneeded_subvector2:
; SSE:       # BB#0:
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_unneeded_subvector2:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_unneeded_subvector2:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    retq
  %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
  ret <8 x i32> %d
}
2680
; %c is <a0, b2, a2, b0>; %d then selects <b2, a1, a2, a3>, i.e. %a with
; element 2 of %b inserted into lane 0. The SSE4.1 and AVX runs expect a
; single (v)insertps; the SSE2 and SSSE3 runs expect two shufps plus a
; register copy.
define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps1:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps1:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps1:
; SSE41:       # BB#0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps1:
; AVX:       # BB#0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 6, i32 2, i32 4>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
  ret <4 x float> %d
}
2710
; %c is <a0, a1, b2, b3>; %d then selects <a0, b2, a2, a3>, i.e. %a with
; element 2 of %b inserted into lane 1. The SSE4.1 and AVX runs expect a
; single (v)insertps; the SSE2 and SSSE3 runs expect two shufps plus a
; register copy.
define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps2:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps2:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps2:
; SSE41:       # BB#0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps2:
; AVX:       # BB#0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 1, i32 6, i32 7>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
  ret <4 x float> %d
}
2740
; %c is <a0, b0, a2, b1>; %d then selects <a0, a1, b0, a3>, i.e. %a with
; element 0 of %b inserted into lane 2. The SSE4.1 and AVX runs expect a
; single (v)insertps; the SSE2 and SSSE3 runs expect two shufps.
define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps3:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps3:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps3:
; SSE41:       # BB#0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps3:
; AVX:       # BB#0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3>
  ret <4 x float> %d
}
2768
; %c is <a0, b0, a2, b1>; %d then selects <a0, a1, a2, b0>, i.e. %a with
; element 0 of %b inserted into lane 3. The SSE4.1 and AVX runs expect a
; single (v)insertps; the SSE2 and SSSE3 runs expect two shufps.
define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps4:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps4:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps4:
; SSE41:       # BB#0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps4:
; AVX:       # BB#0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 6, i32 5>
  ret <4 x float> %d
}
2796
; NOTE(review): the name suggests a regression test for bug report PR22377;
; the original failure is not described in this file - confirm against the
; bug tracker.
;
; %s1 is <a1, a3, a1, a3> and %s2 is <a0, a2, a0, a2>. %s2 is used twice: as
; an fadd operand and as the first input of the final shuffle, which
; interleaves the low two elements of %s2 with the low two elements of the
; sum. The expected output keeps both single-input shuffles, the add, and one
; low unpack. The %b argument is unused.
define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: PR22377:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3,1,3]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; SSE-NEXT:    addps %xmm0, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: PR22377:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    retq
entry:
  %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
  %s2 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
  %r2 = fadd <4 x float> %s1, %s2
  %s3 = shufflevector <4 x float> %s2, <4 x float> %r2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x float> %s3
}
2821
; NOTE(review): the name suggests a regression test for bug report PR22390;
; the original failure is not described in this file - confirm against the
; bug tracker.
;
; %s1 is <a3, a0, a1, a2> and is used twice: as the first input of %s2 and as
; an fadd operand. %s2 is %s1 with lane 0 replaced by b0, i.e.
; <b0, a0, a1, a2>. The expected output keeps the rotate of %a as its own
; instruction, followed by a lane-0 merge with %b (movss for the SSE2 and
; SSSE3 runs, a blend for the others) and the add.
define <4 x float> @PR22390(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: PR22390:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT:    addps %xmm0, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR22390:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSSE3-NEXT:    movaps %xmm0, %xmm2
; SSSE3-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSSE3-NEXT:    addps %xmm0, %xmm2
; SSSE3-NEXT:    movaps %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR22390:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: PR22390:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
entry:
  %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
  %s2 = shufflevector <4 x float> %s1, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  %r2 = fadd <4 x float> %s1, %s2
  ret <4 x float> %r2
}
2860
; NOTE(review): the name suggests a regression test for bug report PR22412;
; the original failure is not described in this file - confirm against the
; bug tracker.
;
; %s1 is <a0, a1, b2, b3, b4, b5, b6, b7> (only the first two elements come
; from %a). %s2 permutes %s1 across the two 128-bit halves, giving
; <a1, a0, b7, b6, b5, b4, b3, b2>. Every run expects the blend of %a and %b
; to remain as its own instruction ahead of the permute sequence.
define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
; SSE2-LABEL: PR22412:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT:    movapd %xmm2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2]
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR22412:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSSE3-NEXT:    movapd %xmm2, %xmm0
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
; SSSE3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2]
; SSSE3-NEXT:    movaps %xmm3, %xmm1
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR22412:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
; SSE41-NEXT:    movapd %xmm0, %xmm1
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[3,2]
; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[3,2]
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm3, %xmm1
; SSE41-NEXT:    retq
;
; AVX1-LABEL: PR22412:
; AVX1:       # BB#0: # %entry
; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR22412:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [1,0,7,6,5,4,3,2]
; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
entry:
  %s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2>
  ret <8 x float> %s2
}
2908