1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
3; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
4; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
5; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
6; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW
7; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST
8;
9; Verify that the DAG combiner correctly folds bitwise operations across
10; shuffles, nested shuffles with undef, pairs of nested shuffles, and other
11; basic and always-safe patterns. Also test that the DAG combiner will combine
12; target-specific shuffle instructions where reasonable.
13
14target triple = "x86_64-unknown-unknown"
15
16declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
17declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
18declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
19
; Chains of x86 shuffle intrinsics that the DAG combiner must fold.
; Immediate 27 (0b00011011) selects [3,2,1,0] — a reversal — so applying it
; twice is the identity; immediate -28 (0xE4 = [0,1,2,3]) is already the
; identity mask.

; Two reversing pshufds cancel: expect no shuffle instruction at all.
20define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
21; CHECK-LABEL: combine_pshufd1:
22; CHECK:       # %bb.0: # %entry
23; CHECK-NEXT:    retq
24entry:
25  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
26  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27)
27  ret <4 x i32> %c
28}
29
; Identity pshuflw (0xE4) sandwiched between two reversing pshufds: no-op.
30define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
31; CHECK-LABEL: combine_pshufd2:
32; CHECK:       # %bb.0: # %entry
33; CHECK-NEXT:    retq
34entry:
35  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
36  %b.cast = bitcast <4 x i32> %b to <8 x i16>
37  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28)
38  %c.cast = bitcast <8 x i16> %c to <4 x i32>
39  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
40  ret <4 x i32> %d
41}
42
; Same as combine_pshufd2 but with an identity pshufhw in the middle: no-op.
43define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
44; CHECK-LABEL: combine_pshufd3:
45; CHECK:       # %bb.0: # %entry
46; CHECK-NEXT:    retq
47entry:
48  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
49  %b.cast = bitcast <4 x i32> %b to <8 x i16>
50  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28)
51  %c.cast = bitcast <8 x i16> %c to <4 x i32>
52  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -28)
53  ret <4 x i32> %d
54}
55
; The outer pshufd(0xE1) pair cancels, leaving a single pshufhw that
; reverses the four high words.
56define <4 x i32> @combine_pshufd4(<4 x i32> %a) {
57; SSE-LABEL: combine_pshufd4:
58; SSE:       # %bb.0: # %entry
59; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
60; SSE-NEXT:    retq
61;
62; AVX-LABEL: combine_pshufd4:
63; AVX:       # %bb.0: # %entry
64; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
65; AVX-NEXT:    retq
66entry:
67  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31)
68  %b.cast = bitcast <4 x i32> %b to <8 x i16>
69  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27)
70  %c.cast = bitcast <8 x i16> %c to <4 x i32>
71  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31)
72  ret <4 x i32> %d
73}
74
; The outer pshufd(0xB4) pair cancels, leaving a single pshuflw that
; reverses the four low words.
75define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
76; SSE-LABEL: combine_pshufd5:
77; SSE:       # %bb.0: # %entry
78; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
79; SSE-NEXT:    retq
80;
81; AVX-LABEL: combine_pshufd5:
82; AVX:       # %bb.0: # %entry
83; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
84; AVX-NEXT:    retq
85entry:
86  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76)
87  %b.cast = bitcast <4 x i32> %b to <8 x i16>
88  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27)
89  %c.cast = bitcast <8 x i16> %c to <4 x i32>
90  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76)
91  ret <4 x i32> %d
92}
93
; Both shuffles only ever read element 0, so the pair folds to a single
; splat (pshufd on SSE, vpermilps on AVX1, vbroadcastss on AVX2).
94define <4 x i32> @combine_pshufd6(<4 x i32> %a) {
95; SSE-LABEL: combine_pshufd6:
96; SSE:       # %bb.0: # %entry
97; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
98; SSE-NEXT:    retq
99;
100; AVX1-LABEL: combine_pshufd6:
101; AVX1:       # %bb.0: # %entry
102; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
103; AVX1-NEXT:    retq
104;
105; AVX2-LABEL: combine_pshufd6:
106; AVX2:       # %bb.0: # %entry
107; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
108; AVX2-NEXT:    retq
109entry:
110  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0)
111  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8)
112  ret <4 x i32> %c
113}
114
; Word-shuffle chains (pshuflw/pshufhw) that must fold to at most one
; instruction. Imm 27 reverses the selected word quartet; imm -28 (0xE4)
; is the identity mask.

; Two reversing pshuflw ops cancel to a no-op.
115define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
116; CHECK-LABEL: combine_pshuflw1:
117; CHECK:       # %bb.0: # %entry
118; CHECK-NEXT:    retq
119entry:
120  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
121  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
122  ret <8 x i16> %c
123}
124
; pshuflw(27) pair cancels around an identity pshufhw(0xE4): no-op.
125define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
126; CHECK-LABEL: combine_pshuflw2:
127; CHECK:       # %bb.0: # %entry
128; CHECK-NEXT:    retq
129entry:
130  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
131  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
132  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
133  ret <8 x i16> %d
134}
135
; The pshuflw(27) pair cancels, leaving only the high-word reversal.
136define <8 x i16> @combine_pshuflw3(<8 x i16> %a) {
137; SSE-LABEL: combine_pshuflw3:
138; SSE:       # %bb.0: # %entry
139; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
140; SSE-NEXT:    retq
141;
142; AVX-LABEL: combine_pshuflw3:
143; AVX:       # %bb.0: # %entry
144; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
145; AVX-NEXT:    retq
146entry:
147  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
148  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27)
149  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
150  ret <8 x i16> %d
151}
152
; The pshufhw(27) pair cancels, leaving only the low-word reversal.
153define <8 x i16> @combine_pshufhw1(<8 x i16> %a) {
154; SSE-LABEL: combine_pshufhw1:
155; SSE:       # %bb.0: # %entry
156; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
157; SSE-NEXT:    retq
158;
159; AVX-LABEL: combine_pshufhw1:
160; AVX:       # %bb.0: # %entry
161; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
162; AVX-NEXT:    retq
163entry:
164  %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27)
165  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
166  %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27)
167  ret <8 x i16> %d
168}
169
; shuffle(%x, m) OP shuffle(%y, m) with identical single-input masks must be
; rewritten as shuffle(%x OP %y, m): one logic op plus one pshufd/vpermilps.
; In tests 1-3 the mask reads only the first shuffle operand; in tests 4-6
; only the second (%a/%b sit in lanes 4-7). %c is dead either way.

170define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
171; SSE-LABEL: combine_bitwise_ops_test1:
172; SSE:       # %bb.0:
173; SSE-NEXT:    pand %xmm1, %xmm0
174; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
175; SSE-NEXT:    retq
176;
177; AVX-LABEL: combine_bitwise_ops_test1:
178; AVX:       # %bb.0:
179; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
180; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
181; AVX-NEXT:    retq
182  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
183  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
184  %and = and <4 x i32> %shuf1, %shuf2
185  ret <4 x i32> %and
186}
187
; Same fold with OR.
188define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
189; SSE-LABEL: combine_bitwise_ops_test2:
190; SSE:       # %bb.0:
191; SSE-NEXT:    por %xmm1, %xmm0
192; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
193; SSE-NEXT:    retq
194;
195; AVX-LABEL: combine_bitwise_ops_test2:
196; AVX:       # %bb.0:
197; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
198; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
199; AVX-NEXT:    retq
200  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
201  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
202  %or = or <4 x i32> %shuf1, %shuf2
203  ret <4 x i32> %or
204}
205
; Same fold with XOR.
206define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
207; SSE-LABEL: combine_bitwise_ops_test3:
208; SSE:       # %bb.0:
209; SSE-NEXT:    pxor %xmm1, %xmm0
210; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
211; SSE-NEXT:    retq
212;
213; AVX-LABEL: combine_bitwise_ops_test3:
214; AVX:       # %bb.0:
215; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
216; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
217; AVX-NEXT:    retq
218  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
219  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
220  %xor = xor <4 x i32> %shuf1, %shuf2
221  ret <4 x i32> %xor
222}
223
; AND variant with %a/%b as the second shuffle operand (lanes 4-7).
224define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
225; SSE-LABEL: combine_bitwise_ops_test4:
226; SSE:       # %bb.0:
227; SSE-NEXT:    pand %xmm1, %xmm0
228; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
229; SSE-NEXT:    retq
230;
231; AVX-LABEL: combine_bitwise_ops_test4:
232; AVX:       # %bb.0:
233; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
234; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
235; AVX-NEXT:    retq
236  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
237  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
238  %and = and <4 x i32> %shuf1, %shuf2
239  ret <4 x i32> %and
240}
241
; OR variant with %a/%b as the second shuffle operand.
242define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
243; SSE-LABEL: combine_bitwise_ops_test5:
244; SSE:       # %bb.0:
245; SSE-NEXT:    por %xmm1, %xmm0
246; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
247; SSE-NEXT:    retq
248;
249; AVX-LABEL: combine_bitwise_ops_test5:
250; AVX:       # %bb.0:
251; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
252; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
253; AVX-NEXT:    retq
254  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
255  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
256  %or = or <4 x i32> %shuf1, %shuf2
257  ret <4 x i32> %or
258}
259
; XOR variant with %a/%b as the second shuffle operand.
260define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
261; SSE-LABEL: combine_bitwise_ops_test6:
262; SSE:       # %bb.0:
263; SSE-NEXT:    pxor %xmm1, %xmm0
264; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
265; SSE-NEXT:    retq
266;
267; AVX-LABEL: combine_bitwise_ops_test6:
268; AVX:       # %bb.0:
269; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
270; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
271; AVX-NEXT:    retq
272  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
273  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
274  %xor = xor <4 x i32> %shuf1, %shuf2
275  ret <4 x i32> %xor
276}
277
278
279; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles
280; are not performing a swizzle operation.
281
; Here both shuffles blend their first operand with the SAME vector %c
; (mask <0,5,2,7> keeps lanes 0/2 of the first operand and lanes 1/3 of %c).
; The logic op must still be hoisted above a single blend. For XOR, the
; shared %c lanes cancel to zero, so only a mask/blend-with-zero remains.

282define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
283; SSE2-LABEL: combine_bitwise_ops_test1b:
284; SSE2:       # %bb.0:
285; SSE2-NEXT:    pand %xmm1, %xmm0
286; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
287; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
288; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
289; SSE2-NEXT:    retq
290;
291; SSSE3-LABEL: combine_bitwise_ops_test1b:
292; SSSE3:       # %bb.0:
293; SSSE3-NEXT:    pand %xmm1, %xmm0
294; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
295; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
296; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
297; SSSE3-NEXT:    retq
298;
299; SSE41-LABEL: combine_bitwise_ops_test1b:
300; SSE41:       # %bb.0:
301; SSE41-NEXT:    andps %xmm1, %xmm0
302; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
303; SSE41-NEXT:    retq
304;
305; AVX-LABEL: combine_bitwise_ops_test1b:
306; AVX:       # %bb.0:
307; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
308; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
309; AVX-NEXT:    retq
310  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
311  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
312  %and = and <4 x i32> %shuf1, %shuf2
313  ret <4 x i32> %and
314}
315
; OR variant of the blend fold.
316define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
317; SSE2-LABEL: combine_bitwise_ops_test2b:
318; SSE2:       # %bb.0:
319; SSE2-NEXT:    por %xmm1, %xmm0
320; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
321; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
322; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
323; SSE2-NEXT:    retq
324;
325; SSSE3-LABEL: combine_bitwise_ops_test2b:
326; SSSE3:       # %bb.0:
327; SSSE3-NEXT:    por %xmm1, %xmm0
328; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
329; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
330; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
331; SSSE3-NEXT:    retq
332;
333; SSE41-LABEL: combine_bitwise_ops_test2b:
334; SSE41:       # %bb.0:
335; SSE41-NEXT:    orps %xmm1, %xmm0
336; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
337; SSE41-NEXT:    retq
338;
339; AVX-LABEL: combine_bitwise_ops_test2b:
340; AVX:       # %bb.0:
341; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
342; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
343; AVX-NEXT:    retq
344  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
345  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
346  %or = or <4 x i32> %shuf1, %shuf2
347  ret <4 x i32> %or
348}
349
; XOR: the identical %c lanes become zero, so the blend degenerates to a
; constant AND mask (SSE2/SSSE3) or a blend against a zeroed register.
350define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
351; SSE2-LABEL: combine_bitwise_ops_test3b:
352; SSE2:       # %bb.0:
353; SSE2-NEXT:    xorps %xmm1, %xmm0
354; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
355; SSE2-NEXT:    retq
356;
357; SSSE3-LABEL: combine_bitwise_ops_test3b:
358; SSSE3:       # %bb.0:
359; SSSE3-NEXT:    xorps %xmm1, %xmm0
360; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
361; SSSE3-NEXT:    retq
362;
363; SSE41-LABEL: combine_bitwise_ops_test3b:
364; SSE41:       # %bb.0:
365; SSE41-NEXT:    xorps %xmm1, %xmm0
366; SSE41-NEXT:    xorps %xmm1, %xmm1
367; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
368; SSE41-NEXT:    retq
369;
370; AVX-LABEL: combine_bitwise_ops_test3b:
371; AVX:       # %bb.0:
372; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
373; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
374; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
375; AVX-NEXT:    retq
376  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
377  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
378  %xor = xor <4 x i32> %shuf1, %shuf2
379  ret <4 x i32> %xor
380}
381
; AND with the %c lanes in positions 0/2 instead (mask keeps %c lanes 0,2
; and %a/%b lanes 1,3).
382define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
383; SSE2-LABEL: combine_bitwise_ops_test4b:
384; SSE2:       # %bb.0:
385; SSE2-NEXT:    pand %xmm1, %xmm0
386; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
387; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
388; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
389; SSE2-NEXT:    retq
390;
391; SSSE3-LABEL: combine_bitwise_ops_test4b:
392; SSSE3:       # %bb.0:
393; SSSE3-NEXT:    pand %xmm1, %xmm0
394; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
395; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
396; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
397; SSSE3-NEXT:    retq
398;
399; SSE41-LABEL: combine_bitwise_ops_test4b:
400; SSE41:       # %bb.0:
401; SSE41-NEXT:    andps %xmm1, %xmm0
402; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
403; SSE41-NEXT:    retq
404;
405; AVX-LABEL: combine_bitwise_ops_test4b:
406; AVX:       # %bb.0:
407; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
408; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
409; AVX-NEXT:    retq
410  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
411  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
412  %and = and <4 x i32> %shuf1, %shuf2
413  ret <4 x i32> %and
414}
415
; OR variant with the %c lanes in positions 0/2.
416define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
417; SSE2-LABEL: combine_bitwise_ops_test5b:
418; SSE2:       # %bb.0:
419; SSE2-NEXT:    por %xmm1, %xmm0
420; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
421; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
422; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
423; SSE2-NEXT:    retq
424;
425; SSSE3-LABEL: combine_bitwise_ops_test5b:
426; SSSE3:       # %bb.0:
427; SSSE3-NEXT:    por %xmm1, %xmm0
428; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
429; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
430; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
431; SSSE3-NEXT:    retq
432;
433; SSE41-LABEL: combine_bitwise_ops_test5b:
434; SSE41:       # %bb.0:
435; SSE41-NEXT:    orps %xmm1, %xmm0
436; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
437; SSE41-NEXT:    retq
438;
439; AVX-LABEL: combine_bitwise_ops_test5b:
440; AVX:       # %bb.0:
441; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
442; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
443; AVX-NEXT:    retq
444  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
445  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
446  %or = or <4 x i32> %shuf1, %shuf2
447  ret <4 x i32> %or
448}
449
; XOR variant: the shared %c lanes (positions 0/2) cancel to zero.
450define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
451; SSE2-LABEL: combine_bitwise_ops_test6b:
452; SSE2:       # %bb.0:
453; SSE2-NEXT:    xorps %xmm1, %xmm0
454; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
455; SSE2-NEXT:    retq
456;
457; SSSE3-LABEL: combine_bitwise_ops_test6b:
458; SSSE3:       # %bb.0:
459; SSSE3-NEXT:    xorps %xmm1, %xmm0
460; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
461; SSSE3-NEXT:    retq
462;
463; SSE41-LABEL: combine_bitwise_ops_test6b:
464; SSE41:       # %bb.0:
465; SSE41-NEXT:    xorps %xmm1, %xmm0
466; SSE41-NEXT:    xorps %xmm1, %xmm1
467; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
468; SSE41-NEXT:    retq
469;
470; AVX-LABEL: combine_bitwise_ops_test6b:
471; AVX:       # %bb.0:
472; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
473; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
474; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
475; AVX-NEXT:    retq
476  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
477  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
478  %xor = xor <4 x i32> %shuf1, %shuf2
479  ret <4 x i32> %xor
480}
481
; Same hoisting pattern with shufps-style masks <0,2,5,7> that take two
; elements from each source: one logic op plus one shufps should remain.
; XOR variants zero the shared %c half, collapsing to insertps/shufps
; against a zeroed register.

482define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
483; SSE-LABEL: combine_bitwise_ops_test1c:
484; SSE:       # %bb.0:
485; SSE-NEXT:    andps %xmm1, %xmm0
486; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
487; SSE-NEXT:    retq
488;
489; AVX-LABEL: combine_bitwise_ops_test1c:
490; AVX:       # %bb.0:
491; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
492; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
493; AVX-NEXT:    retq
494  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
495  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
496  %and = and <4 x i32> %shuf1, %shuf2
497  ret <4 x i32> %and
498}
499
; OR variant of the shufps fold.
500define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
501; SSE-LABEL: combine_bitwise_ops_test2c:
502; SSE:       # %bb.0:
503; SSE-NEXT:    orps %xmm1, %xmm0
504; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
505; SSE-NEXT:    retq
506;
507; AVX-LABEL: combine_bitwise_ops_test2c:
508; AVX:       # %bb.0:
509; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
510; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
511; AVX-NEXT:    retq
512  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
513  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
514  %or = or <4 x i32> %shuf1, %shuf2
515  ret <4 x i32> %or
516}
517
; XOR: the %c half cancels to zero, so the upper two lanes come from a
; zero register (insertps with zeroed lanes on SSE4.1/AVX).
518define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
519; SSE2-LABEL: combine_bitwise_ops_test3c:
520; SSE2:       # %bb.0:
521; SSE2-NEXT:    xorps %xmm1, %xmm0
522; SSE2-NEXT:    xorps %xmm1, %xmm1
523; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
524; SSE2-NEXT:    retq
525;
526; SSSE3-LABEL: combine_bitwise_ops_test3c:
527; SSSE3:       # %bb.0:
528; SSSE3-NEXT:    xorps %xmm1, %xmm0
529; SSSE3-NEXT:    xorps %xmm1, %xmm1
530; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
531; SSSE3-NEXT:    retq
532;
533; SSE41-LABEL: combine_bitwise_ops_test3c:
534; SSE41:       # %bb.0:
535; SSE41-NEXT:    xorps %xmm1, %xmm0
536; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
537; SSE41-NEXT:    retq
538;
539; AVX-LABEL: combine_bitwise_ops_test3c:
540; AVX:       # %bb.0:
541; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
542; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
543; AVX-NEXT:    retq
544  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
545  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
546  %xor = xor <4 x i32> %shuf1, %shuf2
547  ret <4 x i32> %xor
548}
549
; AND with %c providing the low half (operand order swapped).
550define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
551; SSE-LABEL: combine_bitwise_ops_test4c:
552; SSE:       # %bb.0:
553; SSE-NEXT:    andps %xmm1, %xmm0
554; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
555; SSE-NEXT:    movaps %xmm2, %xmm0
556; SSE-NEXT:    retq
557;
558; AVX-LABEL: combine_bitwise_ops_test4c:
559; AVX:       # %bb.0:
560; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
561; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
562; AVX-NEXT:    retq
563  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
564  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
565  %and = and <4 x i32> %shuf1, %shuf2
566  ret <4 x i32> %and
567}
568
; OR with %c providing the low half.
569define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
570; SSE-LABEL: combine_bitwise_ops_test5c:
571; SSE:       # %bb.0:
572; SSE-NEXT:    orps %xmm1, %xmm0
573; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
574; SSE-NEXT:    movaps %xmm2, %xmm0
575; SSE-NEXT:    retq
576;
577; AVX-LABEL: combine_bitwise_ops_test5c:
578; AVX:       # %bb.0:
579; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
580; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
581; AVX-NEXT:    retq
582  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
583  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
584  %or = or <4 x i32> %shuf1, %shuf2
585  ret <4 x i32> %or
586}
587
; XOR with %c providing the low half: that half cancels to zero.
588define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
589; SSE2-LABEL: combine_bitwise_ops_test6c:
590; SSE2:       # %bb.0:
591; SSE2-NEXT:    xorps %xmm1, %xmm0
592; SSE2-NEXT:    xorps %xmm1, %xmm1
593; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
594; SSE2-NEXT:    movaps %xmm1, %xmm0
595; SSE2-NEXT:    retq
596;
597; SSSE3-LABEL: combine_bitwise_ops_test6c:
598; SSSE3:       # %bb.0:
599; SSSE3-NEXT:    xorps %xmm1, %xmm0
600; SSSE3-NEXT:    xorps %xmm1, %xmm1
601; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
602; SSSE3-NEXT:    movaps %xmm1, %xmm0
603; SSSE3-NEXT:    retq
604;
605; SSE41-LABEL: combine_bitwise_ops_test6c:
606; SSE41:       # %bb.0:
607; SSE41-NEXT:    xorps %xmm1, %xmm0
608; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3]
609; SSE41-NEXT:    retq
610;
611; AVX-LABEL: combine_bitwise_ops_test6c:
612; AVX:       # %bb.0:
613; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
614; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3]
615; AVX-NEXT:    retq
616  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
617  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
618  %xor = xor <4 x i32> %shuf1, %shuf2
619  ret <4 x i32> %xor
620}
621
; A shuffle of a shuffle whose second operand is undef must fold to a single
; shuffle (tests 1-12) or to a plain copy of one input (tests 13-14). In
; each test the outer mask either never reads the lanes that came from the
; other source, or reads undef lanes (index 4 of a single-input shuffle),
; so one pshufd/vpermilps (or a splat/movddup) suffices.

622define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) {
623; SSE-LABEL: combine_nested_undef_test1:
624; SSE:       # %bb.0:
625; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
626; SSE-NEXT:    retq
627;
628; AVX-LABEL: combine_nested_undef_test1:
629; AVX:       # %bb.0:
630; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
631; AVX-NEXT:    retq
632  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
633  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
634  ret <4 x i32> %2
635}
636
637define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) {
638; SSE-LABEL: combine_nested_undef_test2:
639; SSE:       # %bb.0:
640; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
641; SSE-NEXT:    retq
642;
643; AVX-LABEL: combine_nested_undef_test2:
644; AVX:       # %bb.0:
645; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
646; AVX-NEXT:    retq
647  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
648  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
649  ret <4 x i32> %2
650}
651
652define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) {
653; SSE-LABEL: combine_nested_undef_test3:
654; SSE:       # %bb.0:
655; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
656; SSE-NEXT:    retq
657;
658; AVX-LABEL: combine_nested_undef_test3:
659; AVX:       # %bb.0:
660; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
661; AVX-NEXT:    retq
662  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
663  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
664  ret <4 x i32> %2
665}
666
; Only lanes 0/1 of %A survive, so AVX2 can use a 64-bit splat (movddup).
667define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) {
668; SSE-LABEL: combine_nested_undef_test4:
669; SSE:       # %bb.0:
670; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
671; SSE-NEXT:    retq
672;
673; AVX1-LABEL: combine_nested_undef_test4:
674; AVX1:       # %bb.0:
675; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
676; AVX1-NEXT:    retq
677;
678; AVX2-LABEL: combine_nested_undef_test4:
679; AVX2:       # %bb.0:
680; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
681; AVX2-NEXT:    retq
682  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
683  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3>
684  ret <4 x i32> %2
685}
686
687define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) {
688; SSE-LABEL: combine_nested_undef_test5:
689; SSE:       # %bb.0:
690; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
691; SSE-NEXT:    retq
692;
693; AVX-LABEL: combine_nested_undef_test5:
694; AVX:       # %bb.0:
695; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
696; AVX-NEXT:    retq
697  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
698  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
699  ret <4 x i32> %2
700}
701
702define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) {
703; SSE-LABEL: combine_nested_undef_test6:
704; SSE:       # %bb.0:
705; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
706; SSE-NEXT:    retq
707;
708; AVX-LABEL: combine_nested_undef_test6:
709; AVX:       # %bb.0:
710; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
711; AVX-NEXT:    retq
712  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
713  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
714  ret <4 x i32> %2
715}
716
717define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) {
718; SSE-LABEL: combine_nested_undef_test7:
719; SSE:       # %bb.0:
720; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
721; SSE-NEXT:    retq
722;
723; AVX-LABEL: combine_nested_undef_test7:
724; AVX:       # %bb.0:
725; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
726; AVX-NEXT:    retq
727  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
728  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
729  ret <4 x i32> %2
730}
731
732define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) {
733; SSE-LABEL: combine_nested_undef_test8:
734; SSE:       # %bb.0:
735; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
736; SSE-NEXT:    retq
737;
738; AVX-LABEL: combine_nested_undef_test8:
739; AVX:       # %bb.0:
740; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3]
741; AVX-NEXT:    retq
742  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
743  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
744  ret <4 x i32> %2
745}
746
747define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) {
748; SSE-LABEL: combine_nested_undef_test9:
749; SSE:       # %bb.0:
750; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
751; SSE-NEXT:    retq
752;
753; AVX-LABEL: combine_nested_undef_test9:
754; AVX:       # %bb.0:
755; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,2]
756; AVX-NEXT:    retq
757  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
758  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
759  ret <4 x i32> %2
760}
761
762define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) {
763; SSE-LABEL: combine_nested_undef_test10:
764; SSE:       # %bb.0:
765; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
766; SSE-NEXT:    retq
767;
768; AVX-LABEL: combine_nested_undef_test10:
769; AVX:       # %bb.0:
770; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
771; AVX-NEXT:    retq
772  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
773  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
774  ret <4 x i32> %2
775}
776
777define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) {
778; SSE-LABEL: combine_nested_undef_test11:
779; SSE:       # %bb.0:
780; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
781; SSE-NEXT:    retq
782;
783; AVX-LABEL: combine_nested_undef_test11:
784; AVX:       # %bb.0:
785; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,1]
786; AVX-NEXT:    retq
787  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
788  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
789  ret <4 x i32> %2
790}
791
; Composes to a splat of element 0, so AVX2 can use vbroadcastss.
792define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) {
793; SSE-LABEL: combine_nested_undef_test12:
794; SSE:       # %bb.0:
795; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
796; SSE-NEXT:    retq
797;
798; AVX1-LABEL: combine_nested_undef_test12:
799; AVX1:       # %bb.0:
800; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
801; AVX1-NEXT:    retq
802;
803; AVX2-LABEL: combine_nested_undef_test12:
804; AVX2:       # %bb.0:
805; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
806; AVX2-NEXT:    retq
807  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
808  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
809  ret <4 x i32> %2
810}
811
812; The following pair of shuffles is folded into vector %A.
813define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) {
814; CHECK-LABEL: combine_nested_undef_test13:
815; CHECK:       # %bb.0:
816; CHECK-NEXT:    retq
817  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
818  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
819  ret <4 x i32> %2
820}
821
822; The following pair of shuffles is folded into vector %B.
823define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) {
824; SSE-LABEL: combine_nested_undef_test14:
825; SSE:       # %bb.0:
826; SSE-NEXT:    movaps %xmm1, %xmm0
827; SSE-NEXT:    retq
828;
829; AVX-LABEL: combine_nested_undef_test14:
830; AVX:       # %bb.0:
831; AVX-NEXT:    vmovaps %xmm1, %xmm0
832; AVX-NEXT:    retq
833  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
834  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
835  ret <4 x i32> %2
836}
837
838
839; Verify that we don't optimize the following cases. We expect more than one shuffle.
840;
841; FIXME: Many of these already don't make sense, and the rest should stop
; making sense with the new vector shuffle lowering. Revisit at least testing for
843; it.
844
; Lanes from both %A and %B survive the second mask, so this cannot collapse
; to a single shuffle; expect a blend/unpack plus shuffle(s).
define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test15:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test15:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test15:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test15:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test15:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

; Likewise both inputs contribute lanes; expect shuffle + blend/unpack.
define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

; Lane 0 of %B plus three lanes of %A survive; lowered as blend + shuffle.
define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test17:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test17:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test17:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test17:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}
944
; The second mask only reads lanes that came from %B, so the pair folds to a
; single shuffle of %B.
define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test18:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test18:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[1,1,0,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

; Lanes of both inputs survive; expect blend/unpack followed by a shuffle.
define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test19:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test19:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test19:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test19:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
  ret <4 x i32> %2
}

; Not combinable to one shuffle: both inputs contribute lanes to the result.
define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test20:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test20:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test20:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test20:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,3,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

; Both inputs contribute; with AVX2 the final duplicate of the low 64 bits is
; done with vmovddup.
define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test21:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test21:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test21:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test21:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test21:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}
1054
1055
1056; Test that we correctly combine shuffles according to rule
1057;  shuffle(shuffle(x, y), undef) -> shuffle(y, undef)
1058
; Only %B lanes survive the second mask: combined into one shuffle of %B.
define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test22:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test22:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[1,1,1,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

; Same combine, different second mask; result is one shuffle of %B.
define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test23:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test23:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[0,1,0,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

; Index 4 in the second mask is an undef lane; still folds to one shuffle of %B.
define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test24:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test24:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[0,3,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4>
  ret <4 x i32> %2
}

; Operands are swapped here (%B is the first operand), so the combined
; shuffle reads %A, which is in xmm0; AVX2 can use vmovddup.
define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test25:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test25:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test25:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1>
  ret <4 x i32> %2
}

; Again %A is the second operand of the first shuffle; folds to one shuffle of %A.
define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test26:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test26:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
  ret <4 x i32> %2
}

; Folds to a duplicate of the low 64 bits of %A; AVX2 again uses vmovddup.
define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test27:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test27:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test27:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
  ret <4 x i32> %2
}

; Folds to one shuffle of %A ([0,1,1,0]); %B lanes are dropped by the second mask.
define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test28:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test28:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2>
  ret <4 x i32> %2
}
1173
; Two-shuffle chains where the second shuffle re-reads %b: the pair combines
; into a plain register move of %b.
define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %2
}

; Combines into a single blend of element 0 of %a into %b (movss pre-SSE4.1).
define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test2:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x float> %2
}

; Combines into movlhps: low half of %a followed by low half of %b.
define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test3:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x float> %2
}

; Combines into a single high-half unpack: high half of %b, then high half of %a.
define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test4:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test4:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x float> %2
}

; Combines into a single blend of element 1 of %a into %b.
define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test5:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test5:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test5:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test5:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x float> %2
}

; Integer (<4 x i32>) versions of combine_test1..5; the same combines apply.
define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test6:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test6:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x i32> %2
}

define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test7:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test7:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test7:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test7:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test8:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x i32> %2
}

define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test9:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test9:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i32> %2
}

define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test10:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test10:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test10:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test10:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x i32> %2
}
1372
; The second shuffle re-selects %a in every lane, so the chain folds to the
; identity: no instructions are emitted.
define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: combine_test11:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  ret <4 x float> %2
}

; Combines into a single blend of element 0 of %a into %b.
define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test12:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test12:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test12:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test12:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %2
}

; Combines into movlhps of the low halves of %a and %b.
define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test13:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test13:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x float> %2
}

; Combines into a single high-half unpack of %a and %b.
define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test14:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test14:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x float> %2
}

; Combines into a single blend of element 1 of %a into %b.
define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test15:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test15:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test15:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test15:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  ret <4 x float> %2
}

; Integer (<4 x i32>) versions of combine_test11..15; same combines apply.
define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: combine_test16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test17:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test17:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test17:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test17:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test18:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test18:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test19:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test19:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i32> %2
}

define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test20:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test20:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test20:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test20:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  ret <4 x i32> %2
}
1558
; Splits a 256-bit input into two 128-bit halves: one half is stored through
; %ptr, the other is returned; lowered as movlhps/unpckhpd of the two xmm halves.
define <4 x i32> @combine_test21(<8 x i32> %a, <4 x i32>* %ptr) {
; SSE-LABEL: combine_test21:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    movaps %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test21:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX-NEXT:    vmovaps %xmm2, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  store <4 x i32> %1, <4 x i32>* %ptr, align 16
  ret <4 x i32> %2
}

; Two 64-bit loads concatenated into a 128-bit result via movsd + movhps;
; the high four lanes of the <8 x float> result are undef.
define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) {
; SSE-LABEL: combine_test22:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test22:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; AVX-NEXT:    retq
; Current AVX2 lowering of this is still awful, not adding a test case.
  %1 = load <2 x float>, <2 x float>* %a, align 8
  %2 = load <2 x float>, <2 x float>* %b, align 8
  %3 = shufflevector <2 x float> %1, <2 x float> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x float> %3
}

; PR22359
; The two adjacent 64-bit stores of the extracted halves merge into one
; unaligned 128-bit store.
define void @combine_test23(<8 x float> %v, <2 x float>* %ptr) {
; SSE-LABEL: combine_test23:
; SSE:       # %bb.0:
; SSE-NEXT:    movups %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test23:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups %xmm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %idx2 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 1
  %shuffle0 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 0, i32 1>
  %shuffle1 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 2, i32 3>
  store <2 x float> %shuffle0, <2 x float>* %ptr, align 8
  store <2 x float> %shuffle1, <2 x float>* %idx2, align 8
  ret void
}
1620
1621; Check some negative cases.
1622; FIXME: Do any of these really make sense? Are they redundant with the above tests?
1623
; Composing the two masks gives b[0,1,2,0] — only %b elements survive — so
; the pair should fold to a single permute of %b.
define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test1b:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test1b:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0>
  ret <4 x float> %2
}
1639
; Composing the two masks gives b[0,1,0,1] — a 64-bit splat of the low half
; of %b — which should fold to movddup (movlhps on plain SSE2).
define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test2b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test2b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test2b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test2b:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm1[0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5>
  ret <4 x float> %2
}
1665
; Composed result is <a0,b3,b2,b3>, which mixes both inputs in a pattern no
; single x86 shuffle covers; expect it to stay a two-instruction sequence
; (blend + permute on SSE4.1/AVX, shufps pair on SSE2/SSSE3).
define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test3b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test3b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test3b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test3b:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7>
  ret <4 x float> %2
}
1694
; Composing the two masks gives b[1,1,2,3] — a pure permute of %b — so the
; pair should fold to a single shufps/vpermilps.
define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test4b:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test4b:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7>
  ret <4 x float> %2
}
1710
1711
1712; Verify that we correctly fold shuffles even when we use illegal vector types.
1713
; Illegal <4 x i8> type: the composed shuffle is <a0,b1,b2,b3>, i.e. a blend
; taking only element 0 from %a; expect a single byte-blend (pblendvb, or the
; and/andn/or select expansion on SSE2).
define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test1c:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT:    andps %xmm0, %xmm2
; SSE2-NEXT:    andnps %xmm1, %xmm0
; SSE2-NEXT:    orps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test1c:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test1c:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE41-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test1c:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x i8> %2
}
1755
; Illegal <4 x i8> type: the composed shuffle is <a0,a1,b0,b1> — the two low
; halves concatenated — which should fold to a single unpack.
define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE-LABEL: combine_test2c:
; SSE:       # %bb.0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test2c:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x i8> %2
}
1776
; Illegal <4 x i8> type: the composed shuffle is <b2,b3,a2,a3> — the two high
; halves concatenated — expect an unpack plus one permute, not four scalar ops.
define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE-LABEL: combine_test3c:
; SSE:       # %bb.0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test3c:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX-NEXT:    retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i8> %2
}
1799
; Illegal <4 x i8> type: the composed shuffle is <b0,a1,b2,b3> — a blend
; taking only element 1 from %a; expect a single byte-blend (pblendvb, or the
; and/andn/or select expansion on SSE2).
define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test4c:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT:    andps %xmm0, %xmm2
; SSE2-NEXT:    andnps %xmm1, %xmm0
; SSE2-NEXT:    orps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test4c:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,3,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test4c:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE41-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test4c:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x i8> %2
}
1841
1842
1843; The following test cases are generated from this C++ code
1844;
1845;__m128 blend_01(__m128 a, __m128 b)
1846;{
1847;  __m128 s = a;
1848;  s = _mm_blend_ps( s, b, 1<<0 );
1849;  s = _mm_blend_ps( s, b, 1<<1 );
1850;  return s;
1851;}
1852;
1853;__m128 blend_02(__m128 a, __m128 b)
1854;{
1855;  __m128 s = a;
1856;  s = _mm_blend_ps( s, b, 1<<0 );
1857;  s = _mm_blend_ps( s, b, 1<<2 );
1858;  return s;
1859;}
1860;
1861;__m128 blend_123(__m128 a, __m128 b)
1862;{
1863;  __m128 s = a;
1864;  s = _mm_blend_ps( s, b, 1<<1 );
1865;  s = _mm_blend_ps( s, b, 1<<2 );
1866;  s = _mm_blend_ps( s, b, 1<<3 );
1867;  return s;
1868;}
1869
1870; Ideally, we should collapse the following shuffles into a single one.
1871
; Two chained blends (taking b[0] then b[1]) compose to a single blend of
; b[0,1] with a[2,3]; expect one blendps/movsd, not two.
define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_01:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_blend_01:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_blend_01:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_blend_01:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  ret <4 x float> %shuffle6
}
1896
; Two chained blends compose to a single blend taking elements 0 and 2 from
; %b; SSE4.1/AVX fold to one blendps, pre-SSE4.1 needs a shufps pair.
define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_02:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_blend_02:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_blend_02:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_blend_02:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3>
  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x float> %shuffle6
}
1925
; Three chained blends compose to a single blend of a[0] with b[1,2,3];
; expect one blendps (movss pre-SSE4.1).
define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_123:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_blend_123:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_blend_123:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_blend_123:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
  %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x float> %shuffle12
}
1953
; The composed mask is <b2,b3,a2,a3> — the movhlps pattern — so the pair
; should fold to a single unpckhpd of %b and %a.
define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_1:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test_movhl_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}
1969
; Same composed result as movhl_1 (<b2,b3,a2,a3>) reached through different
; intermediate masks; should still fold to a single unpckhpd.
define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_2:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test_movhl_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2>
  ret <4 x i32> %2
}
1985
; A third mask combination that also composes to <b2,b3,a2,a3>; should fold
; to a single unpckhpd of %b and %a.
define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_3:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test_movhl_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2>
  ret <4 x i32> %2
}
2001
2002
2003; Verify that we fold shuffles according to rule:
2004;  (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2)
2005
; %1 has an undef operand, so the pair composes to <b0,b1,a2,a3> — a single
; blend of %a and %b.
define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
  ret <4 x float> %2
}
2030
; Composes to <a0,a1,b0,b1> — the two low halves concatenated — so the pair
; should fold to a single movlhps.
define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
  ret <4 x float> %2
}
2045
; Composes to <a0,a1,b0,undef>; the undef lane lets it still lower as a
; single movlhps.
define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test3:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x float> %2
}
2060
; Composes to <b2,b3,a2,a3> — the movhlps pattern — so the pair should fold
; to a single movhlps/unpckhpd of %b and %a.
define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test4:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test4:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x float> %2
}
2075
; Composes to <a0,a1,b2,b3> — a single blend of the low half of %a with the
; high half of %b.
define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test5:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test5:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test5:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test5:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
  ret <4 x float> %2
}
2100
2101
2102; Verify that we fold shuffles according to rule:
2103;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
2104
; Same masks as test1 but the second shuffle's other operand is %a itself;
; the composition is the identity on %a, so no instruction should be emitted.
define <4 x float> @combine_undef_input_test6(<4 x float> %a) {
; CHECK-LABEL: combine_undef_input_test6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
  ret <4 x float> %2
}
2113
; With %a as both effective inputs, the composition is <a0,a1,a0,a1> — a
; 64-bit splat — so expect a single movddup (movlhps on plain SSE2).
define <4 x float> @combine_undef_input_test7(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test7:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test7:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test7:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test7:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
  ret <4 x float> %2
}
2138
; Another mask combination that composes to the <a0,a1,a0,a1> splat; expect
; a single movddup (movlhps on plain SSE2).
define <4 x float> @combine_undef_input_test8(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x float> %2
}
2163
; Composes to <a2,a3,a2,a3> — a splat of the high 64 bits — so expect a
; single movhlps/vpermilpd.
define <4 x float> @combine_undef_input_test9(<4 x float> %a) {
; SSE-LABEL: combine_undef_input_test9:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test9:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x float> %2
}
2178
; The composition is the identity on %a, so no instruction should be emitted.
define <4 x float> @combine_undef_input_test10(<4 x float> %a) {
; CHECK-LABEL: combine_undef_input_test10:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
  ret <4 x float> %2
}
2187
; Same as test1 but with the operands of the outer shuffle commuted
; (%b first, the undef-input shuffle second); composes to <b0,b1,a2,a3>.
define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test11:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test11:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test11:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test11:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6>
  ret <4 x float> %2
}
2212
; Commuted variant of test2; composes to <a0,a1,b0,b1> and should fold to a
; single movlhps.
define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test12:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test12:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
  ret <4 x float> %2
}
2227
; Commuted variant of test3; composes to <a0,a1,b0,undef> and should fold to
; a single movlhps.
define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test13:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test13:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5>
  ret <4 x float> %2
}
2242
; Commuted variant of test4; composes to <b2,b3,a2,a3> and should fold to a
; single movhlps/unpckhpd of %b and %a.
define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test14:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test14:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x float> %2
}
2257
; Commuted variant of test5; composes to <a0,a1,b2,b3> — a single blend.
define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test15:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test15:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test15:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test15:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
  ret <4 x float> %2
}
2282
2283
2284; Verify that shuffles are canonicalized according to rules:
2285;  shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
2286;
2287; This allows to trigger the following combine rule:
2288;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
2289;
2290; As a result, all the shuffle pairs in each function below should be
2291; combined into a single legal shuffle operation.
2292
; Commuted variant of test6: the undef-input shuffle is the second operand
; and composes with %a to the identity — no instruction expected.
define <4 x float> @combine_undef_input_test16(<4 x float> %a) {
; CHECK-LABEL: combine_undef_input_test16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
  ret <4 x float> %2
}
2301
; Commuted variant of test7; composes to the <a0,a1,a0,a1> splat — expect a
; single movddup (movlhps on plain SSE2).
define <4 x float> @combine_undef_input_test17(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test17:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test17:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test17:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test17:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
  ret <4 x float> %2
}
2326
; Commuted variant of test8; composes to the <a0,a1,a0,a1> splat — expect a
; single movddup (movlhps on plain SSE2).
define <4 x float> @combine_undef_input_test18(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test18:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test18:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test18:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test18:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
  ret <4 x float> %2
}
2351
; Commuted variant of test9; composes to <a2,a3,a2,a3> — expect a single
; movhlps/vpermilpd.
define <4 x float> @combine_undef_input_test19(<4 x float> %a) {
; SSE-LABEL: combine_undef_input_test19:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test19:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x float> %2
}
2366
; Commuted variant of test10: composes to the identity on %a — no
; instruction expected.
define <4 x float> @combine_undef_input_test20(<4 x float> %a) {
; CHECK-LABEL: combine_undef_input_test20:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
  ret <4 x float> %2
}
2375
2376; These tests are designed to test the ability to combine away unnecessary
2377; operations feeding into a shuffle. The AVX cases are the important ones as
2378; they leverage operations which cannot be done naturally on the entire vector
2379; and thus are decomposed into multiple smaller operations.
2380
; The shuffle only reads elements 4-7 of the add result, so on SSE the add of
; the low 128-bit half should be eliminated (only paddd %xmm1 remains). The
; AVX1/AVX2 CHECK lines show the whole-vector add is still performed there.
define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) {
; SSE-LABEL: combine_unneeded_subvector1:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0]
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_unneeded_subvector1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: combine_unneeded_subvector1:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: combine_unneeded_subvector1:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
; AVX2-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT:    retq
  %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i32> %c
}
2416
; The mask reads only the upper halves of %b (lanes 4-7) and %c (lanes 12-15),
; so only the upper 128-bit half of the add survives; SSE adds just %xmm1.
define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: combine_unneeded_subvector2:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_unneeded_subvector2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_unneeded_subvector2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    retq
  %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
  ret <8 x i32> %d
}
2444
; The two shuffles compose to [b2, a1, a2, a3] - insertion of b[2] into lane 0
; of %a - which SSE4.1+ lowers to a single insertps.
define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps1:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 6, i32 2, i32 4>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
  ret <4 x float> %d
}
2474
; The two shuffles compose to [a0, b2, a2, a3] - insertion of b[2] into lane 1
; of %a - which SSE4.1+ lowers to a single insertps.
define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps2:
; SSE41:       # %bb.0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps2:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 1, i32 6, i32 7>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
  ret <4 x float> %d
}
2504
; The two shuffles compose to [a0, a1, b0, a3] - insertion of b[0] into lane 2
; of %a - which SSE4.1+ lowers to a single insertps.
define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps3:
; SSE41:       # %bb.0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps3:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3>
  ret <4 x float> %d
}
2532
; The two shuffles compose to [a0, a1, a2, b0] - insertion of b[0] into lane 3
; of %a - which SSE4.1+ lowers to a single insertps.
define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps4:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps4:
; SSE41:       # %bb.0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps4:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 6, i32 5>
  ret <4 x float> %d
}
2560
; A scalar double load widened with zeros, bitcast to <4 x float>, and then
; blended with a zero in lane 2 is still "low 64 bits from memory, rest zero",
; so the whole chain folds to a single movsd load before the store.
define void @combine_scalar_load_with_blend_with_zero(double* %a0, <4 x float>* %a1) {
; SSE-LABEL: combine_scalar_load_with_blend_with_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movaps %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_scalar_load_with_blend_with_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovaps %xmm0, (%rsi)
; AVX-NEXT:    retq
  %1 = load double, double* %a0, align 8
  %2 = insertelement <2 x double> undef, double %1, i32 0
  %3 = insertelement <2 x double> %2, double 0.000000e+00, i32 1
  %4 = bitcast <2 x double> %3 to <4 x float>
  %5 = shufflevector <4 x float> %4, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
  store <4 x float> %5, <4 x float>* %a1, align 16
  ret void
}
2581
2582; PR30371
; Inserting a scalar into lane 0 of undef and shuffling in constant lanes 1-3
; becomes a single blend of the scalar with a constant-pool vector (movss
; pre-SSE4.1, blendps after).
define <4 x float> @combine_constant_insertion_v4f32(float %f) {
; SSE2-LABEL: combine_constant_insertion_v4f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps {{.*#+}} xmm1 = <u,4.0E+0,5.0E+0,3.0E+0>
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_constant_insertion_v4f32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movaps {{.*#+}} xmm1 = <u,4.0E+0,5.0E+0,3.0E+0>
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_constant_insertion_v4f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_constant_insertion_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
; AVX-NEXT:    retq
  %a0 = insertelement <4 x float> undef, float %f, i32 0
  %ret = shufflevector <4 x float> %a0, <4 x float> <float undef, float 4.0, float 5.0, float 3.0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %ret
}
2611
; Integer variant of the case above: the insert+shuffle pair folds into a
; single scalar insertion against a constant-pool vector (movss pre-SSE4.1,
; pinsrd after).
define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) {
; SSE2-LABEL: combine_constant_insertion_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movd %edi, %xmm1
; SSE2-NEXT:    movaps {{.*#+}} xmm0 = <u,4,5,30>
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_constant_insertion_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movd %edi, %xmm1
; SSSE3-NEXT:    movaps {{.*#+}} xmm0 = <u,4,5,30>
; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_constant_insertion_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = <u,4,5,30>
; SSE41-NEXT:    pinsrd $0, %edi, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_constant_insertion_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,4,5,30>
; AVX-NEXT:    vpinsrd $0, %edi, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a0 = insertelement <4 x i32> undef, i32 %f, i32 0
  %ret = shufflevector <4 x i32> %a0, <4 x i32> <i32 undef, i32 4, i32 5, i32 30>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x i32> %ret
}
2642
; PR22377: interleaving the even-element shuffle %s2 with the sum %r2 of the
; odd- and even-element shuffles; SSSE3+ recognizes the odd+even add as haddps.
define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: PR22377:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; SSE2-NEXT:    addps %xmm0, %xmm1
; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR22377:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movaps %xmm0, %xmm1
; SSSE3-NEXT:    haddps %xmm0, %xmm1
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR22377:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    haddps %xmm0, %xmm1
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: PR22377:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm1
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
entry:
  %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
  %s2 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
  %r2 = fadd <4 x float> %s1, %s2
  %s3 = shufflevector <4 x float> %s2, <4 x float> %r2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x float> %s3
}
2682
; PR22390: %s2 replaces lane 0 of the rotated vector %s1 with b[0], so the
; second shuffle lowers to a movss/blendps of %b over %s1 before the add.
define <4 x float> @PR22390(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: PR22390:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT:    addps %xmm0, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR22390:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSSE3-NEXT:    movaps %xmm0, %xmm2
; SSSE3-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSSE3-NEXT:    addps %xmm0, %xmm2
; SSSE3-NEXT:    movaps %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR22390:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: PR22390:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
entry:
  %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
  %s2 = shufflevector <4 x float> %s1, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  %r2 = fadd <4 x float> %s1, %s2
  ret <4 x float> %r2
}
2721
; PR22412: a cross-lane reversal of a two-input concat; the nested 256-bit
; shuffles must combine without losing the half-swap between %a and %b.
define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: PR22412:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    movaps %xmm3, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[3,2]
; SSE-NEXT:    retq
;
; AVX1-LABEL: PR22412:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm2[3,2],ymm0[5,4],ymm2[7,6]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR22412:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6]
; AVX2-NEXT:    retq
entry:
  %s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2>
  ret <8 x float> %s2
}
2748
; PR30264: blending x[0], a literal zero, and constant lanes 4.0/1.0; SSE4.1+
; collapses the pair of shuffles to one insertps with a zeroed lane over a
; constant-pool vector.
define <4 x float> @PR30264(<4 x float> %x) {
; SSE2-LABEL: PR30264:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR30264:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    xorps %xmm1, %xmm1
; SSSE3-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR30264:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm1 = <u,u,4.0E+0,1.0E+0>
; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm0[0],zero,xmm1[2,3]
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: PR30264:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm1 = <u,u,4.0E+0,1.0E+0>
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x float> %x, <4 x float> <float undef, float 0.0, float undef, float undef>, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
  %shuf2 = shufflevector <4 x float> %shuf1, <4 x float> <float undef, float undef, float 4.0, float 1.0>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x float> %shuf2
}
2782
; PR39549: spreading bytes 8-15 into the even byte positions and then doing a
; shl-8/ashr-8 pair is a sign extension of the high 8 bytes, lowered as
; punpckhbw (byte duplicated into both halves of each word) + psraw 8.
define <8 x i16> @PR39549(<16 x i8> %x) {
; SSE-LABEL: PR39549:
; SSE:       # %bb.0:
; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: PR39549:
; AVX:       # %bb.0:
; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX-NEXT:    vpsraw $8, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 8, i32 undef, i32 9, i32 undef, i32 10, i32 undef, i32 11, i32 undef, i32 12, i32 undef, i32 13, i32 undef, i32 14, i32 undef, i32 15, i32 undef>
  %b = bitcast <16 x i8> %a to <8 x i16>
  %c = shl <8 x i16> %b, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %d = ashr <8 x i16> %c, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  ret <8 x i16> %d
}
2801
; PR41545: deinterleaving %a1 into its 0th/1st/2nd/3rd bytes, zero-extending,
; shifting each into its byte position, and OR-ing reassembles the original
; <4 x i32> bit pattern, so everything folds down to a single paddd.
define <4 x i32> @PR41545(<4 x i32> %a0, <16 x i8> %a1) {
; SSE-LABEL: PR41545:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: PR41545:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1  = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %2  = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %3  = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %4  = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %5  = zext <4 x i8> %1 to <4 x i32>
  %6  = zext <4 x i8> %2 to <4 x i32>
  %7  = zext <4 x i8> %3 to <4 x i32>
  %8  = zext <4 x i8> %4 to <4 x i32>
  %9  = shl <4 x i32> %6, <i32 8, i32 8, i32 8, i32 8>
  %10 = shl <4 x i32> %7, <i32 16, i32 16, i32 16, i32 16>
  %11 = shl <4 x i32> %8, <i32 24, i32 24, i32 24, i32 24>
  %12 = or <4 x i32> %5, %9
  %13 = or <4 x i32> %12, %10
  %14 = or <4 x i32> %13, %11
  %15 = add <4 x i32> %a0, %14
  ret <4 x i32> %15
}
2829
; The extract/insert chain builds the permutation [2,1,0,3,6,5,4,7] of %a,
; which should be recognized as a single shuffle (pshuflw+pshufhw, or one
; pshufb with fast-variable-shuffle) instead of scalar element traffic.
define <8 x i16> @shuffle_extract_insert(<8 x i16> %a) {
; SSE-LABEL: shuffle_extract_insert:
; SSE:       # %bb.0:
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; SSE-NEXT:    retq
;
; AVX1-LABEL: shuffle_extract_insert:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_extract_insert:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_extract_insert:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,0,1,6,7,12,13,10,11,8,9,14,15]
; AVX2-FAST-NEXT:    retq
  %a0 = extractelement <8 x i16> %a, i32 0
  %a1 = extractelement <8 x i16> %a, i32 1
  %a3 = extractelement <8 x i16> %a, i32 3
  %a4 = extractelement <8 x i16> %a, i32 4
  %a5 = extractelement <8 x i16> %a, i32 5
  %a6 = extractelement <8 x i16> %a, i32 6
  %a7 = extractelement <8 x i16> %a, i32 7
  %1 = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = insertelement <8 x i16> %1, i16 %a1, i32 1
  %3 = insertelement <8 x i16> %2, i16 %a0, i32 2
  %4 = insertelement <8 x i16> %3, i16 %a3, i32 3
  %5 = insertelement <8 x i16> %4, i16 %a6, i32 4
  %6 = insertelement <8 x i16> %5, i16 %a5, i32 5
  %7 = insertelement <8 x i16> %6, i16 %a4, i32 6
  %8 = insertelement <8 x i16> %7, i16 %a7, i32 7
  ret <8 x i16> %8
}
2870
; Two-input variant: the extract/insert chain interleaves even elements of %a
; (permuted) with odd elements of %b, and should combine into per-input
; shuffles followed by a single punpcklwd rather than scalar inserts.
define <8 x i16> @shuffle_extract_insert_double(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_extract_insert_double:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuffle_extract_insert_double:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_extract_insert_double:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_extract_insert_double:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    retq
  %a0 = extractelement <8 x i16> %a, i32 0
  %a4 = extractelement <8 x i16> %a, i32 4
  %a6 = extractelement <8 x i16> %a, i32 6
  %b11 = extractelement <8 x i16> %b, i32 3
  %b13 = extractelement <8 x i16> %b, i32 5
  %b15 = extractelement <8 x i16> %b, i32 7
  %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = insertelement <8 x i16> %1, i16 %a0, i32 2
  %3 = insertelement <8 x i16> %2, i16 %b11, i32 3
  %4 = insertelement <8 x i16> %3, i16 %a6, i32 4
  %5 = insertelement <8 x i16> %4, i16 %b13, i32 5
  %6 = insertelement <8 x i16> %5, i16 %a4, i32 6
  %7 = insertelement <8 x i16> %6, i16 %b15, i32 7
  ret <8 x i16> %7
}
2920
; Same pattern as shuffle_extract_insert_double, but the first input is itself
; a concat of two <4 x i16> halves; the extract/insert chain should still
; combine into shuffles over the concatenated vector plus one punpcklwd.
define <8 x i16> @shuffle_extract_concat_insert(<4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i16> %b) {
; SSE2-LABEL: shuffle_extract_concat_insert:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuffle_extract_concat_insert:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_extract_concat_insert:
; SSE41:       # %bb.0:
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_extract_concat_insert:
; AVX:       # %bb.0:
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    retq
  %a = shufflevector <4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %a0 = extractelement <8 x i16> %a, i32 0
  %a4 = extractelement <8 x i16> %a, i32 4
  %a6 = extractelement <8 x i16> %a, i32 6
  %b11 = extractelement <8 x i16> %b, i32 3
  %b13 = extractelement <8 x i16> %b, i32 5
  %b15 = extractelement <8 x i16> %b, i32 7
  %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = insertelement <8 x i16> %1, i16 %a0, i32 2
  %3 = insertelement <8 x i16> %2, i16 %b11, i32 3
  %4 = insertelement <8 x i16> %3, i16 %a6, i32 4
  %5 = insertelement <8 x i16> %4, i16 %b13, i32 5
  %6 = insertelement <8 x i16> %5, i16 %a4, i32 6
  %7 = insertelement <8 x i16> %6, i16 %b15, i32 7
  ret <8 x i16> %7
}
2975
; Mixes lanes extracted from a sign-extended memory vector, sign-extended
; scalar loads, a -5 constant lane, and undef lanes; checks the combiner
; merges the insert chain plus final shuffle without dropping any source.
define <8 x i16> @shuffle_scalar_to_vector_extract(<8 x i8>* %p0, i8* %p1, i8* %p2) {
; SSE2-LABEL: shuffle_scalar_to_vector_extract:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    pextrw $7, %xmm1, %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    movsbl (%rsi), %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT:    movsbl (%rdx), %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuffle_scalar_to_vector_extract:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT:    psraw $8, %xmm1
; SSSE3-NEXT:    movsbl (%rsi), %eax
; SSSE3-NEXT:    movd %eax, %xmm2
; SSSE3-NEXT:    palignr {{.*#+}} xmm2 = xmm1[14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; SSSE3-NEXT:    movsbl (%rdx), %eax
; SSSE3-NEXT:    movd %eax, %xmm0
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSSE3-NEXT:    pxor %xmm0, %xmm0
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_scalar_to_vector_extract:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbw (%rdi), %xmm0
; SSE41-NEXT:    pextrw $4, %xmm0, %eax
; SSE41-NEXT:    pextrw $7, %xmm0, %ecx
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    pinsrw $1, %eax, %xmm0
; SSE41-NEXT:    movl $65531, %eax # imm = 0xFFFB
; SSE41-NEXT:    pinsrw $2, %eax, %xmm0
; SSE41-NEXT:    pinsrw $4, %ecx, %xmm0
; SSE41-NEXT:    movsbl (%rsi), %eax
; SSE41-NEXT:    pinsrw $5, %eax, %xmm0
; SSE41-NEXT:    movsbl (%rdx), %eax
; SSE41-NEXT:    pinsrw $6, %eax, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_scalar_to_vector_extract:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxbw (%rdi), %xmm0
; AVX-NEXT:    vpextrw $4, %xmm0, %eax
; AVX-NEXT:    vpextrw $7, %xmm0, %ecx
; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; AVX-NEXT:    movl $65531, %eax # imm = 0xFFFB
; AVX-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; AVX-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
; AVX-NEXT:    movsbl (%rsi), %eax
; AVX-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; AVX-NEXT:    movsbl (%rdx), %eax
; AVX-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; AVX-NEXT:    retq
  %tmp = load <8 x i8>, <8 x i8>* %p0, align 1
  %tmp1 = sext <8 x i8> %tmp to <8 x i16>
  %tmp2 = load i8, i8* %p1, align 1
  %cvt1 = sext i8 %tmp2 to i16
  %tmp3 = load i8, i8* %p2, align 1
  %cvt2 = sext i8 %tmp3 to i16
  %tmp4 = extractelement <8 x i16> %tmp1, i32 4
  %tmp5 = extractelement <8 x i16> %tmp1, i32 7
  %tmp6 = insertelement <8 x i16> <i16 undef, i16 undef, i16 -5, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, i16 undef, i32 0
  %tmp7 = insertelement <8 x i16> %tmp6, i16 %tmp4, i32 1
  %tmp8 = insertelement <8 x i16> %tmp7, i16 undef, i32 3
  %tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp5, i32 4
  %tmp10 = insertelement <8 x i16> %tmp9, i16 %cvt1, i32 5
  %tmp11 = insertelement <8 x i16> %tmp10, i16 %cvt2, i32 6
  %tmp12 = insertelement <8 x i16> %tmp11, i16 undef, i32 7
  %tmp13 = shufflevector <8 x i16> %tmp12, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %tmp13
}
3062
; PR43024: folding shuffles of an fmul-by-zero chain that involves NaN
; constants; the scalar adds must survive (NaN * 0 is NaN, not zero) even
; though most of the vector work is eliminated.
define void @PR43024() {
; SSE-LABEL: PR43024:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; SSE-NEXT:    movaps %xmm0, (%rax)
; SSE-NEXT:    addss {{.*}}(%rip), %xmm0
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    movss %xmm0, (%rax)
; SSE-NEXT:    retq
;
; AVX-LABEL: PR43024:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; AVX-NEXT:    vmovaps %xmm0, (%rax)
; AVX-NEXT:    vaddss {{\.LCPI.*}}+{{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddss {{\.LCPI.*}}+{{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovss %xmm0, (%rax)
; AVX-NEXT:    retq
  store <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x0, float 0x0>, <4 x float>* undef, align 16
  %1 = load <4 x float>, <4 x float>* undef, align 16
  %2 = fmul <4 x float> %1, <float 0x0, float 0x0, float 0x0, float 0x0>
  %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %4 = fadd <4 x float> %2, %3
  %5 = fadd <4 x float> zeroinitializer, %4
  %6 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  %7 = fadd <4 x float> %6, %5
  %8 = extractelement <4 x float> %7, i32 0
  store float %8, float* undef, align 8
  ret void
}
3097
; Regression test named after an llvm.org bug report (PR45604). The IR widens
; a loaded <8 x i16> to <16 x i16> (the first shufflevector zero-pads the
; upper half from zeroinitializer), then interleaves it with the constant
; vector <11 x 8, 0 x 8> so that every group of four output elements is
; <src[i], 0, 11, 0>, and stores the resulting <32 x i16>. The autogenerated
; assertions pin the expected interleave codegen per subtarget — scalar
; pextrw/pinsrw for SSE2/SSSE3/SSE4.1, pmovzxwq+pblendw for AVX1, and
; vpermq+vpshufb+vpblendd for AVX2.
define void @PR45604(<32 x i16>* %dst, <8 x i16>* %src) {
; SSE2-LABEL: PR45604:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rsi), %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movl $11, %eax
; SSE2-NEXT:    pinsrw $2, %eax, %xmm0
; SSE2-NEXT:    pextrw $1, %xmm1, %ecx
; SSE2-NEXT:    pinsrw $4, %ecx, %xmm0
; SSE2-NEXT:    pinsrw $6, %eax, %xmm0
; SSE2-NEXT:    pextrw $2, %xmm1, %ecx
; SSE2-NEXT:    movd %ecx, %xmm2
; SSE2-NEXT:    pinsrw $2, %eax, %xmm2
; SSE2-NEXT:    pextrw $3, %xmm1, %ecx
; SSE2-NEXT:    pinsrw $4, %ecx, %xmm2
; SSE2-NEXT:    pinsrw $6, %eax, %xmm2
; SSE2-NEXT:    pextrw $4, %xmm1, %ecx
; SSE2-NEXT:    movd %ecx, %xmm3
; SSE2-NEXT:    pinsrw $2, %eax, %xmm3
; SSE2-NEXT:    pextrw $5, %xmm1, %ecx
; SSE2-NEXT:    pinsrw $4, %ecx, %xmm3
; SSE2-NEXT:    pinsrw $6, %eax, %xmm3
; SSE2-NEXT:    pextrw $6, %xmm1, %ecx
; SSE2-NEXT:    movd %ecx, %xmm4
; SSE2-NEXT:    pinsrw $2, %eax, %xmm4
; SSE2-NEXT:    pextrw $7, %xmm1, %ecx
; SSE2-NEXT:    pinsrw $4, %ecx, %xmm4
; SSE2-NEXT:    pinsrw $6, %eax, %xmm4
; SSE2-NEXT:    movdqa %xmm4, 48(%rdi)
; SSE2-NEXT:    movdqa %xmm3, 32(%rdi)
; SSE2-NEXT:    movdqa %xmm2, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR45604:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa (%rsi), %xmm1
; SSSE3-NEXT:    movd %xmm1, %eax
; SSSE3-NEXT:    movzwl %ax, %eax
; SSSE3-NEXT:    movd %eax, %xmm0
; SSSE3-NEXT:    movl $11, %eax
; SSSE3-NEXT:    pinsrw $2, %eax, %xmm0
; SSSE3-NEXT:    pextrw $1, %xmm1, %ecx
; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm0
; SSSE3-NEXT:    pinsrw $6, %eax, %xmm0
; SSSE3-NEXT:    pextrw $2, %xmm1, %ecx
; SSSE3-NEXT:    movd %ecx, %xmm2
; SSSE3-NEXT:    pinsrw $2, %eax, %xmm2
; SSSE3-NEXT:    pextrw $3, %xmm1, %ecx
; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm2
; SSSE3-NEXT:    pinsrw $6, %eax, %xmm2
; SSSE3-NEXT:    pextrw $4, %xmm1, %ecx
; SSSE3-NEXT:    movd %ecx, %xmm3
; SSSE3-NEXT:    pinsrw $2, %eax, %xmm3
; SSSE3-NEXT:    pextrw $5, %xmm1, %ecx
; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm3
; SSSE3-NEXT:    pinsrw $6, %eax, %xmm3
; SSSE3-NEXT:    pextrw $6, %xmm1, %ecx
; SSSE3-NEXT:    movd %ecx, %xmm4
; SSSE3-NEXT:    pinsrw $2, %eax, %xmm4
; SSSE3-NEXT:    pextrw $7, %xmm1, %ecx
; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm4
; SSSE3-NEXT:    pinsrw $6, %eax, %xmm4
; SSSE3-NEXT:    movdqa %xmm4, 48(%rdi)
; SSSE3-NEXT:    movdqa %xmm3, 32(%rdi)
; SSSE3-NEXT:    movdqa %xmm2, 16(%rdi)
; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR45604:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa (%rsi), %xmm1
; SSE41-NEXT:    pextrw $2, %xmm1, %eax
; SSE41-NEXT:    movd %eax, %xmm0
; SSE41-NEXT:    movl $11, %eax
; SSE41-NEXT:    pinsrw $2, %eax, %xmm0
; SSE41-NEXT:    pextrw $3, %xmm1, %ecx
; SSE41-NEXT:    pinsrw $4, %ecx, %xmm0
; SSE41-NEXT:    pinsrw $6, %eax, %xmm0
; SSE41-NEXT:    pextrw $4, %xmm1, %ecx
; SSE41-NEXT:    movd %ecx, %xmm2
; SSE41-NEXT:    pinsrw $2, %eax, %xmm2
; SSE41-NEXT:    pextrw $5, %xmm1, %ecx
; SSE41-NEXT:    pinsrw $4, %ecx, %xmm2
; SSE41-NEXT:    pinsrw $6, %eax, %xmm2
; SSE41-NEXT:    pextrw $6, %xmm1, %ecx
; SSE41-NEXT:    movd %ecx, %xmm3
; SSE41-NEXT:    pinsrw $2, %eax, %xmm3
; SSE41-NEXT:    pextrw $7, %xmm1, %ecx
; SSE41-NEXT:    pinsrw $4, %ecx, %xmm3
; SSE41-NEXT:    pinsrw $6, %eax, %xmm3
; SSE41-NEXT:    pxor %xmm4, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3,4,5,6,7]
; SSE41-NEXT:    pinsrw $2, %eax, %xmm4
; SSE41-NEXT:    pextrw $1, %xmm1, %ecx
; SSE41-NEXT:    pinsrw $4, %ecx, %xmm4
; SSE41-NEXT:    pinsrw $6, %eax, %xmm4
; SSE41-NEXT:    movdqa %xmm4, (%rdi)
; SSE41-NEXT:    movdqa %xmm3, 48(%rdi)
; SSE41-NEXT:    movdqa %xmm2, 32(%rdi)
; SSE41-NEXT:    movdqa %xmm0, 16(%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: PR45604:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rsi), %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [11,11,11,0,11,11,11,0]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vmovups %ymm1, 32(%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR45604:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rsi), %xmm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,2,0,2]
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u>
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0>
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
; AVX2-NEXT:    vmovdqu %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovdqu %ymm1, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %v1 = load <8 x i16>, <8 x i16>* %src, align 16
  ; Identity mask over the first operand: elements 0-7 come from %v1 and
  ; elements 8-15 from zeroinitializer, i.e. zero-extend to 16 elements.
  %v2 = shufflevector <8 x i16> %v1, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; Mask pattern <i, i+8, i+16, i+24> picks v2[i], v2[i+8] (zero), 11, 0 for
  ; each i in 0..7, producing eight <src[i], 0, 11, 0> groups.
  %v3 = shufflevector <16 x i16> %v2, <16 x i16> <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
  store <32 x i16> %v3, <32 x i16>* %dst, align 16
  ret void
}
3246