; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE3
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX512VL

; Single-input <4 x i32> shuffle; result lanes are %a[0], %a[0], %a[0], %a[1].
; The second operand %b is never selected by the mask.
define <4 x i32> @shuffle_v4i32_0001(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_0001:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_0001:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
  ret <4 x i32> %shuffle
}
; Single-input <4 x i32> shuffle; result lanes are %a[0], %a[0], %a[2], %a[0].
; The second operand %b is never selected by the mask.
define <4 x i32> @shuffle_v4i32_0020(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_0020:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_0020:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
  ret <4 x i32> %shuffle
}
; Single-input <4 x i32> shuffle; result lanes are %a[0], %a[1], %a[1], %a[2].
; The second operand %b is never selected by the mask.
define <4 x i32> @shuffle_v4i32_0112(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_0112:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_0112:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 1, i32 2>
  ret <4 x i32> %shuffle
}
; Single-input <4 x i32> shuffle; result lanes are %a[0], %a[3], %a[0], %a[0].
; The second operand %b is never selected by the mask.
define <4 x i32> @shuffle_v4i32_0300(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_0300:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_0300:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
  ret <4 x i32> %shuffle
}
; Single-input <4 x i32> shuffle; result lanes are %a[1], %a[0], %a[0], %a[0].
; The second operand %b is never selected by the mask.
define <4 x i32> @shuffle_v4i32_1000(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_1000:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_1000:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  ret <4 x i32> %shuffle
}
; Single-input <4 x i32> shuffle; result lanes are %a[2], %a[2], %a[0], %a[0].
; The second operand %b is never selected by the mask.
define <4 x i32> @shuffle_v4i32_2200(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_2200:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_2200:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
  ret <4 x i32> %shuffle
}
; Single-input <4 x i32> shuffle; result lanes are %a[3], %a[3], %a[3], %a[0].
; The second operand %b is never selected by the mask.
define <4 x i32> @shuffle_v4i32_3330(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_3330:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_3330:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
  ret <4 x i32> %shuffle
}
; Single-input <4 x i32> shuffle that reverses the lanes of %a
; (%a[3], %a[2], %a[1], %a[0]). %b is never selected by the mask.
define <4 x i32> @shuffle_v4i32_3210(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_3210:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_3210:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i32> %shuffle
}

; Single-input <4 x i32> shuffle; result lanes are %a[2], %a[1], %a[2], %a[1].
; The second operand %b is never selected by the mask.
define <4 x i32> @shuffle_v4i32_2121(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_2121:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_2121:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,1]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 1, i32 2, i32 1>
  ret <4 x i32> %shuffle
}

; Single-input <4 x float> shuffle; result lanes are %a[0], %a[0], %a[0], %a[1].
; The second operand %b is never selected by the mask.
define <4 x float> @shuffle_v4f32_0001(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_0001:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_0001:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
  ret <4 x float> %shuffle
}
; Single-input <4 x float> shuffle; result lanes are %a[0], %a[0], %a[2], %a[0].
; The second operand %b is never selected by the mask.
define <4 x float> @shuffle_v4f32_0020(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_0020:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,2,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_0020:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
  ret <4 x float> %shuffle
}
; Single-input <4 x float> shuffle; result lanes are %a[0], %a[3], %a[0], %a[0].
; The second operand %b is never selected by the mask.
define <4 x float> @shuffle_v4f32_0300(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_0300:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3,0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_0300:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
  ret <4 x float> %shuffle
}
; Single-input <4 x float> shuffle; result lanes are %a[1], %a[0], %a[0], %a[0].
; The second operand %b is never selected by the mask.
define <4 x float> @shuffle_v4f32_1000(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_1000:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0,0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_1000:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  ret <4 x float> %shuffle
}
; Single-input <4 x float> shuffle; result lanes are %a[2], %a[2], %a[0], %a[0].
; The second operand %b is never selected by the mask.
define <4 x float> @shuffle_v4f32_2200(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_2200:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,2,0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_2200:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
  ret <4 x float> %shuffle
}
; Single-input <4 x float> shuffle; result lanes are %a[3], %a[3], %a[3], %a[0].
; The second operand %b is never selected by the mask.
define <4 x float> @shuffle_v4f32_3330(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_3330:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_3330:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
  ret <4 x float> %shuffle
}
; Single-input <4 x float> shuffle that reverses the lanes of %a
; (%a[3], %a[2], %a[1], %a[0]). %b is never selected by the mask.
define <4 x float> @shuffle_v4f32_3210(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_3210:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_3210:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %shuffle
}
; Single-input <4 x float> shuffle duplicating the two low lanes of %a
; (%a[0], %a[0], %a[1], %a[1]). %b is never selected by the mask.
define <4 x float> @shuffle_v4f32_0011(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_0011:
; SSE:       # %bb.0:
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_0011:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
  ret <4 x float> %shuffle
}
; Single-input <4 x float> shuffle duplicating the two high lanes of %a
; (%a[2], %a[2], %a[3], %a[3]). %b is never selected by the mask.
define <4 x float> @shuffle_v4f32_2233(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_2233:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_2233:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
  ret <4 x float> %shuffle
}
; Single-input <4 x float> shuffle duplicating the even lanes of %a
; (%a[0], %a[0], %a[2], %a[2]). %b is never selected by the mask.
; The expected output differs between the baseline run and the runs with
; sse3 or later enabled, hence the per-feature check blocks.
define <4 x float> @shuffle_v4f32_0022(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: shuffle_v4f32_0022:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSE2-NEXT:    retq
;
; SSE3-LABEL: shuffle_v4f32_0022:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v4f32_0022:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v4f32_0022:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_0022:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x float> %shuffle
}
; Single-input <4 x float> shuffle duplicating the odd lanes of %a
; (%a[1], %a[1], %a[3], %a[3]). %b is never selected by the mask.
; The expected output differs between the baseline run and the runs with
; sse3 or later enabled, hence the per-feature check blocks.
define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: shuffle_v4f32_1133:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT:    retq
;
; SSE3-LABEL: shuffle_v4f32_1133:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v4f32_1133:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v4f32_1133:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_1133:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x float> %shuffle
}

; Two-input <4 x float> shuffle; mask indices 4-7 select from %b, so the
; result is %a[0], %a[1], %b[0], %b[1] (low halves of both operands).
define <4 x float> @shuffle_v4f32_0145(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_0145:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_0145:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x float> %shuffle
}

; Two-input <4 x float> shuffle; mask indices 4-7 select from %b, so the
; result is %b[2], %b[3], %a[2], %a[3] (high halves, %b first).
define <4 x float> @shuffle_v4f32_6723(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_6723:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_6723:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  ret <4 x float> %shuffle
}

; Two-input <4 x i32> shuffle; mask indices 4-7 select from %b, so the
; result is %a[0], %a[1], %a[2], %b[0] (low lane of %b placed in lane 3).
define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: shuffle_v4i32_0124:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE2-NEXT:    retq
;
; SSE3-LABEL: shuffle_v4i32_0124:
; SSE3:       # %bb.0:
; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v4i32_0124:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v4i32_0124:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: shuffle_v4i32_0124:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i32_0124:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i32_0124:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,4]
; AVX512VL-NEXT:    vpermt2d %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %shuffle
}
; Two-input <4 x i32> shuffle; mask indices 4-7 select from %b, so the
; result is %a[0], %a[1], %b[0], %a[2] (low lane of %b placed in lane 2).
define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: shuffle_v4i32_0142:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT:    retq
;
; SSE3-LABEL: shuffle_v4i32_0142:
; SSE3:       # %bb.0:
; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v4i32_0142:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v4i32_0142:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: shuffle_v4i32_0142:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2]
; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i32_0142:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2]
; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i32_0142:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,2]
; AVX512VL-NEXT:    vpermt2d %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
  ret <4 x i32> %shuffle
}
; Two-input <4 x i32> shuffle; mask indices 4-7 select from %b, so the
; result is %a[0], %b[0], %a[1], %a[2] (low lane of %b placed in lane 1).
define <4 x i32> @shuffle_v4i32_0412(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: shuffle_v4i32_0412:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: shuffle_v4i32_0412:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
; SSE3-NEXT:    movaps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v4i32_0412:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v4i32_0412:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: shuffle_v4i32_0412:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i32_0412:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i32_0412:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,4,1,2]
; AVX512VL-NEXT:    vpermt2d %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
  ret <4 x i32> %shuffle
}
; Two-input <4 x i32> shuffle; mask indices 4-7 select from %b, so the
; result is %b[0], %a[0], %a[1], %a[2] (low lane of %b placed in lane 0).
define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: shuffle_v4i32_4012:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: shuffle_v4i32_4012:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
; SSE3-NEXT:    movaps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v4i32_4012:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v4i32_4012:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,2]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1OR2-LABEL: shuffle_v4i32_4012:
; AVX1OR2:       # %bb.0:
; AVX1OR2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,2]
; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX1OR2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i32_4012:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,0,1,2]
; AVX512VL-NEXT:    vpermt2d %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
  ret <4 x i32> %shuffle
}
; Two-input <4 x i32> shuffle; mask indices 4-7 select from %b, so the
; result is %a[0], %a[1], %b[0], %b[1] (low halves of both operands).
define <4 x i32> @shuffle_v4i32_0145(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_0145:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_0145:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i32> %shuffle
}
; Two-input <4 x i32> shuffle; mask indices 4-7 select from %b, so the
; result is %a[0], %b[0], %b[1], %a[1].
define <4 x i32> @shuffle_v4i32_0451(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: shuffle_v4i32_0451:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
; SSE2-NEXT:    retq
;
; SSE3-LABEL: shuffle_v4i32_0451:
; SSE3:       # %bb.0:
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v4i32_0451:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v4i32_0451:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: shuffle_v4i32_0451:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i32_0451:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i32_0451:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,4,5,1]
; AVX512VL-NEXT:    vpermt2d %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
  ret <4 x i32> %shuffle
}
; Two-input <4 x i32> shuffle; mask indices 4-7 select from %b, so the
; result is %b[0], %b[1], %a[0], %a[1] (low halves, %b first).
define <4 x i32> @shuffle_v4i32_4501(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_4501:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_4501:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x i32> %shuffle
}
; Two-input <4 x i32> shuffle; mask indices 4-7 select from %b, so the
; result is %b[0], %a[0], %a[1], %b[1].
define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: shuffle_v4i32_4015:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
; SSE2-NEXT:    retq
;
; SSE3-LABEL: shuffle_v4i32_4015:
; SSE3:       # %bb.0:
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v4i32_4015:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v4i32_4015:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: shuffle_v4i32_4015:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i32_4015:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i32_4015:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,0,1,5]
; AVX512VL-NEXT:    vpermt2d %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
  ret <4 x i32> %shuffle
}

; Shuffle of a zeroinitializer vector (first operand) with %a (second operand).
; Mask indices 0-3 pick zero lanes and 4-7 pick lanes of %a, so the result is
; %a[0], 0, 0, 0.
define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
; SSE2-LABEL: shuffle_v4f32_4zzz:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: shuffle_v4f32_4zzz:
; SSE3:       # %bb.0:
; SSE3-NEXT:    xorps %xmm1, %xmm1
; SSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE3-NEXT:    movaps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v4f32_4zzz:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    xorps %xmm1, %xmm1
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v4f32_4zzz:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorps %xmm1, %xmm1
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_4zzz:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %shuffle
}

; Shuffle of a zeroinitializer vector (first operand) with %a (second operand).
; Mask indices 0-3 pick zero lanes and 4-7 pick lanes of %a, so the result is
; 0, %a[0], 0, 0.
define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) {
; SSE2-LABEL: shuffle_v4f32_z4zz:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSE3-LABEL: shuffle_v4f32_z4zz:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE3-NEXT:    xorps %xmm1, %xmm1
; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v4f32_z4zz:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; SSSE3-NEXT:    xorps %xmm1, %xmm1
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v4f32_z4zz:
; SSE41:       # %bb.0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_z4zz:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
  ret <4 x float> %shuffle
}

; Shuffle of a zeroinitializer vector (first operand) with %a (second operand).
; Mask indices 0-3 pick zero lanes and 4-7 pick lanes of %a, so the result is
; 0, 0, %a[0], 0.
define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
; SSE2-LABEL: shuffle_v4f32_zz4z:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq {{.*#+}} xmm1 = xmm0[0],zero
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT:    retq
;
; SSE3-LABEL: shuffle_v4f32_zz4z:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movq {{.*#+}} xmm1 = xmm0[0],zero
; SSE3-NEXT:    pxor %xmm0, %xmm0
; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v4f32_zz4z:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq {{.*#+}} xmm1 = xmm0[0],zero
; SSSE3-NEXT:    pxor %xmm0, %xmm0
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v4f32_zz4z:
; SSE41:       # %bb.0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_zz4z:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
  ret <4 x float> %shuffle
}

; Shuffle of a zeroinitializer vector (first operand) with %a (second operand).
; Lane 0 is zero, lanes 1 and 2 are undef in the mask, and lane 3 is %a[0].
define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) {
; SSE2-LABEL: shuffle_v4f32_zuu4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: shuffle_v4f32_zuu4:
; SSE3:       # %bb.0:
; SSE3-NEXT:    xorps %xmm1, %xmm1
; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
; SSE3-NEXT:    movaps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v4f32_zuu4:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    xorps %xmm1, %xmm1
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v4f32_zuu4:
; SSE41:       # %bb.0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_zuu4:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
  ret <4 x float> %shuffle
}

; Shuffle of a zeroinitializer vector (first operand) with %a (second operand).
; Mask indices 0-3 pick zero lanes and 4-7 pick lanes of %a, so the result is
; 0, 0, 0, %a[3] (the high lane of %a stays in place).
define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
; SSE2-LABEL: shuffle_v4f32_zzz7:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: shuffle_v4f32_zzz7:
; SSE3:       # %bb.0:
; SSE3-NEXT:    xorps %xmm1, %xmm1
; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE3-NEXT:    movaps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v4f32_zzz7:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    xorps %xmm1, %xmm1
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v4f32_zzz7:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorps %xmm1, %xmm1
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_zzz7:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x float> %shuffle
}

; Shuffle of a zeroinitializer vector (first operand) with %a (second operand).
; Mask indices 0-3 pick zero lanes and 4-7 pick lanes of %a, so the result is
; 0, %a[2], 0, 0.
define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
; SSE2-LABEL: shuffle_v4f32_z6zz:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSE3-LABEL: shuffle_v4f32_z6zz:
; SSE3:       # %bb.0:
; SSE3-NEXT:    xorps %xmm1, %xmm1
; SSE3-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v4f32_z6zz:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    xorps %xmm1, %xmm1
; SSSE3-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v4f32_z6zz:
; SSE41:       # %bb.0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_z6zz:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
  ret <4 x float> %shuffle
}

; Mask <0,4,2,3> with %a first and zeroinitializer second, so only lane 1 of
; %a is replaced by zero.
; The SSE2, SSE3 and SSSE3 checks expect the result to be assembled in xmm1
; (movlhps + shufps) and copied back to xmm0; the SSE4.1 and AVX checks expect
; a single blendps/vblendps against a zeroed register.
859define <4 x float> @shuffle_v4f32_0z23(<4 x float> %a) {
860; SSE2-LABEL: shuffle_v4f32_0z23:
861; SSE2:       # %bb.0:
862; SSE2-NEXT:    xorps %xmm1, %xmm1
863; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
864; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
865; SSE2-NEXT:    movaps %xmm1, %xmm0
866; SSE2-NEXT:    retq
867;
868; SSE3-LABEL: shuffle_v4f32_0z23:
869; SSE3:       # %bb.0:
870; SSE3-NEXT:    xorps %xmm1, %xmm1
871; SSE3-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
872; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
873; SSE3-NEXT:    movaps %xmm1, %xmm0
874; SSE3-NEXT:    retq
875;
876; SSSE3-LABEL: shuffle_v4f32_0z23:
877; SSSE3:       # %bb.0:
878; SSSE3-NEXT:    xorps %xmm1, %xmm1
879; SSSE3-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
880; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
881; SSSE3-NEXT:    movaps %xmm1, %xmm0
882; SSSE3-NEXT:    retq
883;
884; SSE41-LABEL: shuffle_v4f32_0z23:
885; SSE41:       # %bb.0:
886; SSE41-NEXT:    xorps %xmm1, %xmm1
887; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
888; SSE41-NEXT:    retq
889;
890; AVX-LABEL: shuffle_v4f32_0z23:
891; AVX:       # %bb.0:
892; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
893; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
894; AVX-NEXT:    retq
895  %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
896  ret <4 x float> %shuffle
897}
898
; Mask <0,1,4,3> with %a first and zeroinitializer second, so only lane 2 of
; %a is replaced by zero.
; The SSE2, SSE3 and SSSE3 checks expect two shufps through a zeroed xmm1; the
; SSE4.1 and AVX checks expect a single blendps/vblendps against a zeroed
; register.
899define <4 x float> @shuffle_v4f32_01z3(<4 x float> %a) {
900; SSE2-LABEL: shuffle_v4f32_01z3:
901; SSE2:       # %bb.0:
902; SSE2-NEXT:    xorps %xmm1, %xmm1
903; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
904; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
905; SSE2-NEXT:    retq
906;
907; SSE3-LABEL: shuffle_v4f32_01z3:
908; SSE3:       # %bb.0:
909; SSE3-NEXT:    xorps %xmm1, %xmm1
910; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
911; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
912; SSE3-NEXT:    retq
913;
914; SSSE3-LABEL: shuffle_v4f32_01z3:
915; SSSE3:       # %bb.0:
916; SSSE3-NEXT:    xorps %xmm1, %xmm1
917; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
918; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
919; SSSE3-NEXT:    retq
920;
921; SSE41-LABEL: shuffle_v4f32_01z3:
922; SSE41:       # %bb.0:
923; SSE41-NEXT:    xorps %xmm1, %xmm1
924; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
925; SSE41-NEXT:    retq
926;
927; AVX-LABEL: shuffle_v4f32_01z3:
928; AVX:       # %bb.0:
929; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
930; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
931; AVX-NEXT:    retq
932  %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
933  ret <4 x float> %shuffle
934}
935
; Mask <0,1,2,7> with %a first and zeroinitializer second, so only lane 3 of
; %a is replaced by zero.
; The SSE2, SSE3 and SSSE3 checks expect two shufps through a zeroed xmm1; the
; SSE4.1 and AVX checks expect a single blendps/vblendps against a zeroed
; register.
936define <4 x float> @shuffle_v4f32_012z(<4 x float> %a) {
937; SSE2-LABEL: shuffle_v4f32_012z:
938; SSE2:       # %bb.0:
939; SSE2-NEXT:    xorps %xmm1, %xmm1
940; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
941; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
942; SSE2-NEXT:    retq
943;
944; SSE3-LABEL: shuffle_v4f32_012z:
945; SSE3:       # %bb.0:
946; SSE3-NEXT:    xorps %xmm1, %xmm1
947; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
948; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
949; SSE3-NEXT:    retq
950;
951; SSSE3-LABEL: shuffle_v4f32_012z:
952; SSSE3:       # %bb.0:
953; SSSE3-NEXT:    xorps %xmm1, %xmm1
954; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
955; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
956; SSSE3-NEXT:    retq
957;
958; SSE41-LABEL: shuffle_v4f32_012z:
959; SSE41:       # %bb.0:
960; SSE41-NEXT:    xorps %xmm1, %xmm1
961; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
962; SSE41-NEXT:    retq
963;
964; AVX-LABEL: shuffle_v4f32_012z:
965; AVX:       # %bb.0:
966; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
967; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
968; AVX-NEXT:    retq
969  %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
970  ret <4 x float> %shuffle
971}
972
; Mask <0,4,4,3> with %a first and zeroinitializer second, so lanes 1 and 2 of
; %a are replaced by zero while lanes 0 and 3 are kept.
; The SSE2, SSE3 and SSSE3 checks expect a two-input shufps followed by a
; single-input shufps permute; the SSE4.1 and AVX checks expect a single
; blendps/vblendps against a zeroed register.
973define <4 x float> @shuffle_v4f32_0zz3(<4 x float> %a) {
974; SSE2-LABEL: shuffle_v4f32_0zz3:
975; SSE2:       # %bb.0:
976; SSE2-NEXT:    xorps %xmm1, %xmm1
977; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
978; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
979; SSE2-NEXT:    retq
980;
981; SSE3-LABEL: shuffle_v4f32_0zz3:
982; SSE3:       # %bb.0:
983; SSE3-NEXT:    xorps %xmm1, %xmm1
984; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
985; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
986; SSE3-NEXT:    retq
987;
988; SSSE3-LABEL: shuffle_v4f32_0zz3:
989; SSSE3:       # %bb.0:
990; SSSE3-NEXT:    xorps %xmm1, %xmm1
991; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
992; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
993; SSSE3-NEXT:    retq
994;
995; SSE41-LABEL: shuffle_v4f32_0zz3:
996; SSE41:       # %bb.0:
997; SSE41-NEXT:    xorps %xmm1, %xmm1
998; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
999; SSE41-NEXT:    retq
1000;
1001; AVX-LABEL: shuffle_v4f32_0zz3:
1002; AVX:       # %bb.0:
1003; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1004; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1005; AVX-NEXT:    retq
1006  %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 3>
1007  ret <4 x float> %shuffle
1008}
1009
; Mask <0,4,2,4> where the second operand is <0.0, undef, undef, undef>: only
; element 0 of that constant (the zero) is referenced, so lanes 1 and 3 of %v
; are replaced by zero even though the constant is not a full zeroinitializer.
; The SSE2, SSE3 and SSSE3 checks expect a two-input shufps followed by a
; single-input shufps permute; the SSE4.1 and AVX checks expect a single
; blendps/vblendps against a zeroed register.
1010define <4 x float> @shuffle_v4f32_0z2z(<4 x float> %v) {
1011; SSE2-LABEL: shuffle_v4f32_0z2z:
1012; SSE2:       # %bb.0:
1013; SSE2-NEXT:    xorps %xmm1, %xmm1
1014; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
1015; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
1016; SSE2-NEXT:    retq
1017;
1018; SSE3-LABEL: shuffle_v4f32_0z2z:
1019; SSE3:       # %bb.0:
1020; SSE3-NEXT:    xorps %xmm1, %xmm1
1021; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
1022; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
1023; SSE3-NEXT:    retq
1024;
1025; SSSE3-LABEL: shuffle_v4f32_0z2z:
1026; SSSE3:       # %bb.0:
1027; SSSE3-NEXT:    xorps %xmm1, %xmm1
1028; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
1029; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
1030; SSSE3-NEXT:    retq
1031;
1032; SSE41-LABEL: shuffle_v4f32_0z2z:
1033; SSE41:       # %bb.0:
1034; SSE41-NEXT:    xorps %xmm1, %xmm1
1035; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
1036; SSE41-NEXT:    retq
1037;
1038; AVX-LABEL: shuffle_v4f32_0z2z:
1039; AVX:       # %bb.0:
1040; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1041; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
1042; AVX-NEXT:    retq
1043  %shuffle = shufflevector <4 x float> %v, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 4, i32 2, i32 4>
1044  ret <4 x float> %shuffle
1045}
1046
; Mask <undef,0,5,1> over %a and %b: lane 0 is undef, the remaining lanes are
; %a[0], %b[1], %a[1].
; All runs expect a single low-half interleave of xmm1 with xmm0
; (unpcklps/vunpcklps), which fills the undef lane with xmm1[0]. The SSE form
; writes xmm1 in place and therefore needs a trailing movaps into xmm0.
1047define <4 x float> @shuffle_v4f32_u051(<4 x float> %a, <4 x float> %b) {
1048; SSE-LABEL: shuffle_v4f32_u051:
1049; SSE:       # %bb.0:
1050; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1051; SSE-NEXT:    movaps %xmm1, %xmm0
1052; SSE-NEXT:    retq
1053;
1054; AVX-LABEL: shuffle_v4f32_u051:
1055; AVX:       # %bb.0:
1056; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1057; AVX-NEXT:    retq
1058  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 undef, i32 0, i32 5, i32 1>
1059  ret <4 x float> %shuffle
1060}
1061
; Two chained shuffles. The first builds [undef, 0, 0, %b[0]] from %b and
; zeroinitializer; the second keeps lane 0 of %a and takes lanes 1-3 from that
; intermediate, giving [%a[0], 0, 0, %b[0]].
; The SSE4.1 and AVX checks expect the pair to fold into one
; insertps/vinsertps with zeroing; the SSE2, SSE3 and SSSE3 checks expect a
; movq/pxor/shufps/movss sequence followed by a register copy.
1062define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %a, <4 x float> %b) {
1063; SSE2-LABEL: shuffle_v4f32_0zz4:
1064; SSE2:       # %bb.0:
1065; SSE2-NEXT:    movq {{.*#+}} xmm2 = xmm1[0],zero
1066; SSE2-NEXT:    pxor %xmm1, %xmm1
1067; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
1068; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1069; SSE2-NEXT:    movaps %xmm1, %xmm0
1070; SSE2-NEXT:    retq
1071;
1072; SSE3-LABEL: shuffle_v4f32_0zz4:
1073; SSE3:       # %bb.0:
1074; SSE3-NEXT:    movq {{.*#+}} xmm2 = xmm1[0],zero
1075; SSE3-NEXT:    pxor %xmm1, %xmm1
1076; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
1077; SSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1078; SSE3-NEXT:    movaps %xmm1, %xmm0
1079; SSE3-NEXT:    retq
1080;
1081; SSSE3-LABEL: shuffle_v4f32_0zz4:
1082; SSSE3:       # %bb.0:
1083; SSSE3-NEXT:    movq {{.*#+}} xmm2 = xmm1[0],zero
1084; SSSE3-NEXT:    pxor %xmm1, %xmm1
1085; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
1086; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1087; SSSE3-NEXT:    movaps %xmm1, %xmm0
1088; SSSE3-NEXT:    retq
1089;
1090; SSE41-LABEL: shuffle_v4f32_0zz4:
1091; SSE41:       # %bb.0:
1092; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
1093; SSE41-NEXT:    retq
1094;
1095; AVX-LABEL: shuffle_v4f32_0zz4:
1096; AVX:       # %bb.0:
1097; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
1098; AVX-NEXT:    retq
1099  %shuffle = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <4 x i32> <i32 undef, i32 5, i32 6, i32 0>
1100  %shuffle1 = shufflevector <4 x float> %a, <4 x float> %shuffle, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1101  ret <4 x float> %shuffle1
1102}
1103
; Two chained shuffles. The first builds [%a[0], undef, undef, %b[2]]; the
; second, with zeroinitializer as its first operand and mask <4,1,2,7>, keeps
; lanes 0 and 3 of that intermediate and zeroes lanes 1 and 2, giving
; [%a[0], 0, 0, %b[2]].
; The SSE4.1 and AVX checks expect the pair to fold into one
; insertps/vinsertps with zeroing; the SSE2, SSE3 and SSSE3 checks expect three
; shufps plus a register copy.
1104define <4 x float> @shuffle_v4f32_0zz6(<4 x float> %a, <4 x float> %b) {
1105; SSE2-LABEL: shuffle_v4f32_0zz6:
1106; SSE2:       # %bb.0:
1107; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
1108; SSE2-NEXT:    xorps %xmm1, %xmm1
1109; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3]
1110; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
1111; SSE2-NEXT:    movaps %xmm1, %xmm0
1112; SSE2-NEXT:    retq
1113;
1114; SSE3-LABEL: shuffle_v4f32_0zz6:
1115; SSE3:       # %bb.0:
1116; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
1117; SSE3-NEXT:    xorps %xmm1, %xmm1
1118; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3]
1119; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
1120; SSE3-NEXT:    movaps %xmm1, %xmm0
1121; SSE3-NEXT:    retq
1122;
1123; SSSE3-LABEL: shuffle_v4f32_0zz6:
1124; SSSE3:       # %bb.0:
1125; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
1126; SSSE3-NEXT:    xorps %xmm1, %xmm1
1127; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3]
1128; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
1129; SSSE3-NEXT:    movaps %xmm1, %xmm0
1130; SSSE3-NEXT:    retq
1131;
1132; SSE41-LABEL: shuffle_v4f32_0zz6:
1133; SSE41:       # %bb.0:
1134; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
1135; SSE41-NEXT:    retq
1136;
1137; AVX-LABEL: shuffle_v4f32_0zz6:
1138; AVX:       # %bb.0:
1139; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
1140; AVX-NEXT:    retq
1141  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 6>
1142  %shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 2, i32 7>
1143  ret <4 x float> %shuffle1
1144}
1145
; Two chained shuffles. The first builds [%a[0], undef, %a[2], %b[0]]; the
; second, with zeroinitializer as its first operand and mask <4,1,6,7>, zeroes
; lane 1 and keeps the rest, giving [%a[0], 0, %a[2], %b[0]].
; The SSE4.1 and AVX checks expect the pair to fold into one
; insertps/vinsertps that inserts %b[0] and zeroes lane 1; the SSE2, SSE3 and
; SSSE3 checks expect shufps/movlhps/shufps through xmm2 plus a register copy.
1146define <4 x float> @shuffle_v4f32_0z24(<4 x float> %a, <4 x float> %b) {
1147; SSE2-LABEL: shuffle_v4f32_0z24:
1148; SSE2:       # %bb.0:
1149; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
1150; SSE2-NEXT:    xorps %xmm2, %xmm2
1151; SSE2-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1152; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
1153; SSE2-NEXT:    movaps %xmm2, %xmm0
1154; SSE2-NEXT:    retq
1155;
1156; SSE3-LABEL: shuffle_v4f32_0z24:
1157; SSE3:       # %bb.0:
1158; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
1159; SSE3-NEXT:    xorps %xmm2, %xmm2
1160; SSE3-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1161; SSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
1162; SSE3-NEXT:    movaps %xmm2, %xmm0
1163; SSE3-NEXT:    retq
1164;
1165; SSSE3-LABEL: shuffle_v4f32_0z24:
1166; SSSE3:       # %bb.0:
1167; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
1168; SSSE3-NEXT:    xorps %xmm2, %xmm2
1169; SSSE3-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1170; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
1171; SSSE3-NEXT:    movaps %xmm2, %xmm0
1172; SSSE3-NEXT:    retq
1173;
1174; SSE41-LABEL: shuffle_v4f32_0z24:
1175; SSE41:       # %bb.0:
1176; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
1177; SSE41-NEXT:    retq
1178;
1179; AVX-LABEL: shuffle_v4f32_0z24:
1180; AVX:       # %bb.0:
1181; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
1182; AVX-NEXT:    retq
1183  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 2, i32 4>
1184  %shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1185  ret <4 x float> %shuffle1
1186}
1187
; Integer variant: mask <4,1,2,3> with zeroinitializer as the first operand, so
; lane 0 is %a[0] and lanes 1-3 are zero.
; The SSE2, SSE3 and SSSE3 checks expect movss into a zeroed register plus a
; register copy; the SSE4.1 and AVX checks expect a single blendps/vblendps
; against a zeroed register.
1188define <4 x i32> @shuffle_v4i32_4zzz(<4 x i32> %a) {
1189; SSE2-LABEL: shuffle_v4i32_4zzz:
1190; SSE2:       # %bb.0:
1191; SSE2-NEXT:    xorps %xmm1, %xmm1
1192; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1193; SSE2-NEXT:    movaps %xmm1, %xmm0
1194; SSE2-NEXT:    retq
1195;
1196; SSE3-LABEL: shuffle_v4i32_4zzz:
1197; SSE3:       # %bb.0:
1198; SSE3-NEXT:    xorps %xmm1, %xmm1
1199; SSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1200; SSE3-NEXT:    movaps %xmm1, %xmm0
1201; SSE3-NEXT:    retq
1202;
1203; SSSE3-LABEL: shuffle_v4i32_4zzz:
1204; SSSE3:       # %bb.0:
1205; SSSE3-NEXT:    xorps %xmm1, %xmm1
1206; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1207; SSSE3-NEXT:    movaps %xmm1, %xmm0
1208; SSSE3-NEXT:    retq
1209;
1210; SSE41-LABEL: shuffle_v4i32_4zzz:
1211; SSE41:       # %bb.0:
1212; SSE41-NEXT:    xorps %xmm1, %xmm1
1213; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1214; SSE41-NEXT:    retq
1215;
1216; AVX-LABEL: shuffle_v4i32_4zzz:
1217; AVX:       # %bb.0:
1218; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1219; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1220; AVX-NEXT:    retq
1221  %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
1222  ret <4 x i32> %shuffle
1223}
1224
; Mask <2,4,3,0> with zeroinitializer as the first operand: indices 0-3 all
; select zero, so the result is zero except lane 1, which is %a[0].
; Runs without +fast-variable-shuffle are expected to first form
; [%a[0], 0, 0, 0] and then permute it with [1,0,1,1]. The AVX2-FAST and
; AVX512VL runs (both enable +fast-variable-shuffle) are expected to use a
; single zeroing vpshufb instead.
1225define <4 x i32> @shuffle_v4i32_z4zz(<4 x i32> %a) {
1226; SSE2-LABEL: shuffle_v4i32_z4zz:
1227; SSE2:       # %bb.0:
1228; SSE2-NEXT:    xorps %xmm1, %xmm1
1229; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1230; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
1231; SSE2-NEXT:    retq
1232;
1233; SSE3-LABEL: shuffle_v4i32_z4zz:
1234; SSE3:       # %bb.0:
1235; SSE3-NEXT:    xorps %xmm1, %xmm1
1236; SSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1237; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
1238; SSE3-NEXT:    retq
1239;
1240; SSSE3-LABEL: shuffle_v4i32_z4zz:
1241; SSSE3:       # %bb.0:
1242; SSSE3-NEXT:    xorps %xmm1, %xmm1
1243; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1244; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
1245; SSSE3-NEXT:    retq
1246;
1247; SSE41-LABEL: shuffle_v4i32_z4zz:
1248; SSE41:       # %bb.0:
1249; SSE41-NEXT:    pxor %xmm1, %xmm1
1250; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1251; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
1252; SSE41-NEXT:    retq
1253;
1254; AVX1-LABEL: shuffle_v4i32_z4zz:
1255; AVX1:       # %bb.0:
1256; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1257; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1258; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1]
1259; AVX1-NEXT:    retq
1260;
1261; AVX2-SLOW-LABEL: shuffle_v4i32_z4zz:
1262; AVX2-SLOW:       # %bb.0:
1263; AVX2-SLOW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1264; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1265; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1]
1266; AVX2-SLOW-NEXT:    retq
1267;
1268; AVX2-FAST-LABEL: shuffle_v4i32_z4zz:
1269; AVX2-FAST:       # %bb.0:
1270; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
1271; AVX2-FAST-NEXT:    retq
1272;
1273; AVX512VL-LABEL: shuffle_v4i32_z4zz:
1274; AVX512VL:       # %bb.0:
1275; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
1276; AVX512VL-NEXT:    retq
1277  %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
1278  ret <4 x i32> %shuffle
1279}
1280
; Mask <0,0,4,0> with zeroinitializer as the first operand: the result is zero
; except lane 2, which is %a[0].
; Runs without +fast-variable-shuffle are expected to first form
; [%a[0], 0, 0, 0] and then permute it with [1,1,0,1]. The AVX2-FAST and
; AVX512VL runs (both enable +fast-variable-shuffle) are expected to use a
; single zeroing vpshufb instead.
1281define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) {
1282; SSE2-LABEL: shuffle_v4i32_zz4z:
1283; SSE2:       # %bb.0:
1284; SSE2-NEXT:    xorps %xmm1, %xmm1
1285; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1286; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
1287; SSE2-NEXT:    retq
1288;
1289; SSE3-LABEL: shuffle_v4i32_zz4z:
1290; SSE3:       # %bb.0:
1291; SSE3-NEXT:    xorps %xmm1, %xmm1
1292; SSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1293; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
1294; SSE3-NEXT:    retq
1295;
1296; SSSE3-LABEL: shuffle_v4i32_zz4z:
1297; SSSE3:       # %bb.0:
1298; SSSE3-NEXT:    xorps %xmm1, %xmm1
1299; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1300; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
1301; SSSE3-NEXT:    retq
1302;
1303; SSE41-LABEL: shuffle_v4i32_zz4z:
1304; SSE41:       # %bb.0:
1305; SSE41-NEXT:    pxor %xmm1, %xmm1
1306; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1307; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
1308; SSE41-NEXT:    retq
1309;
1310; AVX1-LABEL: shuffle_v4i32_zz4z:
1311; AVX1:       # %bb.0:
1312; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1313; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1314; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,1]
1315; AVX1-NEXT:    retq
1316;
1317; AVX2-SLOW-LABEL: shuffle_v4i32_zz4z:
1318; AVX2-SLOW:       # %bb.0:
1319; AVX2-SLOW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1320; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1321; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,1]
1322; AVX2-SLOW-NEXT:    retq
1323;
1324; AVX2-FAST-LABEL: shuffle_v4i32_zz4z:
1325; AVX2-FAST:       # %bb.0:
1326; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
1327; AVX2-FAST-NEXT:    retq
1328;
1329; AVX512VL-LABEL: shuffle_v4i32_zz4z:
1330; AVX512VL:       # %bb.0:
1331; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
1332; AVX512VL-NEXT:    retq
1333  %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
1334  ret <4 x i32> %shuffle
1335}
1336
; Mask <0,undef,undef,4> with zeroinitializer as the first operand: lane 0 is
; zero, lane 3 is %a[0], and lanes 1-2 are undef.
; All runs expect a single whole-register byte shift (pslldq/vpslldq) that
; moves the low four bytes of xmm0 to the top and fills the rest, including
; the undef lanes, with zeros.
1337define <4 x i32> @shuffle_v4i32_zuu4(<4 x i32> %a) {
1338; SSE-LABEL: shuffle_v4i32_zuu4:
1339; SSE:       # %bb.0:
1340; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
1341; SSE-NEXT:    retq
1342;
1343; AVX-LABEL: shuffle_v4i32_zuu4:
1344; AVX:       # %bb.0:
1345; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
1346; AVX-NEXT:    retq
1347  %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
1348  ret <4 x i32> %shuffle
1349}
1350
; Integer counterpart of shuffle_v4f32_z6zz: mask <0,6,2,3> with
; zeroinitializer as the first operand, so lane 1 is %a[2] and every other
; lane is zero.
; Expected lowerings differ per run: psrldq + xorps + shufps before SSE4.1,
; pshufd + pblendw on SSE4.1, vpermilps + vblendps on AVX1 and AVX2-SLOW, and a
; single zeroing vpshufb on the AVX2-FAST and AVX512VL runs (both enable
; +fast-variable-shuffle).
1351define <4 x i32> @shuffle_v4i32_z6zz(<4 x i32> %a) {
1352; SSE2-LABEL: shuffle_v4i32_z6zz:
1353; SSE2:       # %bb.0:
1354; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
1355; SSE2-NEXT:    xorps %xmm1, %xmm1
1356; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1357; SSE2-NEXT:    retq
1358;
1359; SSE3-LABEL: shuffle_v4i32_z6zz:
1360; SSE3:       # %bb.0:
1361; SSE3-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
1362; SSE3-NEXT:    xorps %xmm1, %xmm1
1363; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1364; SSE3-NEXT:    retq
1365;
1366; SSSE3-LABEL: shuffle_v4i32_z6zz:
1367; SSSE3:       # %bb.0:
1368; SSSE3-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
1369; SSSE3-NEXT:    xorps %xmm1, %xmm1
1370; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1371; SSSE3-NEXT:    retq
1372;
1373; SSE41-LABEL: shuffle_v4i32_z6zz:
1374; SSE41:       # %bb.0:
1375; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
1376; SSE41-NEXT:    pxor %xmm0, %xmm0
1377; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1378; SSE41-NEXT:    retq
1379;
1380; AVX1-LABEL: shuffle_v4i32_z6zz:
1381; AVX1:       # %bb.0:
1382; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
1383; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1384; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1385; AVX1-NEXT:    retq
1386;
1387; AVX2-SLOW-LABEL: shuffle_v4i32_z6zz:
1388; AVX2-SLOW:       # %bb.0:
1389; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
1390; AVX2-SLOW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1391; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1392; AVX2-SLOW-NEXT:    retq
1393;
1394; AVX2-FAST-LABEL: shuffle_v4i32_z6zz:
1395; AVX2-FAST:       # %bb.0:
1396; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero
1397; AVX2-FAST-NEXT:    retq
1398;
1399; AVX512VL-LABEL: shuffle_v4i32_z6zz:
1400; AVX512VL:       # %bb.0:
1401; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero
1402; AVX512VL-NEXT:    retq
1403  %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
1404  ret <4 x i32> %shuffle
1405}
1406
; Mask <7,0,1,2>: %b[3] followed by %a[0..2], i.e. four consecutive elements
; taken across the %b/%a boundary.
; The SSSE3, SSE4.1 and AVX checks expect a single palignr/vpalignr; the SSE2
; and SSE3 checks expect two shufps plus a register copy.
1407define <4 x i32> @shuffle_v4i32_7012(<4 x i32> %a, <4 x i32> %b) {
1408; SSE2-LABEL: shuffle_v4i32_7012:
1409; SSE2:       # %bb.0:
1410; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0]
1411; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
1412; SSE2-NEXT:    movaps %xmm1, %xmm0
1413; SSE2-NEXT:    retq
1414;
1415; SSE3-LABEL: shuffle_v4i32_7012:
1416; SSE3:       # %bb.0:
1417; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0]
1418; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
1419; SSE3-NEXT:    movaps %xmm1, %xmm0
1420; SSE3-NEXT:    retq
1421;
1422; SSSE3-LABEL: shuffle_v4i32_7012:
1423; SSSE3:       # %bb.0:
1424; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
1425; SSSE3-NEXT:    retq
1426;
1427; SSE41-LABEL: shuffle_v4i32_7012:
1428; SSE41:       # %bb.0:
1429; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
1430; SSE41-NEXT:    retq
1431;
1432; AVX-LABEL: shuffle_v4i32_7012:
1433; AVX:       # %bb.0:
1434; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
1435; AVX-NEXT:    retq
1436  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 0, i32 1, i32 2>
1437  ret <4 x i32> %shuffle
1438}
1439
; Mask <6,7,0,1>: the high half of %b followed by the low half of %a.
; The SSE2 and SSE3 checks expect one shufps into xmm1 plus a register copy;
; the SSSE3, SSE4.1 and AVX checks expect a single 8-byte palignr/vpalignr.
1440define <4 x i32> @shuffle_v4i32_6701(<4 x i32> %a, <4 x i32> %b) {
1441; SSE2-LABEL: shuffle_v4i32_6701:
1442; SSE2:       # %bb.0:
1443; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
1444; SSE2-NEXT:    movaps %xmm1, %xmm0
1445; SSE2-NEXT:    retq
1446;
1447; SSE3-LABEL: shuffle_v4i32_6701:
1448; SSE3:       # %bb.0:
1449; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
1450; SSE3-NEXT:    movaps %xmm1, %xmm0
1451; SSE3-NEXT:    retq
1452;
1453; SSSE3-LABEL: shuffle_v4i32_6701:
1454; SSSE3:       # %bb.0:
1455; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
1456; SSSE3-NEXT:    retq
1457;
1458; SSE41-LABEL: shuffle_v4i32_6701:
1459; SSE41:       # %bb.0:
1460; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
1461; SSE41-NEXT:    retq
1462;
1463; AVX-LABEL: shuffle_v4i32_6701:
1464; AVX:       # %bb.0:
1465; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
1466; AVX-NEXT:    retq
1467  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1468  ret <4 x i32> %shuffle
1469}
1470
; Mask <5,6,7,0>: %b[1..3] followed by %a[0].
; The SSE2 and SSE3 checks expect two shufps plus a register copy; the SSSE3,
; SSE4.1 and AVX checks expect a single 4-byte palignr/vpalignr.
1471define <4 x i32> @shuffle_v4i32_5670(<4 x i32> %a, <4 x i32> %b) {
1472; SSE2-LABEL: shuffle_v4i32_5670:
1473; SSE2:       # %bb.0:
1474; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1475; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0]
1476; SSE2-NEXT:    movaps %xmm1, %xmm0
1477; SSE2-NEXT:    retq
1478;
1479; SSE3-LABEL: shuffle_v4i32_5670:
1480; SSE3:       # %bb.0:
1481; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1482; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0]
1483; SSE3-NEXT:    movaps %xmm1, %xmm0
1484; SSE3-NEXT:    retq
1485;
1486; SSSE3-LABEL: shuffle_v4i32_5670:
1487; SSSE3:       # %bb.0:
1488; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
1489; SSSE3-NEXT:    retq
1490;
1491; SSE41-LABEL: shuffle_v4i32_5670:
1492; SSE41:       # %bb.0:
1493; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
1494; SSE41-NEXT:    retq
1495;
1496; AVX-LABEL: shuffle_v4i32_5670:
1497; AVX:       # %bb.0:
1498; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
1499; AVX-NEXT:    retq
1500  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 5, i32 6, i32 7, i32 0>
1501  ret <4 x i32> %shuffle
1502}
1503
; Mask <1,2,3,4>: %a[1..3] followed by %b[0].
; The SSE2 and SSE3 checks expect two shufps. The SSSE3 and SSE4.1 checks
; expect a palignr whose destination is xmm1, so a movdqa back into xmm0
; follows; the AVX checks expect one vpalignr writing xmm0 directly.
1504define <4 x i32> @shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b) {
1505; SSE2-LABEL: shuffle_v4i32_1234:
1506; SSE2:       # %bb.0:
1507; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
1508; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
1509; SSE2-NEXT:    retq
1510;
1511; SSE3-LABEL: shuffle_v4i32_1234:
1512; SSE3:       # %bb.0:
1513; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
1514; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
1515; SSE3-NEXT:    retq
1516;
1517; SSSE3-LABEL: shuffle_v4i32_1234:
1518; SSSE3:       # %bb.0:
1519; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
1520; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1521; SSSE3-NEXT:    retq
1522;
1523; SSE41-LABEL: shuffle_v4i32_1234:
1524; SSE41:       # %bb.0:
1525; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
1526; SSE41-NEXT:    movdqa %xmm1, %xmm0
1527; SSE41-NEXT:    retq
1528;
1529; AVX-LABEL: shuffle_v4i32_1234:
1530; AVX:       # %bb.0:
1531; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
1532; AVX-NEXT:    retq
1533  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
1534  ret <4 x i32> %shuffle
1535}
1536
; Mask <2,3,4,5>: the high half of %a followed by the low half of %b.
; The SSE2 and SSE3 checks expect a single shufps. The SSSE3 and SSE4.1 checks
; expect a palignr into xmm1 plus a movdqa back into xmm0; the AVX checks
; expect one vpalignr writing xmm0 directly.
1537define <4 x i32> @shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b) {
1538; SSE2-LABEL: shuffle_v4i32_2345:
1539; SSE2:       # %bb.0:
1540; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
1541; SSE2-NEXT:    retq
1542;
1543; SSE3-LABEL: shuffle_v4i32_2345:
1544; SSE3:       # %bb.0:
1545; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
1546; SSE3-NEXT:    retq
1547;
1548; SSSE3-LABEL: shuffle_v4i32_2345:
1549; SSSE3:       # %bb.0:
1550; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
1551; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1552; SSSE3-NEXT:    retq
1553;
1554; SSE41-LABEL: shuffle_v4i32_2345:
1555; SSE41:       # %bb.0:
1556; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
1557; SSE41-NEXT:    movdqa %xmm1, %xmm0
1558; SSE41-NEXT:    retq
1559;
1560; AVX-LABEL: shuffle_v4i32_2345:
1561; AVX:       # %bb.0:
1562; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
1563; AVX-NEXT:    retq
1564  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
1565  ret <4 x i32> %shuffle
1566}
1567
1568; PR22391
; Regression test for the bug report referenced above. Two chained shuffles:
; %s1 = [%a[0], %a[1], %a[2], %a[2]] and %s2 takes <3,4,5,6> of (%s1, %b),
; giving [%a[2], %b[0], %b[1], %b[2]].
; The SSE2 and SSE3 checks expect two shufps. The SSSE3, SSE4.1 and AVX1OR2
; checks expect a pshufd splat of lane 2 followed by a palignr. The AVX512VL
; checks expect a two-source vpermi2d driven by the constant index vector
; [6,0,1,2], plus a register copy.
1569define <4 x i32> @shuffle_v4i32_2456(<4 x i32> %a, <4 x i32> %b) {
1570; SSE2-LABEL: shuffle_v4i32_2456:
1571; SSE2:       # %bb.0:
1572; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
1573; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
1574; SSE2-NEXT:    retq
1575;
1576; SSE3-LABEL: shuffle_v4i32_2456:
1577; SSE3:       # %bb.0:
1578; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
1579; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
1580; SSE3-NEXT:    retq
1581;
1582; SSSE3-LABEL: shuffle_v4i32_2456:
1583; SSSE3:       # %bb.0:
1584; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
1585; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1586; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1587; SSSE3-NEXT:    retq
1588;
1589; SSE41-LABEL: shuffle_v4i32_2456:
1590; SSE41:       # %bb.0:
1591; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
1592; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1593; SSE41-NEXT:    movdqa %xmm1, %xmm0
1594; SSE41-NEXT:    retq
1595;
1596; AVX1OR2-LABEL: shuffle_v4i32_2456:
1597; AVX1OR2:       # %bb.0:
1598; AVX1OR2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
1599; AVX1OR2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1600; AVX1OR2-NEXT:    retq
1601;
1602; AVX512VL-LABEL: shuffle_v4i32_2456:
1603; AVX512VL:       # %bb.0:
1604; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,0,1,2]
1605; AVX512VL-NEXT:    vpermi2d %xmm0, %xmm1, %xmm2
1606; AVX512VL-NEXT:    vmovdqa %xmm2, %xmm0
1607; AVX512VL-NEXT:    retq
1608  %s1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
1609  %s2 = shufflevector <4 x i32> %s1, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
1610  ret <4 x i32> %s2
1611}
1612
; Mask <4,0,undef,1>: %b[0], %a[0], an undef lane, then %a[1].
; All runs expect a single low-half interleave of xmm1 with xmm0
; (unpcklps/vunpcklps), which fills the undef lane with xmm1[1]. The SSE form
; writes xmm1 in place and therefore needs a trailing movaps into xmm0.
1613define <4 x i32> @shuffle_v4i32_40u1(<4 x i32> %a, <4 x i32> %b) {
1614; SSE-LABEL: shuffle_v4i32_40u1:
1615; SSE:       # %bb.0:
1616; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1617; SSE-NEXT:    movaps %xmm1, %xmm0
1618; SSE-NEXT:    retq
1619;
1620; AVX-LABEL: shuffle_v4i32_40u1:
1621; AVX:       # %bb.0:
1622; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1623; AVX-NEXT:    retq
1624  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 undef, i32 1>
1625  ret <4 x i32> %shuffle
1626}
1627
; Mask <3,4,5,6>: %a[3] followed by %b[0..2].
; The SSE2 and SSE3 checks expect two shufps. The SSSE3 and SSE4.1 checks
; expect a 12-byte palignr into xmm1 plus a movdqa back into xmm0; the AVX
; checks expect one vpalignr writing xmm0 directly.
1628define <4 x i32> @shuffle_v4i32_3456(<4 x i32> %a, <4 x i32> %b) {
1629; SSE2-LABEL: shuffle_v4i32_3456:
1630; SSE2:       # %bb.0:
1631; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0]
1632; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
1633; SSE2-NEXT:    retq
1634;
1635; SSE3-LABEL: shuffle_v4i32_3456:
1636; SSE3:       # %bb.0:
1637; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0]
1638; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
1639; SSE3-NEXT:    retq
1640;
1641; SSSE3-LABEL: shuffle_v4i32_3456:
1642; SSSE3:       # %bb.0:
1643; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1644; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1645; SSSE3-NEXT:    retq
1646;
1647; SSE41-LABEL: shuffle_v4i32_3456:
1648; SSE41:       # %bb.0:
1649; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1650; SSE41-NEXT:    movdqa %xmm1, %xmm0
1651; SSE41-NEXT:    retq
1652;
1653; AVX-LABEL: shuffle_v4i32_3456:
1654; AVX:       # %bb.0:
1655; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1656; AVX-NEXT:    retq
1657  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
1658  ret <4 x i32> %shuffle
1659}
1660
; Mask <0,undef,1,undef>: lanes 0 and 1 of %a are moved to the even lanes and
; the odd lanes are undef; no lane of %b is selected. The checks expect pshufd
; before SSE4.1 and (v)pmovzxdq on SSE4.1 and the AVX configurations.
1661define <4 x i32> @shuffle_v4i32_0u1u(<4 x i32> %a, <4 x i32> %b) {
1662; SSE2-LABEL: shuffle_v4i32_0u1u:
1663; SSE2:       # %bb.0:
1664; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1665; SSE2-NEXT:    retq
1666;
1667; SSE3-LABEL: shuffle_v4i32_0u1u:
1668; SSE3:       # %bb.0:
1669; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1670; SSE3-NEXT:    retq
1671;
1672; SSSE3-LABEL: shuffle_v4i32_0u1u:
1673; SSSE3:       # %bb.0:
1674; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1675; SSSE3-NEXT:    retq
1676;
1677; SSE41-LABEL: shuffle_v4i32_0u1u:
1678; SSE41:       # %bb.0:
1679; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1680; SSE41-NEXT:    retq
1681;
1682; AVX-LABEL: shuffle_v4i32_0u1u:
1683; AVX:       # %bb.0:
1684; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1685; AVX-NEXT:    retq
1686  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
1687  ret <4 x i32> %shuffle
1688}
1689
; Mask <0,5,1,7> against zeroinitializer: lanes 0 and 1 of %a interleaved with
; zero lanes (a0,0,a1,0). The checks expect xorps + unpcklps before SSE4.1 and
; a single (v)pmovzxdq on SSE4.1 and the AVX configurations.
1690define <4 x i32> @shuffle_v4i32_0z1z(<4 x i32> %a) {
1691; SSE2-LABEL: shuffle_v4i32_0z1z:
1692; SSE2:       # %bb.0:
1693; SSE2-NEXT:    xorps %xmm1, %xmm1
1694; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1695; SSE2-NEXT:    retq
1696;
1697; SSE3-LABEL: shuffle_v4i32_0z1z:
1698; SSE3:       # %bb.0:
1699; SSE3-NEXT:    xorps %xmm1, %xmm1
1700; SSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1701; SSE3-NEXT:    retq
1702;
1703; SSSE3-LABEL: shuffle_v4i32_0z1z:
1704; SSSE3:       # %bb.0:
1705; SSSE3-NEXT:    xorps %xmm1, %xmm1
1706; SSSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1707; SSSE3-NEXT:    retq
1708;
1709; SSE41-LABEL: shuffle_v4i32_0z1z:
1710; SSE41:       # %bb.0:
1711; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1712; SSE41-NEXT:    retq
1713;
1714; AVX-LABEL: shuffle_v4i32_0z1z:
1715; AVX:       # %bb.0:
1716; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1717; AVX-NEXT:    retq
1718  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
1719  ret <4 x i32> %shuffle
1720}
1721
; Mask <0,1,7,undef> against zeroinitializer: keeps lanes 0-1 of %a, zeroes
; lane 2 and leaves lane 3 undef. The checks expect a single (v)movq that
; keeps the low 64 bits and zeroes the upper half.
1722define <4 x i32> @shuffle_v4i32_01zu(<4 x i32> %a) {
1723; SSE-LABEL: shuffle_v4i32_01zu:
1724; SSE:       # %bb.0:
1725; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
1726; SSE-NEXT:    retq
1727;
1728; AVX-LABEL: shuffle_v4i32_01zu:
1729; AVX:       # %bb.0:
1730; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
1731; AVX-NEXT:    retq
1732  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 7, i32 undef>
1733  ret <4 x i32> %shuffle
1734}
1735
; Mask <0,4,2,3> against zeroinitializer: %a with lane 1 replaced by zero.
; The checks expect an andps with a constant-pool mask before SSE4.1 and a
; zeroed register plus (v)blendps on SSE4.1 and the AVX configurations.
1736define <4 x i32> @shuffle_v4i32_0z23(<4 x i32> %a) {
1737; SSE2-LABEL: shuffle_v4i32_0z23:
1738; SSE2:       # %bb.0:
1739; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
1740; SSE2-NEXT:    retq
1741;
1742; SSE3-LABEL: shuffle_v4i32_0z23:
1743; SSE3:       # %bb.0:
1744; SSE3-NEXT:    andps {{.*}}(%rip), %xmm0
1745; SSE3-NEXT:    retq
1746;
1747; SSSE3-LABEL: shuffle_v4i32_0z23:
1748; SSSE3:       # %bb.0:
1749; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
1750; SSSE3-NEXT:    retq
1751;
1752; SSE41-LABEL: shuffle_v4i32_0z23:
1753; SSE41:       # %bb.0:
1754; SSE41-NEXT:    xorps %xmm1, %xmm1
1755; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1756; SSE41-NEXT:    retq
1757;
1758; AVX-LABEL: shuffle_v4i32_0z23:
1759; AVX:       # %bb.0:
1760; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1761; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1762; AVX-NEXT:    retq
1763  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
1764  ret <4 x i32> %shuffle
1765}
1766
; Mask <0,1,4,3> against zeroinitializer: %a with lane 2 replaced by zero.
; The checks expect an andps with a constant-pool mask before SSE4.1 and a
; zeroed register plus (v)blendps on SSE4.1 and the AVX configurations.
1767define <4 x i32> @shuffle_v4i32_01z3(<4 x i32> %a) {
1768; SSE2-LABEL: shuffle_v4i32_01z3:
1769; SSE2:       # %bb.0:
1770; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
1771; SSE2-NEXT:    retq
1772;
1773; SSE3-LABEL: shuffle_v4i32_01z3:
1774; SSE3:       # %bb.0:
1775; SSE3-NEXT:    andps {{.*}}(%rip), %xmm0
1776; SSE3-NEXT:    retq
1777;
1778; SSSE3-LABEL: shuffle_v4i32_01z3:
1779; SSSE3:       # %bb.0:
1780; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
1781; SSSE3-NEXT:    retq
1782;
1783; SSE41-LABEL: shuffle_v4i32_01z3:
1784; SSE41:       # %bb.0:
1785; SSE41-NEXT:    xorps %xmm1, %xmm1
1786; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
1787; SSE41-NEXT:    retq
1788;
1789; AVX-LABEL: shuffle_v4i32_01z3:
1790; AVX:       # %bb.0:
1791; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1792; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
1793; AVX-NEXT:    retq
1794  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
1795  ret <4 x i32> %shuffle
1796}
1797
; Mask <0,1,2,7> against zeroinitializer: %a with lane 3 replaced by zero.
; The checks expect an andps with a constant-pool mask before SSE4.1 and a
; zeroed register plus (v)blendps on SSE4.1 and the AVX configurations.
1798define <4 x i32> @shuffle_v4i32_012z(<4 x i32> %a) {
1799; SSE2-LABEL: shuffle_v4i32_012z:
1800; SSE2:       # %bb.0:
1801; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
1802; SSE2-NEXT:    retq
1803;
1804; SSE3-LABEL: shuffle_v4i32_012z:
1805; SSE3:       # %bb.0:
1806; SSE3-NEXT:    andps {{.*}}(%rip), %xmm0
1807; SSE3-NEXT:    retq
1808;
1809; SSSE3-LABEL: shuffle_v4i32_012z:
1810; SSSE3:       # %bb.0:
1811; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
1812; SSSE3-NEXT:    retq
1813;
1814; SSE41-LABEL: shuffle_v4i32_012z:
1815; SSE41:       # %bb.0:
1816; SSE41-NEXT:    xorps %xmm1, %xmm1
1817; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
1818; SSE41-NEXT:    retq
1819;
1820; AVX-LABEL: shuffle_v4i32_012z:
1821; AVX:       # %bb.0:
1822; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1823; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
1824; AVX-NEXT:    retq
1825  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1826  ret <4 x i32> %shuffle
1827}
1828
; Mask <0,4,4,3> against zeroinitializer: %a with lanes 1 and 2 replaced by
; zero. The checks expect an andps with a constant-pool mask before SSE4.1 and
; a zeroed register plus (v)blendps on SSE4.1 and the AVX configurations.
1829define <4 x i32> @shuffle_v4i32_0zz3(<4 x i32> %a) {
1830; SSE2-LABEL: shuffle_v4i32_0zz3:
1831; SSE2:       # %bb.0:
1832; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
1833; SSE2-NEXT:    retq
1834;
1835; SSE3-LABEL: shuffle_v4i32_0zz3:
1836; SSE3:       # %bb.0:
1837; SSE3-NEXT:    andps {{.*}}(%rip), %xmm0
1838; SSE3-NEXT:    retq
1839;
1840; SSSE3-LABEL: shuffle_v4i32_0zz3:
1841; SSSE3:       # %bb.0:
1842; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
1843; SSSE3-NEXT:    retq
1844;
1845; SSE41-LABEL: shuffle_v4i32_0zz3:
1846; SSE41:       # %bb.0:
1847; SSE41-NEXT:    xorps %xmm1, %xmm1
1848; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1849; SSE41-NEXT:    retq
1850;
1851; AVX-LABEL: shuffle_v4i32_0zz3:
1852; AVX:       # %bb.0:
1853; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1854; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1855; AVX-NEXT:    retq
1856  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 3>
1857  ret <4 x i32> %shuffle
1858}
1859
; A <4 x i32> shuffle <1,5,0,4> followed, through a bitcast to <2 x double>,
; by a swap of the two 64-bit halves. The two shuffles compose to the mask
; <0,4,1,5>, and the checks expect them to fold into a single (v)unpcklps.
1860define <4 x i32> @shuffle_v4i32_bitcast_0415(<4 x i32> %a, <4 x i32> %b) {
1861; SSE-LABEL: shuffle_v4i32_bitcast_0415:
1862; SSE:       # %bb.0:
1863; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1864; SSE-NEXT:    retq
1865;
1866; AVX-LABEL: shuffle_v4i32_bitcast_0415:
1867; AVX:       # %bb.0:
1868; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1869; AVX-NEXT:    retq
1870  %shuffle32 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 0, i32 4>
1871  %bitcast64 = bitcast <4 x i32> %shuffle32 to <2 x double>
1872  %shuffle64 = shufflevector <2 x double> %bitcast64, <2 x double> undef, <2 x i32> <i32 1, i32 0>
1873  %bitcast32 = bitcast <2 x double> %shuffle64 to <4 x i32>
1874  ret <4 x i32> %bitcast32
1875}
1876
; Duplicates the low lanes of %b (<0,0,1,1>), then, viewing both vectors as
; <2 x double>, joins the low half of that result with the low half of %a.
; The result is (b0,b0,a0,a1); the checks expect a single (v)shufps.
1877define <4 x float> @shuffle_v4f32_bitcast_4401(<4 x float> %a, <4 x i32> %b) {
1878; SSE-LABEL: shuffle_v4f32_bitcast_4401:
1879; SSE:       # %bb.0:
1880; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1]
1881; SSE-NEXT:    movaps %xmm1, %xmm0
1882; SSE-NEXT:    retq
1883;
1884; AVX-LABEL: shuffle_v4f32_bitcast_4401:
1885; AVX:       # %bb.0:
1886; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[0,1]
1887; AVX-NEXT:    retq
1888  %1 = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
1889  %2 = bitcast <4 x i32> %1 to <2 x double>
1890  %3 = bitcast <4 x float> %a to <2 x double>
1891  %4 = shufflevector <2 x double> %2, <2 x double> %3, <2 x i32> <i32 0, i32 2>
1892  %5 = bitcast <2 x double> %4 to <4 x float>
1893  ret <4 x float> %5
1894}
1895
; Duplicates the low lanes of %a (<0,0,1,1>), then shuffles that result with
; %b (bitcast from <4 x i32>) using mask <1,0,4,5>. The result is
; (a0,a0,b0,b1); the checks expect a single (v)shufps.
1896define <4 x float> @shuffle_v4f32_bitcast_0045(<4 x float> %a, <4 x i32> %b) {
1897; SSE-LABEL: shuffle_v4f32_bitcast_0045:
1898; SSE:       # %bb.0:
1899; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1]
1900; SSE-NEXT:    retq
1901;
1902; AVX-LABEL: shuffle_v4f32_bitcast_0045:
1903; AVX:       # %bb.0:
1904; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1]
1905; AVX-NEXT:    retq
1906  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
1907  %2 = bitcast <4 x i32> %b to <4 x float>
1908  %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 1, i32 0, i32 4, i32 5>
1909  ret <4 x float> %3
1910}
1911
; A lane select written as and/and/or on <4 x i32> bit masks rather than as a
; shufflevector: lanes 1-2 are kept from %a and lanes 0 and 3 from %b, giving
; (b0,a1,a2,b3). The checks expect two shufps before SSE4.1 and a single
; (v)blendps on SSE4.1 and the AVX configurations.
1912define <4 x float> @mask_v4f32_4127(<4 x float> %a, <4 x float> %b) {
1913; SSE2-LABEL: mask_v4f32_4127:
1914; SSE2:       # %bb.0:
1915; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
1916; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1917; SSE2-NEXT:    movaps %xmm1, %xmm0
1918; SSE2-NEXT:    retq
1919;
1920; SSE3-LABEL: mask_v4f32_4127:
1921; SSE3:       # %bb.0:
1922; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
1923; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1924; SSE3-NEXT:    movaps %xmm1, %xmm0
1925; SSE3-NEXT:    retq
1926;
1927; SSSE3-LABEL: mask_v4f32_4127:
1928; SSSE3:       # %bb.0:
1929; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
1930; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1931; SSSE3-NEXT:    movaps %xmm1, %xmm0
1932; SSSE3-NEXT:    retq
1933;
1934; SSE41-LABEL: mask_v4f32_4127:
1935; SSE41:       # %bb.0:
1936; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
1937; SSE41-NEXT:    retq
1938;
1939; AVX-LABEL: mask_v4f32_4127:
1940; AVX:       # %bb.0:
1941; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
1942; AVX-NEXT:    retq
1943  %1 = bitcast <4 x float> %a to <4 x i32>
1944  %2 = bitcast <4 x float> %b to <4 x i32>
1945  %3 = and <4 x i32> %1, <i32  0, i32 -1, i32 -1, i32  0>
1946  %4 = and <4 x i32> %2, <i32 -1, i32  0, i32  0, i32 -1>
1947  %5 = or <4 x i32> %4, %3
1948  %6 = bitcast <4 x i32> %5 to <4 x float>
1949  ret <4 x float> %6
1950}
1951
; A lane select written as and/and/or on <2 x i64> bit masks. The mask on %a
; (0, -4294967296) keeps only the top 32 bits, i.e. 32-bit lane 3; the mask on
; %b (-1, 4294967295) keeps the low 96 bits, i.e. 32-bit lanes 0-2. The result
; is therefore (b0,b1,b2,a3), matching the blendps operands in the checks.
; NOTE(review): the "0127" suffix does not match the a=0-3/b=4-7 numbering
; used by the shufflevector tests; it appears to number %b's lanes first -
; confirm.
1952define <4 x float> @mask_v4f32_0127(<4 x float> %a, <4 x float> %b) {
1953; SSE2-LABEL: mask_v4f32_0127:
1954; SSE2:       # %bb.0:
1955; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1956; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1957; SSE2-NEXT:    movaps %xmm1, %xmm0
1958; SSE2-NEXT:    retq
1959;
1960; SSE3-LABEL: mask_v4f32_0127:
1961; SSE3:       # %bb.0:
1962; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1963; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1964; SSE3-NEXT:    movaps %xmm1, %xmm0
1965; SSE3-NEXT:    retq
1966;
1967; SSSE3-LABEL: mask_v4f32_0127:
1968; SSSE3:       # %bb.0:
1969; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1970; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1971; SSSE3-NEXT:    movaps %xmm1, %xmm0
1972; SSSE3-NEXT:    retq
1973;
1974; SSE41-LABEL: mask_v4f32_0127:
1975; SSE41:       # %bb.0:
1976; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
1977; SSE41-NEXT:    retq
1978;
1979; AVX-LABEL: mask_v4f32_0127:
1980; AVX:       # %bb.0:
1981; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
1982; AVX-NEXT:    retq
1983  %1 = bitcast <4 x float> %a to <2 x i64>
1984  %2 = bitcast <4 x float> %b to <2 x i64>
1985  %3 = and <2 x i64> %1, <i64 0, i64 -4294967296>
1986  %4 = and <2 x i64> %2, <i64 -1, i64 4294967295>
1987  %5 = or <2 x i64> %4, %3
1988  %6 = bitcast <2 x i64> %5 to <4 x float>
1989  ret <4 x float> %6
1990}
1991
; Integer-typed variant of the <2 x i64> and/and/or lane select: the mask on
; %a (0, -4294967296) keeps only 32-bit lane 3 and the mask on %b
; (-1, 4294967295) keeps 32-bit lanes 0-2, so the result is (b0,b1,b2,a3).
; The checks expect the same shufps / blendps lowering as the float version.
; NOTE(review): the "0127" suffix does not match the a=0-3/b=4-7 numbering
; used by the shufflevector tests; it appears to number %b's lanes first -
; confirm.
1992define <4 x i32> @mask_v4i32_0127(<4 x i32> %a, <4 x i32> %b) {
1993; SSE2-LABEL: mask_v4i32_0127:
1994; SSE2:       # %bb.0:
1995; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1996; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1997; SSE2-NEXT:    movaps %xmm1, %xmm0
1998; SSE2-NEXT:    retq
1999;
2000; SSE3-LABEL: mask_v4i32_0127:
2001; SSE3:       # %bb.0:
2002; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
2003; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
2004; SSE3-NEXT:    movaps %xmm1, %xmm0
2005; SSE3-NEXT:    retq
2006;
2007; SSSE3-LABEL: mask_v4i32_0127:
2008; SSSE3:       # %bb.0:
2009; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
2010; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
2011; SSSE3-NEXT:    movaps %xmm1, %xmm0
2012; SSSE3-NEXT:    retq
2013;
2014; SSE41-LABEL: mask_v4i32_0127:
2015; SSE41:       # %bb.0:
2016; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
2017; SSE41-NEXT:    retq
2018;
2019; AVX-LABEL: mask_v4i32_0127:
2020; AVX:       # %bb.0:
2021; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
2022; AVX-NEXT:    retq
2023  %1 = bitcast <4 x i32> %a to <2 x i64>
2024  %2 = bitcast <4 x i32> %b to <2 x i64>
2025  %3 = and <2 x i64> %1, <i64 0, i64 -4294967296>
2026  %4 = and <2 x i64> %2, <i64 -1, i64 4294967295>
2027  %5 = or <2 x i64> %4, %3
2028  %6 = bitcast <2 x i64> %5 to <4 x i32>
2029  ret <4 x i32> %6
2030}
2031
; Loads a <2 x float> (align 1) and repeats it into both halves of a
; <4 x float> with mask <0,1,0,1>. The checks expect movsd + movlhps on SSE2
; and a single memory-operand (v)movddup on every other configuration.
2032define <4 x float> @broadcast_v4f32_0101_from_v2f32(<2 x float>* %x) {
2033; SSE2-LABEL: broadcast_v4f32_0101_from_v2f32:
2034; SSE2:       # %bb.0:
2035; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
2036; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
2037; SSE2-NEXT:    retq
2038;
2039; SSE3-LABEL: broadcast_v4f32_0101_from_v2f32:
2040; SSE3:       # %bb.0:
2041; SSE3-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
2042; SSE3-NEXT:    retq
2043;
2044; SSSE3-LABEL: broadcast_v4f32_0101_from_v2f32:
2045; SSSE3:       # %bb.0:
2046; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
2047; SSSE3-NEXT:    retq
2048;
2049; SSE41-LABEL: broadcast_v4f32_0101_from_v2f32:
2050; SSE41:       # %bb.0:
2051; SSE41-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
2052; SSE41-NEXT:    retq
2053;
2054; AVX-LABEL: broadcast_v4f32_0101_from_v2f32:
2055; AVX:       # %bb.0:
2056; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
2057; AVX-NEXT:    retq
2058  %1 = load <2 x float>, <2 x float>* %x, align 1
2059  %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
2060  ret <4 x float> %2
2061}
2062
; Extracts lane 3 of %a1 and inserts it into lane 0 of %a0, giving
; (a1[3],a0[1],a0[2],a0[3]). The checks expect the value to pass through %eax
; on every configuration: pshufd/movd/movd/movss before SSE4.1, and
; (v)extractps + (v)pinsrd on SSE4.1 and the AVX configurations.
2063define <4 x i32> @extract3_insert0_v4i32_7123(<4 x i32> %a0, <4 x i32> %a1) {
2064; SSE2-LABEL: extract3_insert0_v4i32_7123:
2065; SSE2:       # %bb.0:
2066; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
2067; SSE2-NEXT:    movd %xmm1, %eax
2068; SSE2-NEXT:    movd %eax, %xmm1
2069; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
2070; SSE2-NEXT:    retq
2071;
2072; SSE3-LABEL: extract3_insert0_v4i32_7123:
2073; SSE3:       # %bb.0:
2074; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
2075; SSE3-NEXT:    movd %xmm1, %eax
2076; SSE3-NEXT:    movd %eax, %xmm1
2077; SSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
2078; SSE3-NEXT:    retq
2079;
2080; SSSE3-LABEL: extract3_insert0_v4i32_7123:
2081; SSSE3:       # %bb.0:
2082; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
2083; SSSE3-NEXT:    movd %xmm1, %eax
2084; SSSE3-NEXT:    movd %eax, %xmm1
2085; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
2086; SSSE3-NEXT:    retq
2087;
2088; SSE41-LABEL: extract3_insert0_v4i32_7123:
2089; SSE41:       # %bb.0:
2090; SSE41-NEXT:    extractps $3, %xmm1, %eax
2091; SSE41-NEXT:    pinsrd $0, %eax, %xmm0
2092; SSE41-NEXT:    retq
2093;
2094; AVX-LABEL: extract3_insert0_v4i32_7123:
2095; AVX:       # %bb.0:
2096; AVX-NEXT:    vextractps $3, %xmm1, %eax
2097; AVX-NEXT:    vpinsrd $0, %eax, %xmm0, %xmm0
2098; AVX-NEXT:    retq
2099  %1 = extractelement <4 x i32> %a1, i32 3
2100  %2 = insertelement <4 x i32> %a0, i32 %1, i32 0
2101  ret <4 x i32> %2
2102}
2103
; Extracts lane 3 of %a1 and inserts it into lane 3 of %a0, giving
; (a0[0],a0[1],a0[2],a1[3]). Because source and destination lane match, the
; checks expect a pure vector lowering: two shufps before SSE4.1 and a single
; (v)blendps on SSE4.1 and the AVX configurations.
2104define <4 x i32> @extract3_insert3_v4i32_0127(<4 x i32> %a0, <4 x i32> %a1) {
2105; SSE2-LABEL: extract3_insert3_v4i32_0127:
2106; SSE2:       # %bb.0:
2107; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2108; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
2109; SSE2-NEXT:    retq
2110;
2111; SSE3-LABEL: extract3_insert3_v4i32_0127:
2112; SSE3:       # %bb.0:
2113; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2114; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
2115; SSE3-NEXT:    retq
2116;
2117; SSSE3-LABEL: extract3_insert3_v4i32_0127:
2118; SSSE3:       # %bb.0:
2119; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2120; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
2121; SSSE3-NEXT:    retq
2122;
2123; SSE41-LABEL: extract3_insert3_v4i32_0127:
2124; SSE41:       # %bb.0:
2125; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
2126; SSE41-NEXT:    retq
2127;
2128; AVX-LABEL: extract3_insert3_v4i32_0127:
2129; AVX:       # %bb.0:
2130; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
2131; AVX-NEXT:    retq
2132  %1 = extractelement <4 x i32> %a1, i32 3
2133  %2 = insertelement <4 x i32> %a0, i32 %1, i32 3
2134  ret <4 x i32> %2
2135}
2136
; Inserts the scalar i32 argument into lane 0 and takes lanes 1-3 from
; zeroinitializer (mask <0,5,6,7>). The checks expect a single (v)movd from
; %edi with no separate zeroing instruction.
2137define <4 x i32> @insert_reg_and_zero_v4i32(i32 %a) {
2138; SSE-LABEL: insert_reg_and_zero_v4i32:
2139; SSE:       # %bb.0:
2140; SSE-NEXT:    movd %edi, %xmm0
2141; SSE-NEXT:    retq
2142;
2143; AVX-LABEL: insert_reg_and_zero_v4i32:
2144; AVX:       # %bb.0:
2145; AVX-NEXT:    vmovd %edi, %xmm0
2146; AVX-NEXT:    retq
2147  %v = insertelement <4 x i32> undef, i32 %a, i32 0
2148  %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2149  ret <4 x i32> %shuffle
2150}
2151
; Loads an i32, inserts it into lane 0 and takes lanes 1-3 from
; zeroinitializer (mask <0,5,6,7>). The checks expect the load to fold into a
; single (v)movss that zeroes the upper lanes.
2152define <4 x i32> @insert_mem_and_zero_v4i32(i32* %ptr) {
2153; SSE-LABEL: insert_mem_and_zero_v4i32:
2154; SSE:       # %bb.0:
2155; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2156; SSE-NEXT:    retq
2157;
2158; AVX-LABEL: insert_mem_and_zero_v4i32:
2159; AVX:       # %bb.0:
2160; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2161; AVX-NEXT:    retq
2162  %a = load i32, i32* %ptr
2163  %v = insertelement <4 x i32> undef, i32 %a, i32 0
2164  %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2165  ret <4 x i32> %shuffle
2166}
2167
; Inserts the scalar float argument (already in xmm0) into lane 0 and takes
; lanes 1-3 from zeroinitializer (mask <0,5,6,7>). The checks expect a zeroed
; register combined via movss before SSE4.1 and via (v)blendps on SSE4.1 and
; the AVX configurations.
2168define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
2169; SSE2-LABEL: insert_reg_and_zero_v4f32:
2170; SSE2:       # %bb.0:
2171; SSE2-NEXT:    xorps %xmm1, %xmm1
2172; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2173; SSE2-NEXT:    movaps %xmm1, %xmm0
2174; SSE2-NEXT:    retq
2175;
2176; SSE3-LABEL: insert_reg_and_zero_v4f32:
2177; SSE3:       # %bb.0:
2178; SSE3-NEXT:    xorps %xmm1, %xmm1
2179; SSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2180; SSE3-NEXT:    movaps %xmm1, %xmm0
2181; SSE3-NEXT:    retq
2182;
2183; SSSE3-LABEL: insert_reg_and_zero_v4f32:
2184; SSSE3:       # %bb.0:
2185; SSSE3-NEXT:    xorps %xmm1, %xmm1
2186; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2187; SSSE3-NEXT:    movaps %xmm1, %xmm0
2188; SSSE3-NEXT:    retq
2189;
2190; SSE41-LABEL: insert_reg_and_zero_v4f32:
2191; SSE41:       # %bb.0:
2192; SSE41-NEXT:    xorps %xmm1, %xmm1
2193; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
2194; SSE41-NEXT:    retq
2195;
2196; AVX-LABEL: insert_reg_and_zero_v4f32:
2197; AVX:       # %bb.0:
2198; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
2199; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
2200; AVX-NEXT:    retq
2201  %v = insertelement <4 x float> undef, float %a, i32 0
2202  %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2203  ret <4 x float> %shuffle
2204}
2205
; Loads a float, inserts it into lane 0 and takes lanes 1-3 from
; zeroinitializer (mask <0,5,6,7>). The checks expect the load to fold into a
; single (v)movss that zeroes the upper lanes.
2206define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) {
2207; SSE-LABEL: insert_mem_and_zero_v4f32:
2208; SSE:       # %bb.0:
2209; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2210; SSE-NEXT:    retq
2211;
2212; AVX-LABEL: insert_mem_and_zero_v4f32:
2213; AVX:       # %bb.0:
2214; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2215; AVX-NEXT:    retq
2216  %a = load float, float* %ptr
2217  %v = insertelement <4 x float> undef, float %a, i32 0
2218  %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2219  ret <4 x float> %shuffle
2220}
2221
; Bitcasts the i64 argument to <2 x i32>, widens it to four lanes and blends
; it into the low half of %b with mask <0,1,6,7>. The checks expect a movq
; from %rdi followed by movsd before SSE4.1, (v)pblendw on SSE4.1 and AVX1,
; and vpblendd on the AVX2 and AVX512VL configurations.
2222define <4 x i32> @insert_reg_lo_v4i32(i64 %a, <4 x i32> %b) {
2223; SSE2-LABEL: insert_reg_lo_v4i32:
2224; SSE2:       # %bb.0:
2225; SSE2-NEXT:    movq %rdi, %xmm1
2226; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2227; SSE2-NEXT:    retq
2228;
2229; SSE3-LABEL: insert_reg_lo_v4i32:
2230; SSE3:       # %bb.0:
2231; SSE3-NEXT:    movq %rdi, %xmm1
2232; SSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2233; SSE3-NEXT:    retq
2234;
2235; SSSE3-LABEL: insert_reg_lo_v4i32:
2236; SSSE3:       # %bb.0:
2237; SSSE3-NEXT:    movq %rdi, %xmm1
2238; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2239; SSSE3-NEXT:    retq
2240;
2241; SSE41-LABEL: insert_reg_lo_v4i32:
2242; SSE41:       # %bb.0:
2243; SSE41-NEXT:    movq %rdi, %xmm1
2244; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2245; SSE41-NEXT:    retq
2246;
2247; AVX1-LABEL: insert_reg_lo_v4i32:
2248; AVX1:       # %bb.0:
2249; AVX1-NEXT:    vmovq %rdi, %xmm1
2250; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2251; AVX1-NEXT:    retq
2252;
2253; AVX2OR512VL-LABEL: insert_reg_lo_v4i32:
2254; AVX2OR512VL:       # %bb.0:
2255; AVX2OR512VL-NEXT:    vmovq %rdi, %xmm1
2256; AVX2OR512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2257; AVX2OR512VL-NEXT:    retq
2258  %a.cast = bitcast i64 %a to <2 x i32>
2259  %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2260  %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2261  ret <4 x i32> %shuffle
2262}
2263
; Loads a <2 x i32>, widens it to four lanes and blends it into the low half
; of %b with mask <0,1,6,7>. The checks expect a single memory-operand movlps
; before SSE4.1, and a (v)movsd load followed by (v)blendps on SSE4.1 and the
; AVX configurations.
2264define <4 x i32> @insert_mem_lo_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
2265; SSE2-LABEL: insert_mem_lo_v4i32:
2266; SSE2:       # %bb.0:
2267; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2268; SSE2-NEXT:    retq
2269;
2270; SSE3-LABEL: insert_mem_lo_v4i32:
2271; SSE3:       # %bb.0:
2272; SSE3-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2273; SSE3-NEXT:    retq
2274;
2275; SSSE3-LABEL: insert_mem_lo_v4i32:
2276; SSSE3:       # %bb.0:
2277; SSSE3-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2278; SSSE3-NEXT:    retq
2279;
2280; SSE41-LABEL: insert_mem_lo_v4i32:
2281; SSE41:       # %bb.0:
2282; SSE41-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
2283; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2284; SSE41-NEXT:    retq
2285;
2286; AVX-LABEL: insert_mem_lo_v4i32:
2287; AVX:       # %bb.0:
2288; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
2289; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2290; AVX-NEXT:    retq
2291  %a = load <2 x i32>, <2 x i32>* %ptr
2292  %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2293  %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2294  ret <4 x i32> %shuffle
2295}
2296
; Bitcasts the i64 argument to <2 x i32>, widens it to four lanes and places
; it in the high half of the result with mask <4,5,0,1> (low half comes from
; %b). The checks expect a (v)movq from %rdi followed by (v)punpcklqdq.
2297define <4 x i32> @insert_reg_hi_v4i32(i64 %a, <4 x i32> %b) {
2298; SSE-LABEL: insert_reg_hi_v4i32:
2299; SSE:       # %bb.0:
2300; SSE-NEXT:    movq %rdi, %xmm1
2301; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2302; SSE-NEXT:    retq
2303;
2304; AVX-LABEL: insert_reg_hi_v4i32:
2305; AVX:       # %bb.0:
2306; AVX-NEXT:    vmovq %rdi, %xmm1
2307; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2308; AVX-NEXT:    retq
2309  %a.cast = bitcast i64 %a to <2 x i32>
2310  %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2311  %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
2312  ret <4 x i32> %shuffle
2313}
2314
; Loads a <2 x i32>, widens it to four lanes and places it in the high half
; of the result with mask <4,5,0,1> (low half comes from %b). The checks
; expect a separate (v)movsd load followed by (v)movlhps.
2315define <4 x i32> @insert_mem_hi_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
2316; SSE-LABEL: insert_mem_hi_v4i32:
2317; SSE:       # %bb.0:
2318; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
2319; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2320; SSE-NEXT:    retq
2321;
2322; AVX-LABEL: insert_mem_hi_v4i32:
2323; AVX:       # %bb.0:
2324; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
2325; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2326; AVX-NEXT:    retq
2327  %a = load <2 x i32>, <2 x i32>* %ptr
2328  %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2329  %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
2330  ret <4 x i32> %shuffle
2331}
2332
; Bitcasts the double argument to <2 x float>, widens it to four lanes and
; blends it into the low half of %b with mask <0,1,6,7>. The checks expect a
; single shufps before SSE4.1 and a single (v)blendps on SSE4.1 and the AVX
; configurations.
2333define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) {
2334; SSE2-LABEL: insert_reg_lo_v4f32:
2335; SSE2:       # %bb.0:
2336; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2337; SSE2-NEXT:    retq
2338;
2339; SSE3-LABEL: insert_reg_lo_v4f32:
2340; SSE3:       # %bb.0:
2341; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2342; SSE3-NEXT:    retq
2343;
2344; SSSE3-LABEL: insert_reg_lo_v4f32:
2345; SSSE3:       # %bb.0:
2346; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2347; SSSE3-NEXT:    retq
2348;
2349; SSE41-LABEL: insert_reg_lo_v4f32:
2350; SSE41:       # %bb.0:
2351; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2352; SSE41-NEXT:    retq
2353;
2354; AVX-LABEL: insert_reg_lo_v4f32:
2355; AVX:       # %bb.0:
2356; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2357; AVX-NEXT:    retq
2358  %a.cast = bitcast double %a to <2 x float>
2359  %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2360  %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2361  ret <4 x float> %shuffle
2362}
2363
; Loads a <2 x float>, widens it to four lanes and blends it into the low half
; of %b with mask <0,1,6,7>. The checks expect the load to fold into a single
; memory-operand (v)movlps on every configuration.
2364define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
2365; SSE-LABEL: insert_mem_lo_v4f32:
2366; SSE:       # %bb.0:
2367; SSE-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2368; SSE-NEXT:    retq
2369;
2370; AVX-LABEL: insert_mem_lo_v4f32:
2371; AVX:       # %bb.0:
2372; AVX-NEXT:    vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2373; AVX-NEXT:    retq
2374  %a = load <2 x float>, <2 x float>* %ptr
2375  %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2376  %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2377  ret <4 x float> %shuffle
2378}
2379
; Bitcasts the double argument to <2 x float>, widens it to four lanes and
; places it in the high half of the result with mask <4,5,0,1> (low half comes
; from %b). The checks expect a single (v)movlhps with %b as first operand.
2380define <4 x float> @insert_reg_hi_v4f32(double %a, <4 x float> %b) {
2381; SSE-LABEL: insert_reg_hi_v4f32:
2382; SSE:       # %bb.0:
2383; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2384; SSE-NEXT:    movaps %xmm1, %xmm0
2385; SSE-NEXT:    retq
2386;
2387; AVX-LABEL: insert_reg_hi_v4f32:
2388; AVX:       # %bb.0:
2389; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2390; AVX-NEXT:    retq
2391  %a.cast = bitcast double %a to <2 x float>
2392  %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2393  %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
2394  ret <4 x float> %shuffle
2395}
2396
; Loads a <2 x float>, widens it to four lanes and places it in the high half
; of the result with mask <4,5,0,1> (low half comes from %b). The checks
; expect the load to fold into a single memory-operand (v)movhps.
2397define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
2398; SSE-LABEL: insert_mem_hi_v4f32:
2399; SSE:       # %bb.0:
2400; SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
2401; SSE-NEXT:    retq
2402;
2403; AVX-LABEL: insert_mem_hi_v4f32:
2404; AVX:       # %bb.0:
2405; AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
2406; AVX-NEXT:    retq
2407  %a = load <2 x float>, <2 x float>* %ptr
2408  %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2409  %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
2410  ret <4 x float> %shuffle
2411}
2412
2413; PR21137
; Loads a <4 x float> and reverses its lanes (mask <3,2,1,0>). The checks
; expect a movaps load plus shufps on the SSE configurations and a single
; vpermilps with a memory operand on the AVX configurations.
2414define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) {
2415; SSE-LABEL: shuffle_mem_v4f32_3210:
2416; SSE:       # %bb.0:
2417; SSE-NEXT:    movaps (%rdi), %xmm0
2418; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
2419; SSE-NEXT:    retq
2420;
2421; AVX-LABEL: shuffle_mem_v4f32_3210:
2422; AVX:       # %bb.0:
2423; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
2424; AVX-NEXT:    retq
2425  %a = load <4 x float>, <4 x float>* %ptr
2426  %shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
2427  ret <4 x float> %shuffle
2428}
2429
; Loads an i32, inserts it into lane 0 of a zero vector and splats lane 0 to
; all four lanes (zeroinitializer mask). The checks expect movd + pshufd on
; the SSE configurations and a single vbroadcastss from memory on the AVX
; configurations.
2430define <4 x i32> @insert_dup_mem_v4i32(i32* %ptr) {
2431; SSE-LABEL: insert_dup_mem_v4i32:
2432; SSE:       # %bb.0:
2433; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2434; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2435; SSE-NEXT:    retq
2436;
2437; AVX-LABEL: insert_dup_mem_v4i32:
2438; AVX:       # %bb.0:
2439; AVX-NEXT:    vbroadcastss (%rdi), %xmm0
2440; AVX-NEXT:    retq
2441  %tmp = load i32, i32* %ptr, align 4
2442  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
2443  %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
2444  ret <4 x i32> %tmp2
2445}
2446
2447; PR41249
; One <2 x float> load feeds two different shuffles: the value stored to %p1
; interleaves the two loaded lanes with 0.0 (x0,0,x1,0), and the returned
; value is a splat of loaded lane 0. The checks expect a single (v)movsd load
; shared by an unpcklps against a zeroed register and a splat of lane 0.
2448define <4 x float> @shuffle_mem_pmovzx_v4f32(<2 x float>* %p0, <4 x float>* %p1) {
2449; SSE-LABEL: shuffle_mem_pmovzx_v4f32:
2450; SSE:       # %bb.0:
2451; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
2452; SSE-NEXT:    xorps %xmm1, %xmm1
2453; SSE-NEXT:    movaps %xmm0, %xmm2
2454; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2455; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
2456; SSE-NEXT:    movaps %xmm2, (%rsi)
2457; SSE-NEXT:    retq
2458;
2459; AVX1-LABEL: shuffle_mem_pmovzx_v4f32:
2460; AVX1:       # %bb.0:
2461; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
2462; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
2463; AVX1-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2464; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
2465; AVX1-NEXT:    vmovaps %xmm1, (%rsi)
2466; AVX1-NEXT:    retq
2467;
2468; AVX2OR512VL-LABEL: shuffle_mem_pmovzx_v4f32:
2469; AVX2OR512VL:       # %bb.0:
2470; AVX2OR512VL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
2471; AVX2OR512VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
2472; AVX2OR512VL-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2473; AVX2OR512VL-NEXT:    vbroadcastss %xmm0, %xmm0
2474; AVX2OR512VL-NEXT:    vmovaps %xmm1, (%rsi)
2475; AVX2OR512VL-NEXT:    retq
2476  %1 = load <2 x float>, <2 x float>* %p0
2477  %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
2478  %3 = shufflevector <4 x float> %2, <4 x float> <float undef, float undef, float 0.000000e+00, float 0.000000e+00>, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
2479  %4 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> zeroinitializer
2480  store <4 x float> %3, <4 x float>* %p1
2481  ret <4 x float> %4
2482}
2483
2484;
2485; Shuffle to logical bit shifts
2486;
2487
; Shuffles %a with a zero vector using mask <4, 0, 4, undef>, i.e. the result
; is <0, a[0], 0, undef>. The expected code for every run line is a single
; 64-bit-lane left shift by 32 bits (psllq / vpsllq).
2488define <4 x i32> @shuffle_v4i32_z0zX(<4 x i32> %a) {
2489; SSE-LABEL: shuffle_v4i32_z0zX:
2490; SSE:       # %bb.0:
2491; SSE-NEXT:    psllq $32, %xmm0
2492; SSE-NEXT:    retq
2493;
2494; AVX-LABEL: shuffle_v4i32_z0zX:
2495; AVX:       # %bb.0:
2496; AVX-NEXT:    vpsllq $32, %xmm0, %xmm0
2497; AVX-NEXT:    retq
  ; Index 4 selects lane 0 of the zeroinitializer operand; lane 3 is undef.
2498  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 0, i32 4, i32 undef>
2499  ret <4 x i32> %shuffle
2500}
2501
; Shuffles %a with a zero vector using mask <1, 4, 3, 4>, i.e. the result is
; <a[1], 0, a[3], 0>. The expected code for every run line is a single
; 64-bit-lane logical right shift by 32 bits (psrlq / vpsrlq).
2502define <4 x i32> @shuffle_v4i32_1z3z(<4 x i32> %a) {
2503; SSE-LABEL: shuffle_v4i32_1z3z:
2504; SSE:       # %bb.0:
2505; SSE-NEXT:    psrlq $32, %xmm0
2506; SSE-NEXT:    retq
2507;
2508; AVX-LABEL: shuffle_v4i32_1z3z:
2509; AVX:       # %bb.0:
2510; AVX-NEXT:    vpsrlq $32, %xmm0, %xmm0
2511; AVX-NEXT:    retq
  ; Index 4 selects lane 0 of the zeroinitializer operand.
2512  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
2513  ret <4 x i32> %shuffle
2514}
2515
; Combines the low two lanes of %a with the low two lanes of a vector loaded
; from %pb (align 1): result is <a[0], a[1], b[0], b[1]>. The expected code
; uses the load directly as a memory operand (movhps for the SSE run lines,
; vunpcklpd for the AVX run lines).
2516define <4 x float> @shuffle_mem_v4f32_0145(<4 x float> %a, <4 x float>* %pb) {
2517; SSE-LABEL: shuffle_mem_v4f32_0145:
2518; SSE:       # %bb.0:
2519; SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
2520; SSE-NEXT:    retq
2521;
2522; AVX-LABEL: shuffle_mem_v4f32_0145:
2523; AVX:       # %bb.0:
2524; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
2525; AVX-NEXT:    retq
  ; The load is explicitly under-aligned (align 1).
2526  %b = load <4 x float>, <4 x float>* %pb, align 1
  ; Indices 0,1 select %a; indices 4,5 select lanes 0,1 of %b.
2527  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
2528  ret <4 x float> %shuffle
2529}
2530
; Replaces the low two lanes of %a with the low two lanes of a vector loaded
; from %pb (align 1): result is <b[0], b[1], a[2], a[3]>. The expected code
; differs per run line: movlps with a memory operand for SSE2/SSE3/SSSE3, a
; separate movups followed by blendps for SSE4.1, and vblendps with a memory
; operand for the AVX run lines.
2531define <4 x float> @shuffle_mem_v4f32_4523(<4 x float> %a, <4 x float>* %pb) {
2532; SSE2-LABEL: shuffle_mem_v4f32_4523:
2533; SSE2:       # %bb.0:
2534; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2535; SSE2-NEXT:    retq
2536;
2537; SSE3-LABEL: shuffle_mem_v4f32_4523:
2538; SSE3:       # %bb.0:
2539; SSE3-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2540; SSE3-NEXT:    retq
2541;
2542; SSSE3-LABEL: shuffle_mem_v4f32_4523:
2543; SSSE3:       # %bb.0:
2544; SSSE3-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2545; SSSE3-NEXT:    retq
2546;
2547; SSE41-LABEL: shuffle_mem_v4f32_4523:
2548; SSE41:       # %bb.0:
2549; SSE41-NEXT:    movups (%rdi), %xmm1
2550; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2551; SSE41-NEXT:    retq
2552;
2553; AVX-LABEL: shuffle_mem_v4f32_4523:
2554; AVX:       # %bb.0:
2555; AVX-NEXT:    vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2556; AVX-NEXT:    retq
  ; The load is explicitly under-aligned (align 1).
2557  %b = load <4 x float>, <4 x float>* %pb, align 1
  ; Indices 4,5 select lanes 0,1 of %b; indices 2,3 select %a.
2558  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
2559  ret <4 x float> %shuffle
2560}
2561
; Two-input shuffle whose FIRST operand is the vector loaded from %a1 and whose
; second operand is %a0, with mask <0, 6, 2, 4>: the result is
; <m[0], a0[2], m[2], a0[0]> where m is the loaded vector. The expected code is
; a two-shuffle sequence for the SSE and AVX1/AVX2 run lines, and a single
; vpermi2ps with a constant index vector for the AVX512VL run line.
2562define  <4 x float> @shuffle_mem_v4f32_0624(<4 x float> %a0, <4 x float>* %a1) {
2563; SSE-LABEL: shuffle_mem_v4f32_0624:
2564; SSE:       # %bb.0:
2565; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],mem[0,2]
2566; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0,3,1]
2567; SSE-NEXT:    retq
2568;
2569; AVX1OR2-LABEL: shuffle_mem_v4f32_0624:
2570; AVX1OR2:       # %bb.0:
2571; AVX1OR2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0],mem[0,2]
2572; AVX1OR2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1]
2573; AVX1OR2-NEXT:    retq
2574;
2575; AVX512VL-LABEL: shuffle_mem_v4f32_0624:
2576; AVX512VL:       # %bb.0:
2577; AVX512VL-NEXT:    vmovaps (%rdi), %xmm2
2578; AVX512VL-NEXT:    vmovaps {{.*#+}} xmm1 = [0,6,2,4]
2579; AVX512VL-NEXT:    vpermi2ps %xmm0, %xmm2, %xmm1
2580; AVX512VL-NEXT:    vmovaps %xmm1, %xmm0
2581; AVX512VL-NEXT:    retq
  ; No explicit alignment is given on this load (unlike the align-1 loads in
  ; the 0145/4523 tests).
2582  %1 = load <4 x float>, <4 x float>* %a1
  ; Indices 0-3 select the loaded vector %1; indices 4-7 select %a0.
2583  %2 = shufflevector <4 x float> %1, <4 x float> %a0, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
2584  ret <4 x float> %2
2585}
2586
; Two-input shuffle whose FIRST operand is the vector loaded from %a1 and whose
; second operand is %a0, with mask <4, 7, 6, 0>: the result is
; <a0[0], a0[3], a0[2], m[0]> where m is the loaded vector. The expected code is
; a two-shufps sequence for the SSE and AVX1/AVX2 run lines, and a single
; vpermt2ps with a memory operand for the AVX512VL run line. The index constant
; expected there, [0,3,2,4], is the IR mask with the two sources swapped.
2587define  <4 x float> @shuffle_mem_v4f32_4760(<4 x float> %a0, <4 x float>* %a1) {
2588; SSE-LABEL: shuffle_mem_v4f32_4760:
2589; SSE:       # %bb.0:
2590; SSE-NEXT:    movaps %xmm0, %xmm1
2591; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],mem[0,0]
2592; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,2]
2593; SSE-NEXT:    retq
2594;
2595; AVX1OR2-LABEL: shuffle_mem_v4f32_4760:
2596; AVX1OR2:       # %bb.0:
2597; AVX1OR2-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,0],mem[0,0]
2598; AVX1OR2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,2]
2599; AVX1OR2-NEXT:    retq
2600;
2601; AVX512VL-LABEL: shuffle_mem_v4f32_4760:
2602; AVX512VL:       # %bb.0:
2603; AVX512VL-NEXT:    vmovaps {{.*#+}} xmm1 = [0,3,2,4]
2604; AVX512VL-NEXT:    vpermt2ps (%rdi), %xmm1, %xmm0
2605; AVX512VL-NEXT:    retq
  ; No explicit alignment is given on this load.
2606  %1 = load <4 x float>, <4 x float>* %a1
  ; Indices 0-3 select the loaded vector %1; indices 4-7 select %a0.
2607  %2 = shufflevector <4 x float> %1, <4 x float> %a0, <4 x i32> <i32 4, i32 7, i32 6, i32 0>
2608  ret <4 x float> %2
2609}
2610