1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX
7;
8; 32-bit SSE tests to make sure we do reasonable things.
9; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=X86-SSE,X86-SSE1
10; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=X86-SSE,X86-SSE41
11
; Builds a <2 x double> from the doubles at %ptr[2] and %ptr[3] (lanes 0 and 1).
; The x86-64 SSE/AVX runs and the i686 SSE4.1 run expect the two scalar loads
; to be merged into one unaligned 16-byte load at byte offset 16. The i686
; run with only SSE1 expects two x87 fldl loads (offsets 16 and 24) instead.
12define <2 x double> @merge_2f64_f64_23(double* %ptr) nounwind uwtable noinline ssp {
13; SSE-LABEL: merge_2f64_f64_23:
14; SSE:       # %bb.0:
15; SSE-NEXT:    movups 16(%rdi), %xmm0
16; SSE-NEXT:    retq
17;
18; AVX-LABEL: merge_2f64_f64_23:
19; AVX:       # %bb.0:
20; AVX-NEXT:    vmovups 16(%rdi), %xmm0
21; AVX-NEXT:    retq
22;
23; X86-SSE1-LABEL: merge_2f64_f64_23:
24; X86-SSE1:       # %bb.0:
25; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
26; X86-SSE1-NEXT:    fldl 16(%eax)
27; X86-SSE1-NEXT:    fldl 24(%eax)
28; X86-SSE1-NEXT:    fxch %st(1)
29; X86-SSE1-NEXT:    retl
30;
31; X86-SSE41-LABEL: merge_2f64_f64_23:
32; X86-SSE41:       # %bb.0:
33; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
34; X86-SSE41-NEXT:    movups 16(%eax), %xmm0
35; X86-SSE41-NEXT:    retl
36  %ptr0 = getelementptr inbounds double, double* %ptr, i64 2
37  %ptr1 = getelementptr inbounds double, double* %ptr, i64 3
38  %val0 = load double, double* %ptr0
39  %val1 = load double, double* %ptr1
40  %res0 = insertelement <2 x double> undef, double %val0, i32 0
41  %res1 = insertelement <2 x double> %res0, double %val1, i32 1
42  ret <2 x double> %res1
43}
44
; Builds a <2 x i64> from the i64 values at %ptr[1] and %ptr[2] (lanes 0 and 1).
; The x86-64 SSE/AVX runs and the i686 SSE4.1 run expect one unaligned 16-byte
; load at byte offset 8. The i686 run with only SSE1 expects the 16 bytes to be
; copied as four 32-bit words through general-purpose registers into the
; memory addressed by %eax, returning with "retl $4".
45define <2 x i64> @merge_2i64_i64_12(i64* %ptr) nounwind uwtable noinline ssp {
46; SSE-LABEL: merge_2i64_i64_12:
47; SSE:       # %bb.0:
48; SSE-NEXT:    movups 8(%rdi), %xmm0
49; SSE-NEXT:    retq
50;
51; AVX-LABEL: merge_2i64_i64_12:
52; AVX:       # %bb.0:
53; AVX-NEXT:    vmovups 8(%rdi), %xmm0
54; AVX-NEXT:    retq
55;
56; X86-SSE1-LABEL: merge_2i64_i64_12:
57; X86-SSE1:       # %bb.0:
58; X86-SSE1-NEXT:    pushl %edi
59; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
60; X86-SSE1-NEXT:    pushl %esi
61; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
62; X86-SSE1-NEXT:    .cfi_offset %esi, -12
63; X86-SSE1-NEXT:    .cfi_offset %edi, -8
64; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
65; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
66; X86-SSE1-NEXT:    movl 8(%ecx), %edx
67; X86-SSE1-NEXT:    movl 12(%ecx), %esi
68; X86-SSE1-NEXT:    movl 16(%ecx), %edi
69; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
70; X86-SSE1-NEXT:    movl %ecx, 12(%eax)
71; X86-SSE1-NEXT:    movl %edi, 8(%eax)
72; X86-SSE1-NEXT:    movl %esi, 4(%eax)
73; X86-SSE1-NEXT:    movl %edx, (%eax)
74; X86-SSE1-NEXT:    popl %esi
75; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
76; X86-SSE1-NEXT:    popl %edi
77; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
78; X86-SSE1-NEXT:    retl $4
79;
80; X86-SSE41-LABEL: merge_2i64_i64_12:
81; X86-SSE41:       # %bb.0:
82; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
83; X86-SSE41-NEXT:    movups 8(%eax), %xmm0
84; X86-SSE41-NEXT:    retl
85  %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
86  %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 2
87  %val0 = load i64, i64* %ptr0
88  %val1 = load i64, i64* %ptr1
89  %res0 = insertelement <2 x i64> undef, i64 %val0, i32 0
90  %res1 = insertelement <2 x i64> %res0, i64 %val1, i32 1
91  ret <2 x i64> %res1
92}
93
; Builds a <4 x float> from the four consecutive floats %ptr[2..5], inserted
; in order into lanes 0..3. Every run (x86-64 SSE/AVX and both i686 SSE runs)
; expects the four scalar loads to be merged into a single unaligned 16-byte
; load at byte offset 8.
94define <4 x float> @merge_4f32_f32_2345(float* %ptr) nounwind uwtable noinline ssp {
95; SSE-LABEL: merge_4f32_f32_2345:
96; SSE:       # %bb.0:
97; SSE-NEXT:    movups 8(%rdi), %xmm0
98; SSE-NEXT:    retq
99;
100; AVX-LABEL: merge_4f32_f32_2345:
101; AVX:       # %bb.0:
102; AVX-NEXT:    vmovups 8(%rdi), %xmm0
103; AVX-NEXT:    retq
104;
105; X86-SSE-LABEL: merge_4f32_f32_2345:
106; X86-SSE:       # %bb.0:
107; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
108; X86-SSE-NEXT:    movups 8(%eax), %xmm0
109; X86-SSE-NEXT:    retl
110  %ptr0 = getelementptr inbounds float, float* %ptr, i64 2
111  %ptr1 = getelementptr inbounds float, float* %ptr, i64 3
112  %ptr2 = getelementptr inbounds float, float* %ptr, i64 4
113  %ptr3 = getelementptr inbounds float, float* %ptr, i64 5
114  %val0 = load float, float* %ptr0
115  %val1 = load float, float* %ptr1
116  %val2 = load float, float* %ptr2
117  %val3 = load float, float* %ptr3
118  %res0 = insertelement <4 x float> undef, float %val0, i32 0
119  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
120  %res2 = insertelement <4 x float> %res1, float %val2, i32 2
121  %res3 = insertelement <4 x float> %res2, float %val3, i32 3
122  ret <4 x float> %res3
123}
124
; Builds a <4 x float> with %ptr[3] in lane 0 and the constant 0.0 in lane 1;
; lanes 2 and 3 are left undef. Every run expects a single scalar movss load
; whose printed shuffle comment shows the upper three lanes as zero.
125define <4 x float> @merge_4f32_f32_3zuu(float* %ptr) nounwind uwtable noinline ssp {
126; SSE-LABEL: merge_4f32_f32_3zuu:
127; SSE:       # %bb.0:
128; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
129; SSE-NEXT:    retq
130;
131; AVX-LABEL: merge_4f32_f32_3zuu:
132; AVX:       # %bb.0:
133; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
134; AVX-NEXT:    retq
135;
136; X86-SSE-LABEL: merge_4f32_f32_3zuu:
137; X86-SSE:       # %bb.0:
138; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
139; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
140; X86-SSE-NEXT:    retl
141  %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
142  %val0 = load float, float* %ptr0
143  %res0 = insertelement <4 x float> undef, float %val0, i32 0
144  %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
145  ret <4 x float> %res1
146}
147
; Builds a <4 x float> with %ptr[3] and %ptr[4] in lanes 0 and 1; lanes 2 and 3
; are left undef. The x86-64 SSE/AVX runs and the i686 SSE4.1 run expect the
; pair to be merged into one movsd load. The i686 run with only SSE1 expects
; xorps followed by movlps into the low half of the zeroed register.
148define <4 x float> @merge_4f32_f32_34uu(float* %ptr) nounwind uwtable noinline ssp {
149; SSE-LABEL: merge_4f32_f32_34uu:
150; SSE:       # %bb.0:
151; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
152; SSE-NEXT:    retq
153;
154; AVX-LABEL: merge_4f32_f32_34uu:
155; AVX:       # %bb.0:
156; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
157; AVX-NEXT:    retq
158;
159; X86-SSE1-LABEL: merge_4f32_f32_34uu:
160; X86-SSE1:       # %bb.0:
161; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
162; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
163; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
164; X86-SSE1-NEXT:    retl
165;
166; X86-SSE41-LABEL: merge_4f32_f32_34uu:
167; X86-SSE41:       # %bb.0:
168; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
169; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
170; X86-SSE41-NEXT:    retl
171  %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
172  %ptr1 = getelementptr inbounds float, float* %ptr, i64 4
173  %val0 = load float, float* %ptr0
174  %val1 = load float, float* %ptr1
175  %res0 = insertelement <4 x float> undef, float %val0, i32 0
176  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
177  ret <4 x float> %res1
178}
179
; Builds a <4 x float> on top of zeroinitializer with %ptr[3], %ptr[4] and
; %ptr[6] in lanes 0, 1 and 3, so lane 2 stays zero. The SSE2 and i686 SSE1
; runs expect a 16-byte movups at byte offset 12 followed by xorps and two
; shufps to re-insert the zero lane; the SSE4.1 runs expect movups + xorps +
; blendps; the AVX runs expect vxorps and a vblendps that reads lanes 0, 1 and
; 3 straight from memory.
180define <4 x float> @merge_4f32_f32_34z6(float* %ptr) nounwind uwtable noinline ssp {
181; SSE2-LABEL: merge_4f32_f32_34z6:
182; SSE2:       # %bb.0:
183; SSE2-NEXT:    movups 12(%rdi), %xmm0
184; SSE2-NEXT:    xorps %xmm1, %xmm1
185; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
186; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
187; SSE2-NEXT:    retq
188;
189; SSE41-LABEL: merge_4f32_f32_34z6:
190; SSE41:       # %bb.0:
191; SSE41-NEXT:    movups 12(%rdi), %xmm1
192; SSE41-NEXT:    xorps %xmm0, %xmm0
193; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
194; SSE41-NEXT:    retq
195;
196; AVX-LABEL: merge_4f32_f32_34z6:
197; AVX:       # %bb.0:
198; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
199; AVX-NEXT:    vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2],mem[3]
200; AVX-NEXT:    retq
201;
202; X86-SSE1-LABEL: merge_4f32_f32_34z6:
203; X86-SSE1:       # %bb.0:
204; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
205; X86-SSE1-NEXT:    movups 12(%eax), %xmm0
206; X86-SSE1-NEXT:    xorps %xmm1, %xmm1
207; X86-SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
208; X86-SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
209; X86-SSE1-NEXT:    retl
210;
211; X86-SSE41-LABEL: merge_4f32_f32_34z6:
212; X86-SSE41:       # %bb.0:
213; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
214; X86-SSE41-NEXT:    movups 12(%eax), %xmm1
215; X86-SSE41-NEXT:    xorps %xmm0, %xmm0
216; X86-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
217; X86-SSE41-NEXT:    retl
218  %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
219  %ptr1 = getelementptr inbounds float, float* %ptr, i64 4
220  %ptr3 = getelementptr inbounds float, float* %ptr, i64 6
221  %val0 = load float, float* %ptr0
222  %val1 = load float, float* %ptr1
223  %val3 = load float, float* %ptr3
224  %res0 = insertelement <4 x float> zeroinitializer, float %val0, i32 0
225  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
226  %res3 = insertelement <4 x float> %res1, float %val3, i32 3
227  ret <4 x float> %res3
228}
229
; Builds a <4 x float> on top of zeroinitializer with %ptr[4] and %ptr[5] in
; lanes 0 and 1, so lanes 2 and 3 stay zero. The x86-64 SSE/AVX runs and the
; i686 SSE4.1 run expect a single movsd load (upper half shown as zero). The
; i686 run with only SSE1 expects xorps followed by movlps.
230define <4 x float> @merge_4f32_f32_45zz(float* %ptr) nounwind uwtable noinline ssp {
231; SSE-LABEL: merge_4f32_f32_45zz:
232; SSE:       # %bb.0:
233; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
234; SSE-NEXT:    retq
235;
236; AVX-LABEL: merge_4f32_f32_45zz:
237; AVX:       # %bb.0:
238; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
239; AVX-NEXT:    retq
240;
241; X86-SSE1-LABEL: merge_4f32_f32_45zz:
242; X86-SSE1:       # %bb.0:
243; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
244; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
245; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
246; X86-SSE1-NEXT:    retl
247;
248; X86-SSE41-LABEL: merge_4f32_f32_45zz:
249; X86-SSE41:       # %bb.0:
250; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
251; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
252; X86-SSE41-NEXT:    retl
253  %ptr0 = getelementptr inbounds float, float* %ptr, i64 4
254  %ptr1 = getelementptr inbounds float, float* %ptr, i64 5
255  %val0 = load float, float* %ptr0
256  %val1 = load float, float* %ptr1
257  %res0 = insertelement <4 x float> zeroinitializer, float %val0, i32 0
258  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
259  ret <4 x float> %res1
260}
261
; Builds a <4 x float> from %ptr[0], %ptr[1] and %ptr[2] in lanes 0..2; lane 3
; is explicitly set to undef. The checks do not expect a single 16-byte load:
; elements 0 and 1 are loaded together (movsd, or xorps + movlps on i686 SSE1)
; and element 2 is added separately (movss + movlhps without SSE4.1, insertps
; from memory with SSE4.1/AVX).
; NOTE(review): presumably the full-width load is avoided because the IR never
; loads %ptr[3] - confirm against the load-merging code.
262define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline ssp {
263; SSE2-LABEL: merge_4f32_f32_012u:
264; SSE2:       # %bb.0:
265; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
266; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
267; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
268; SSE2-NEXT:    retq
269;
270; SSE41-LABEL: merge_4f32_f32_012u:
271; SSE41:       # %bb.0:
272; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
273; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
274; SSE41-NEXT:    retq
275;
276; AVX-LABEL: merge_4f32_f32_012u:
277; AVX:       # %bb.0:
278; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
279; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
280; AVX-NEXT:    retq
281;
282; X86-SSE1-LABEL: merge_4f32_f32_012u:
283; X86-SSE1:       # %bb.0:
284; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
285; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
286; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
287; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
288; X86-SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
289; X86-SSE1-NEXT:    retl
290;
291; X86-SSE41-LABEL: merge_4f32_f32_012u:
292; X86-SSE41:       # %bb.0:
293; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
294; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
295; X86-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
296; X86-SSE41-NEXT:    retl
297  %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
298  %ptr1 = getelementptr inbounds float, float* %ptr, i64 1
299  %ptr2 = getelementptr inbounds float, float* %ptr, i64 2
300  %val0 = load float, float* %ptr0
301  %val1 = load float, float* %ptr1
302  %val2 = load float, float* %ptr2
303  %res0 = insertelement <4 x float> undef, float %val0, i32 0
304  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
305  %res2 = insertelement <4 x float> %res1, float %val2, i32 2
306  %res3 = insertelement <4 x float> %res2, float undef, i32 3
307  ret <4 x float> %res3
308}
309
; Builds a <4 x float> from %ptr[0], %ptr[1] and %ptr[9] in lanes 0..2; lane 3
; is explicitly set to undef. Element 9 is not adjacent to elements 0 and 1,
; so the checks expect only the first pair to be loaded together (movsd, or
; xorps + movlps on i686 SSE1) with the third element added by a separate
; scalar load (movss + movlhps without SSE4.1, insertps from memory with
; SSE4.1/AVX).
310define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline ssp {
311; SSE2-LABEL: merge_4f32_f32_019u:
312; SSE2:       # %bb.0:
313; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
314; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
315; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
316; SSE2-NEXT:    retq
317;
318; SSE41-LABEL: merge_4f32_f32_019u:
319; SSE41:       # %bb.0:
320; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
321; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
322; SSE41-NEXT:    retq
323;
324; AVX-LABEL: merge_4f32_f32_019u:
325; AVX:       # %bb.0:
326; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
327; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
328; AVX-NEXT:    retq
329;
330; X86-SSE1-LABEL: merge_4f32_f32_019u:
331; X86-SSE1:       # %bb.0:
332; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
333; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
334; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
335; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
336; X86-SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
337; X86-SSE1-NEXT:    retl
338;
339; X86-SSE41-LABEL: merge_4f32_f32_019u:
340; X86-SSE41:       # %bb.0:
341; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
342; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
343; X86-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
344; X86-SSE41-NEXT:    retl
345  %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
346  %ptr1 = getelementptr inbounds float, float* %ptr, i64 1
347  %ptr2 = getelementptr inbounds float, float* %ptr, i64 9
348  %val0 = load float, float* %ptr0
349  %val1 = load float, float* %ptr1
350  %val2 = load float, float* %ptr2
351  %res0 = insertelement <4 x float> undef, float %val0, i32 0
352  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
353  %res2 = insertelement <4 x float> %res1, float %val2, i32 2
354  %res3 = insertelement <4 x float> %res2, float undef, i32 3
355  ret <4 x float> %res3
356}
357
; Builds a <4 x i32> from %ptr[2], %ptr[3] and %ptr[5] in lanes 0, 1 and 3;
; lane 2 is never written and stays undef. The x86-64 SSE/AVX runs and the
; i686 SSE4.1 run expect one unaligned 16-byte load at byte offset 8 (which
; also covers the undef lane). The i686 run with only SSE1 expects the three
; words to be copied through general-purpose registers to offsets 0, 4 and 12
; of the memory addressed by %eax, returning with "retl $4".
358define <4 x i32> @merge_4i32_i32_23u5(i32* %ptr) nounwind uwtable noinline ssp {
359; SSE-LABEL: merge_4i32_i32_23u5:
360; SSE:       # %bb.0:
361; SSE-NEXT:    movups 8(%rdi), %xmm0
362; SSE-NEXT:    retq
363;
364; AVX-LABEL: merge_4i32_i32_23u5:
365; AVX:       # %bb.0:
366; AVX-NEXT:    vmovups 8(%rdi), %xmm0
367; AVX-NEXT:    retq
368;
369; X86-SSE1-LABEL: merge_4i32_i32_23u5:
370; X86-SSE1:       # %bb.0:
371; X86-SSE1-NEXT:    pushl %esi
372; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
373; X86-SSE1-NEXT:    .cfi_offset %esi, -8
374; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
375; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
376; X86-SSE1-NEXT:    movl 8(%ecx), %edx
377; X86-SSE1-NEXT:    movl 12(%ecx), %esi
378; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
379; X86-SSE1-NEXT:    movl %esi, 4(%eax)
380; X86-SSE1-NEXT:    movl %edx, (%eax)
381; X86-SSE1-NEXT:    movl %ecx, 12(%eax)
382; X86-SSE1-NEXT:    popl %esi
383; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
384; X86-SSE1-NEXT:    retl $4
385;
386; X86-SSE41-LABEL: merge_4i32_i32_23u5:
387; X86-SSE41:       # %bb.0:
388; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
389; X86-SSE41-NEXT:    movups 8(%eax), %xmm0
390; X86-SSE41-NEXT:    retl
391  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
392  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
393  %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
394  %val0 = load i32, i32* %ptr0
395  %val1 = load i32, i32* %ptr1
396  %val3 = load i32, i32* %ptr3
397  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
398  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
399  %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
400  ret <4 x i32> %res3
401}
402
; Same lane layout as merge_4i32_i32_23u5 (%ptr[2], %ptr[3], %ptr[5] into
; lanes 0, 1 and 3), but the value loaded from %ptr[2] is incremented and
; stored back before the remaining two loads. The vector runs expect the
; merged 16-byte load at byte offset 8 to be emitted first, followed by an
; in-memory "incl" at byte offset 8. The i686 run with only SSE1 expects a
; scalar copy with the increment done via leal + movl.
403define <4 x i32> @merge_4i32_i32_23u5_inc2(i32* %ptr) nounwind uwtable noinline ssp {
404; SSE-LABEL: merge_4i32_i32_23u5_inc2:
405; SSE:       # %bb.0:
406; SSE-NEXT:    movups 8(%rdi), %xmm0
407; SSE-NEXT:    incl 8(%rdi)
408; SSE-NEXT:    retq
409;
410; AVX-LABEL: merge_4i32_i32_23u5_inc2:
411; AVX:       # %bb.0:
412; AVX-NEXT:    vmovups 8(%rdi), %xmm0
413; AVX-NEXT:    incl 8(%rdi)
414; AVX-NEXT:    retq
415;
416; X86-SSE1-LABEL: merge_4i32_i32_23u5_inc2:
417; X86-SSE1:       # %bb.0:
418; X86-SSE1-NEXT:    pushl %edi
419; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
420; X86-SSE1-NEXT:    pushl %esi
421; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
422; X86-SSE1-NEXT:    .cfi_offset %esi, -12
423; X86-SSE1-NEXT:    .cfi_offset %edi, -8
424; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
425; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
426; X86-SSE1-NEXT:    movl 8(%ecx), %edx
427; X86-SSE1-NEXT:    movl 12(%ecx), %esi
428; X86-SSE1-NEXT:    leal 1(%edx), %edi
429; X86-SSE1-NEXT:    movl %edi, 8(%ecx)
430; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
431; X86-SSE1-NEXT:    movl %esi, 4(%eax)
432; X86-SSE1-NEXT:    movl %edx, (%eax)
433; X86-SSE1-NEXT:    movl %ecx, 12(%eax)
434; X86-SSE1-NEXT:    popl %esi
435; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
436; X86-SSE1-NEXT:    popl %edi
437; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
438; X86-SSE1-NEXT:    retl $4
439;
440; X86-SSE41-LABEL: merge_4i32_i32_23u5_inc2:
441; X86-SSE41:       # %bb.0:
442; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
443; X86-SSE41-NEXT:    movups 8(%eax), %xmm0
444; X86-SSE41-NEXT:    incl 8(%eax)
445; X86-SSE41-NEXT:    retl
446  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
447  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
448  %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
449  %val0 = load i32, i32* %ptr0
450  %inc = add i32 %val0, 1
; The store to %ptr[2] sits between the first load and the other two loads;
; the vector uses the pre-increment %val0.
451  store i32 %inc, i32* %ptr0
452  %val1 = load i32, i32* %ptr1
453  %val3 = load i32, i32* %ptr3
454  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
455  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
456  %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
457  ret <4 x i32> %res3
458}
459
; Same lane layout as merge_4i32_i32_23u5 (%ptr[2], %ptr[3], %ptr[5] into
; lanes 0, 1 and 3), but the value loaded from %ptr[3] is incremented and
; stored back before the load of %ptr[5]. The vector runs expect the merged
; 16-byte load at byte offset 8 first, followed by an in-memory "incl" at byte
; offset 12. The i686 run with only SSE1 expects a scalar copy with the
; increment done via leal + movl.
460define <4 x i32> @merge_4i32_i32_23u5_inc3(i32* %ptr) nounwind uwtable noinline ssp {
461; SSE-LABEL: merge_4i32_i32_23u5_inc3:
462; SSE:       # %bb.0:
463; SSE-NEXT:    movups 8(%rdi), %xmm0
464; SSE-NEXT:    incl 12(%rdi)
465; SSE-NEXT:    retq
466;
467; AVX-LABEL: merge_4i32_i32_23u5_inc3:
468; AVX:       # %bb.0:
469; AVX-NEXT:    vmovups 8(%rdi), %xmm0
470; AVX-NEXT:    incl 12(%rdi)
471; AVX-NEXT:    retq
472;
473; X86-SSE1-LABEL: merge_4i32_i32_23u5_inc3:
474; X86-SSE1:       # %bb.0:
475; X86-SSE1-NEXT:    pushl %edi
476; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
477; X86-SSE1-NEXT:    pushl %esi
478; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
479; X86-SSE1-NEXT:    .cfi_offset %esi, -12
480; X86-SSE1-NEXT:    .cfi_offset %edi, -8
481; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
482; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
483; X86-SSE1-NEXT:    movl 8(%ecx), %edx
484; X86-SSE1-NEXT:    movl 12(%ecx), %esi
485; X86-SSE1-NEXT:    leal 1(%esi), %edi
486; X86-SSE1-NEXT:    movl %edi, 12(%ecx)
487; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
488; X86-SSE1-NEXT:    movl %esi, 4(%eax)
489; X86-SSE1-NEXT:    movl %edx, (%eax)
490; X86-SSE1-NEXT:    movl %ecx, 12(%eax)
491; X86-SSE1-NEXT:    popl %esi
492; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
493; X86-SSE1-NEXT:    popl %edi
494; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
495; X86-SSE1-NEXT:    retl $4
496;
497; X86-SSE41-LABEL: merge_4i32_i32_23u5_inc3:
498; X86-SSE41:       # %bb.0:
499; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
500; X86-SSE41-NEXT:    movups 8(%eax), %xmm0
501; X86-SSE41-NEXT:    incl 12(%eax)
502; X86-SSE41-NEXT:    retl
503  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
504  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
505  %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
506  %val0 = load i32, i32* %ptr0
507  %val1 = load i32, i32* %ptr1
508  %inc = add i32 %val1, 1
; The store to %ptr[3] sits between the first two loads and the load of
; %ptr[5]; the vector uses the pre-increment %val1.
509  store i32 %inc, i32* %ptr1
510  %val3 = load i32, i32* %ptr3
511  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
512  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
513  %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
514  ret <4 x i32> %res3
515}
516
; Builds a <4 x i32> with %ptr[3] in lane 0 and the constant 0 in lane 1;
; lanes 2 and 3 are left undef. The x86-64 SSE/AVX runs and the i686 SSE4.1
; run expect a single scalar movss load (upper lanes shown as zero). The i686
; run with only SSE1 expects one word copied through %ecx plus an immediate
; zero store at offset 4 of the memory addressed by %eax, returning with
; "retl $4".
517define <4 x i32> @merge_4i32_i32_3zuu(i32* %ptr) nounwind uwtable noinline ssp {
518; SSE-LABEL: merge_4i32_i32_3zuu:
519; SSE:       # %bb.0:
520; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
521; SSE-NEXT:    retq
522;
523; AVX-LABEL: merge_4i32_i32_3zuu:
524; AVX:       # %bb.0:
525; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
526; AVX-NEXT:    retq
527;
528; X86-SSE1-LABEL: merge_4i32_i32_3zuu:
529; X86-SSE1:       # %bb.0:
530; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
531; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
532; X86-SSE1-NEXT:    movl 12(%ecx), %ecx
533; X86-SSE1-NEXT:    movl %ecx, (%eax)
534; X86-SSE1-NEXT:    movl $0, 4(%eax)
535; X86-SSE1-NEXT:    retl $4
536;
537; X86-SSE41-LABEL: merge_4i32_i32_3zuu:
538; X86-SSE41:       # %bb.0:
539; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
540; X86-SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
541; X86-SSE41-NEXT:    retl
542  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 3
543  %val0 = load i32, i32* %ptr0
544  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
545  %res1 = insertelement <4 x i32> %res0, i32     0, i32 1
546  ret <4 x i32> %res1
547}
548
; Builds a <4 x i32> with %ptr[3] and %ptr[4] in lanes 0 and 1; lanes 2 and 3
; are left undef. The x86-64 SSE/AVX runs and the i686 SSE4.1 run expect the
; pair to be merged into one movsd load. The i686 run with only SSE1 expects
; the two words to be copied through general-purpose registers into the
; memory addressed by %eax, returning with "retl $4".
549define <4 x i32> @merge_4i32_i32_34uu(i32* %ptr) nounwind uwtable noinline ssp {
550; SSE-LABEL: merge_4i32_i32_34uu:
551; SSE:       # %bb.0:
552; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
553; SSE-NEXT:    retq
554;
555; AVX-LABEL: merge_4i32_i32_34uu:
556; AVX:       # %bb.0:
557; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
558; AVX-NEXT:    retq
559;
560; X86-SSE1-LABEL: merge_4i32_i32_34uu:
561; X86-SSE1:       # %bb.0:
562; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
563; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
564; X86-SSE1-NEXT:    movl 12(%ecx), %edx
565; X86-SSE1-NEXT:    movl 16(%ecx), %ecx
566; X86-SSE1-NEXT:    movl %ecx, 4(%eax)
567; X86-SSE1-NEXT:    movl %edx, (%eax)
568; X86-SSE1-NEXT:    retl $4
569;
570; X86-SSE41-LABEL: merge_4i32_i32_34uu:
571; X86-SSE41:       # %bb.0:
572; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
573; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
574; X86-SSE41-NEXT:    retl
575  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 3
576  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 4
577  %val0 = load i32, i32* %ptr0
578  %val1 = load i32, i32* %ptr1
579  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
580  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
581  ret <4 x i32> %res1
582}
583
; Builds a <4 x i32> on top of zeroinitializer with %ptr[4] and %ptr[5] in
; lanes 0 and 1, so lanes 2 and 3 stay zero. The x86-64 SSE/AVX runs and the
; i686 SSE4.1 run expect a single movsd load (upper half shown as zero). The
; i686 run with only SSE1 expects two words copied through general-purpose
; registers plus two immediate zero stores (offsets 8 and 12) into the memory
; addressed by %eax, returning with "retl $4".
584define <4 x i32> @merge_4i32_i32_45zz(i32* %ptr) nounwind uwtable noinline ssp {
585; SSE-LABEL: merge_4i32_i32_45zz:
586; SSE:       # %bb.0:
587; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
588; SSE-NEXT:    retq
589;
590; AVX-LABEL: merge_4i32_i32_45zz:
591; AVX:       # %bb.0:
592; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
593; AVX-NEXT:    retq
594;
595; X86-SSE1-LABEL: merge_4i32_i32_45zz:
596; X86-SSE1:       # %bb.0:
597; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
598; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
599; X86-SSE1-NEXT:    movl 16(%ecx), %edx
600; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
601; X86-SSE1-NEXT:    movl %ecx, 4(%eax)
602; X86-SSE1-NEXT:    movl %edx, (%eax)
603; X86-SSE1-NEXT:    movl $0, 12(%eax)
604; X86-SSE1-NEXT:    movl $0, 8(%eax)
605; X86-SSE1-NEXT:    retl $4
606;
607; X86-SSE41-LABEL: merge_4i32_i32_45zz:
608; X86-SSE41:       # %bb.0:
609; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
610; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
611; X86-SSE41-NEXT:    retl
612  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 4
613  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
614  %val0 = load i32, i32* %ptr0
615  %val1 = load i32, i32* %ptr1
616  %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
617  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
618  ret <4 x i32> %res1
619}
620
; Same lane layout as merge_4i32_i32_45zz (%ptr[4], %ptr[5] into lanes 0 and 1
; of a zero vector), but the value loaded from %ptr[4] is incremented and
; stored back before %ptr[5] is loaded. The vector runs expect the merged
; movsd load first, followed by an in-memory "incl" at byte offset 16. The
; i686 run with only SSE1 expects a scalar copy with the increment done via
; leal + movl.
621define <4 x i32> @merge_4i32_i32_45zz_inc4(i32* %ptr) nounwind uwtable noinline ssp {
622; SSE-LABEL: merge_4i32_i32_45zz_inc4:
623; SSE:       # %bb.0:
624; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
625; SSE-NEXT:    incl 16(%rdi)
626; SSE-NEXT:    retq
627;
628; AVX-LABEL: merge_4i32_i32_45zz_inc4:
629; AVX:       # %bb.0:
630; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
631; AVX-NEXT:    incl 16(%rdi)
632; AVX-NEXT:    retq
633;
634; X86-SSE1-LABEL: merge_4i32_i32_45zz_inc4:
635; X86-SSE1:       # %bb.0:
636; X86-SSE1-NEXT:    pushl %edi
637; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
638; X86-SSE1-NEXT:    pushl %esi
639; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
640; X86-SSE1-NEXT:    .cfi_offset %esi, -12
641; X86-SSE1-NEXT:    .cfi_offset %edi, -8
642; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
643; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
644; X86-SSE1-NEXT:    movl 16(%ecx), %edx
645; X86-SSE1-NEXT:    movl 20(%ecx), %esi
646; X86-SSE1-NEXT:    leal 1(%edx), %edi
647; X86-SSE1-NEXT:    movl %edi, 16(%ecx)
648; X86-SSE1-NEXT:    movl %esi, 4(%eax)
649; X86-SSE1-NEXT:    movl %edx, (%eax)
650; X86-SSE1-NEXT:    movl $0, 12(%eax)
651; X86-SSE1-NEXT:    movl $0, 8(%eax)
652; X86-SSE1-NEXT:    popl %esi
653; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
654; X86-SSE1-NEXT:    popl %edi
655; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
656; X86-SSE1-NEXT:    retl $4
657;
658; X86-SSE41-LABEL: merge_4i32_i32_45zz_inc4:
659; X86-SSE41:       # %bb.0:
660; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
661; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
662; X86-SSE41-NEXT:    incl 16(%eax)
663; X86-SSE41-NEXT:    retl
664  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 4
665  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
666  %val0 = load i32, i32* %ptr0
667  %inc = add i32 %val0, 1
; The store to %ptr[4] sits between the two loads; the vector uses the
; pre-increment %val0.
668  store i32 %inc, i32* %ptr0
669  %val1 = load i32, i32* %ptr1
670  %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
671  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
672  ret <4 x i32> %res1
673}
674
; Same lane layout as merge_4i32_i32_45zz (%ptr[4], %ptr[5] into lanes 0 and 1
; of a zero vector), but the value loaded from %ptr[5] is incremented and
; stored back after both loads. The vector runs expect the merged movsd load
; first, followed by an in-memory "incl" at byte offset 20. The i686 run with
; only SSE1 expects a scalar copy with the increment done via leal + movl.
675define <4 x i32> @merge_4i32_i32_45zz_inc5(i32* %ptr) nounwind uwtable noinline ssp {
676; SSE-LABEL: merge_4i32_i32_45zz_inc5:
677; SSE:       # %bb.0:
678; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
679; SSE-NEXT:    incl 20(%rdi)
680; SSE-NEXT:    retq
681;
682; AVX-LABEL: merge_4i32_i32_45zz_inc5:
683; AVX:       # %bb.0:
684; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
685; AVX-NEXT:    incl 20(%rdi)
686; AVX-NEXT:    retq
687;
688; X86-SSE1-LABEL: merge_4i32_i32_45zz_inc5:
689; X86-SSE1:       # %bb.0:
690; X86-SSE1-NEXT:    pushl %edi
691; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
692; X86-SSE1-NEXT:    pushl %esi
693; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
694; X86-SSE1-NEXT:    .cfi_offset %esi, -12
695; X86-SSE1-NEXT:    .cfi_offset %edi, -8
696; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
697; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
698; X86-SSE1-NEXT:    movl 16(%ecx), %edx
699; X86-SSE1-NEXT:    movl 20(%ecx), %esi
700; X86-SSE1-NEXT:    leal 1(%esi), %edi
701; X86-SSE1-NEXT:    movl %edi, 20(%ecx)
702; X86-SSE1-NEXT:    movl %esi, 4(%eax)
703; X86-SSE1-NEXT:    movl %edx, (%eax)
704; X86-SSE1-NEXT:    movl $0, 12(%eax)
705; X86-SSE1-NEXT:    movl $0, 8(%eax)
706; X86-SSE1-NEXT:    popl %esi
707; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
708; X86-SSE1-NEXT:    popl %edi
709; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
710; X86-SSE1-NEXT:    retl $4
711;
712; X86-SSE41-LABEL: merge_4i32_i32_45zz_inc5:
713; X86-SSE41:       # %bb.0:
714; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
715; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
716; X86-SSE41-NEXT:    incl 20(%eax)
717; X86-SSE41-NEXT:    retl
718  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 4
719  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
720  %val0 = load i32, i32* %ptr0
721  %val1 = load i32, i32* %ptr1
722  %inc = add i32 %val1, 1
; The store to %ptr[5] follows both loads; the vector uses the pre-increment
; %val1.
723  store i32 %inc, i32* %ptr1
724  %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
725  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
726  ret <4 x i32> %res1
727}
728
; Builds an <8 x i16> from %ptr[2], [3], [5], [6], [7] and [9] in lanes 0, 1,
; 3, 4, 5 and 7; lanes 2 and 6 are never written and stay undef. The x86-64
; SSE/AVX runs and the i686 SSE4.1 run expect one unaligned 16-byte load at
; byte offset 4 (which also covers the undef lanes). The i686 run with only
; SSE1 expects a mix of 32-bit and 16-bit copies through general-purpose
; registers into the memory addressed by %eax, returning with "retl $4".
729define <8 x i16> @merge_8i16_i16_23u567u9(i16* %ptr) nounwind uwtable noinline ssp {
730; SSE-LABEL: merge_8i16_i16_23u567u9:
731; SSE:       # %bb.0:
732; SSE-NEXT:    movups 4(%rdi), %xmm0
733; SSE-NEXT:    retq
734;
735; AVX-LABEL: merge_8i16_i16_23u567u9:
736; AVX:       # %bb.0:
737; AVX-NEXT:    vmovups 4(%rdi), %xmm0
738; AVX-NEXT:    retq
739;
740; X86-SSE1-LABEL: merge_8i16_i16_23u567u9:
741; X86-SSE1:       # %bb.0:
742; X86-SSE1-NEXT:    pushl %edi
743; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
744; X86-SSE1-NEXT:    pushl %esi
745; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
746; X86-SSE1-NEXT:    .cfi_offset %esi, -12
747; X86-SSE1-NEXT:    .cfi_offset %edi, -8
748; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
749; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
750; X86-SSE1-NEXT:    movl 4(%ecx), %edx
751; X86-SSE1-NEXT:    movl 10(%ecx), %esi
752; X86-SSE1-NEXT:    movzwl 14(%ecx), %edi
753; X86-SSE1-NEXT:    movzwl 18(%ecx), %ecx
754; X86-SSE1-NEXT:    movw %di, 10(%eax)
755; X86-SSE1-NEXT:    movw %cx, 14(%eax)
756; X86-SSE1-NEXT:    movl %esi, 6(%eax)
757; X86-SSE1-NEXT:    movl %edx, (%eax)
758; X86-SSE1-NEXT:    popl %esi
759; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
760; X86-SSE1-NEXT:    popl %edi
761; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
762; X86-SSE1-NEXT:    retl $4
763;
764; X86-SSE41-LABEL: merge_8i16_i16_23u567u9:
765; X86-SSE41:       # %bb.0:
766; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
767; X86-SSE41-NEXT:    movups 4(%eax), %xmm0
768; X86-SSE41-NEXT:    retl
769  %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 2
770  %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 3
771  %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 5
772  %ptr4 = getelementptr inbounds i16, i16* %ptr, i64 6
773  %ptr5 = getelementptr inbounds i16, i16* %ptr, i64 7
774  %ptr7 = getelementptr inbounds i16, i16* %ptr, i64 9
775  %val0 = load i16, i16* %ptr0
776  %val1 = load i16, i16* %ptr1
777  %val3 = load i16, i16* %ptr3
778  %val4 = load i16, i16* %ptr4
779  %val5 = load i16, i16* %ptr5
780  %val7 = load i16, i16* %ptr7
781  %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
782  %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
783  %res3 = insertelement <8 x i16> %res1, i16 %val3, i32 3
784  %res4 = insertelement <8 x i16> %res3, i16 %val4, i32 4
785  %res5 = insertelement <8 x i16> %res4, i16 %val5, i32 5
786  %res7 = insertelement <8 x i16> %res5, i16 %val7, i32 7
787  ret <8 x i16> %res7
788}
789
; Builds an <8 x i16> with %ptr[3] and %ptr[4] in lanes 0 and 1; lanes 2..7
; are left undef. The x86-64 SSE/AVX runs and the i686 SSE4.1 run expect the
; two 16-bit loads to be merged into one 32-bit movss load. The i686 run with
; only SSE1 expects a single 32-bit movl from byte offset 6 stored to the
; memory addressed by %eax, returning with "retl $4".
790define <8 x i16> @merge_8i16_i16_34uuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
791; SSE-LABEL: merge_8i16_i16_34uuuuuu:
792; SSE:       # %bb.0:
793; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
794; SSE-NEXT:    retq
795;
796; AVX-LABEL: merge_8i16_i16_34uuuuuu:
797; AVX:       # %bb.0:
798; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
799; AVX-NEXT:    retq
800;
801; X86-SSE1-LABEL: merge_8i16_i16_34uuuuuu:
802; X86-SSE1:       # %bb.0:
803; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
804; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
805; X86-SSE1-NEXT:    movl 6(%ecx), %ecx
806; X86-SSE1-NEXT:    movl %ecx, (%eax)
807; X86-SSE1-NEXT:    retl $4
808;
809; X86-SSE41-LABEL: merge_8i16_i16_34uuuuuu:
810; X86-SSE41:       # %bb.0:
811; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
812; X86-SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
813; X86-SSE41-NEXT:    retl
814  %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 3
815  %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 4
816  %val0 = load i16, i16* %ptr0
817  %val1 = load i16, i16* %ptr1
818  %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
819  %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
820  ret <8 x i16> %res1
821}
822
; Builds an <8 x i16> with lanes 0 and 1 loaded from i16 elements 4 and 5 of
; %ptr, lane 2 left undef, lane 3 loaded from element 7, and lanes 4-7 set to
; constant zero.
; The checks expect one 64-bit load that zeroes the upper half of the register
; (movsd/vmovsd) on the vector-returning runs. On the i686 run that only has
; SSE1 the checks instead show scalar integer moves writing the result through
; a pointer, including explicit zero stores for the upper eight bytes.
823define <8 x i16> @merge_8i16_i16_45u7zzzz(i16* %ptr) nounwind uwtable noinline ssp {
824; SSE-LABEL: merge_8i16_i16_45u7zzzz:
825; SSE:       # %bb.0:
826; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
827; SSE-NEXT:    retq
828;
829; AVX-LABEL: merge_8i16_i16_45u7zzzz:
830; AVX:       # %bb.0:
831; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
832; AVX-NEXT:    retq
833;
834; X86-SSE1-LABEL: merge_8i16_i16_45u7zzzz:
835; X86-SSE1:       # %bb.0:
836; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
837; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
838; X86-SSE1-NEXT:    movl 8(%ecx), %edx
839; X86-SSE1-NEXT:    movzwl 14(%ecx), %ecx
840; X86-SSE1-NEXT:    movw %cx, 6(%eax)
841; X86-SSE1-NEXT:    movl %edx, (%eax)
842; X86-SSE1-NEXT:    movl $0, 12(%eax)
843; X86-SSE1-NEXT:    movl $0, 8(%eax)
844; X86-SSE1-NEXT:    retl $4
845;
846; X86-SSE41-LABEL: merge_8i16_i16_45u7zzzz:
847; X86-SSE41:       # %bb.0:
848; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
849; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
850; X86-SSE41-NEXT:    retl
851  %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 4
852  %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 5
853  %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 7
854  %val0 = load i16, i16* %ptr0
855  %val1 = load i16, i16* %ptr1
856  %val3 = load i16, i16* %ptr3
857  %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
858  %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
859  %res3 = insertelement <8 x i16> %res1, i16 %val3, i32 3
860  %res4 = insertelement <8 x i16> %res3, i16     0, i32 4
861  %res5 = insertelement <8 x i16> %res4, i16     0, i32 5
862  %res6 = insertelement <8 x i16> %res5, i16     0, i32 6
863  %res7 = insertelement <8 x i16> %res6, i16     0, i32 7
864  ret <8 x i16> %res7
865}
866
; Builds a <16 x i8> in which lane N is loaded from byte N of %ptr for every
; lane except 2 and 14, which are never inserted and stay undef.
; The checks expect the fourteen byte loads to be merged into a single
; unaligned 16-byte load (movups/vmovups) on the vector-returning runs. On the
; i686 run that only has SSE1 the checks show a sequence of scalar integer
; loads and stores that skips byte offsets 2 and 14.
867define <16 x i8> @merge_16i8_i8_01u3456789ABCDuF(i8* %ptr) nounwind uwtable noinline ssp {
868; SSE-LABEL: merge_16i8_i8_01u3456789ABCDuF:
869; SSE:       # %bb.0:
870; SSE-NEXT:    movups (%rdi), %xmm0
871; SSE-NEXT:    retq
872;
873; AVX-LABEL: merge_16i8_i8_01u3456789ABCDuF:
874; AVX:       # %bb.0:
875; AVX-NEXT:    vmovups (%rdi), %xmm0
876; AVX-NEXT:    retq
877;
878; X86-SSE1-LABEL: merge_16i8_i8_01u3456789ABCDuF:
879; X86-SSE1:       # %bb.0:
880; X86-SSE1-NEXT:    pushl %ebp
881; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
882; X86-SSE1-NEXT:    pushl %ebx
883; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
884; X86-SSE1-NEXT:    pushl %edi
885; X86-SSE1-NEXT:    .cfi_def_cfa_offset 16
886; X86-SSE1-NEXT:    pushl %esi
887; X86-SSE1-NEXT:    .cfi_def_cfa_offset 20
888; X86-SSE1-NEXT:    .cfi_offset %esi, -20
889; X86-SSE1-NEXT:    .cfi_offset %edi, -16
890; X86-SSE1-NEXT:    .cfi_offset %ebx, -12
891; X86-SSE1-NEXT:    .cfi_offset %ebp, -8
892; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
893; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
894; X86-SSE1-NEXT:    movzwl (%ecx), %ebp
895; X86-SSE1-NEXT:    movl 3(%ecx), %esi
896; X86-SSE1-NEXT:    movl 7(%ecx), %edi
897; X86-SSE1-NEXT:    movzwl 11(%ecx), %ebx
898; X86-SSE1-NEXT:    movb 13(%ecx), %dl
899; X86-SSE1-NEXT:    movb 15(%ecx), %cl
900; X86-SSE1-NEXT:    movb %dl, 13(%eax)
901; X86-SSE1-NEXT:    movb %cl, 15(%eax)
902; X86-SSE1-NEXT:    movw %bx, 11(%eax)
903; X86-SSE1-NEXT:    movl %edi, 7(%eax)
904; X86-SSE1-NEXT:    movl %esi, 3(%eax)
905; X86-SSE1-NEXT:    movw %bp, (%eax)
906; X86-SSE1-NEXT:    popl %esi
907; X86-SSE1-NEXT:    .cfi_def_cfa_offset 16
908; X86-SSE1-NEXT:    popl %edi
909; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
910; X86-SSE1-NEXT:    popl %ebx
911; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
912; X86-SSE1-NEXT:    popl %ebp
913; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
914; X86-SSE1-NEXT:    retl $4
915;
916; X86-SSE41-LABEL: merge_16i8_i8_01u3456789ABCDuF:
917; X86-SSE41:       # %bb.0:
918; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
919; X86-SSE41-NEXT:    movups (%eax), %xmm0
920; X86-SSE41-NEXT:    retl
921  %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
922  %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
923  %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 3
924  %ptr4 = getelementptr inbounds i8, i8* %ptr, i64 4
925  %ptr5 = getelementptr inbounds i8, i8* %ptr, i64 5
926  %ptr6 = getelementptr inbounds i8, i8* %ptr, i64 6
927  %ptr7 = getelementptr inbounds i8, i8* %ptr, i64 7
928  %ptr8 = getelementptr inbounds i8, i8* %ptr, i64 8
929  %ptr9 = getelementptr inbounds i8, i8* %ptr, i64 9
930  %ptrA = getelementptr inbounds i8, i8* %ptr, i64 10
931  %ptrB = getelementptr inbounds i8, i8* %ptr, i64 11
932  %ptrC = getelementptr inbounds i8, i8* %ptr, i64 12
933  %ptrD = getelementptr inbounds i8, i8* %ptr, i64 13
934  %ptrF = getelementptr inbounds i8, i8* %ptr, i64 15
935  %val0 = load i8, i8* %ptr0
936  %val1 = load i8, i8* %ptr1
937  %val3 = load i8, i8* %ptr3
938  %val4 = load i8, i8* %ptr4
939  %val5 = load i8, i8* %ptr5
940  %val6 = load i8, i8* %ptr6
941  %val7 = load i8, i8* %ptr7
942  %val8 = load i8, i8* %ptr8
943  %val9 = load i8, i8* %ptr9
944  %valA = load i8, i8* %ptrA
945  %valB = load i8, i8* %ptrB
946  %valC = load i8, i8* %ptrC
947  %valD = load i8, i8* %ptrD
948  %valF = load i8, i8* %ptrF
949  %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
950  %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
951  %res3 = insertelement <16 x i8> %res1, i8 %val3, i32 3
952  %res4 = insertelement <16 x i8> %res3, i8 %val4, i32 4
953  %res5 = insertelement <16 x i8> %res4, i8 %val5, i32 5
954  %res6 = insertelement <16 x i8> %res5, i8 %val6, i32 6
955  %res7 = insertelement <16 x i8> %res6, i8 %val7, i32 7
956  %res8 = insertelement <16 x i8> %res7, i8 %val8, i32 8
957  %res9 = insertelement <16 x i8> %res8, i8 %val9, i32 9
958  %resA = insertelement <16 x i8> %res9, i8 %valA, i32 10
959  %resB = insertelement <16 x i8> %resA, i8 %valB, i32 11
960  %resC = insertelement <16 x i8> %resB, i8 %valC, i32 12
961  %resD = insertelement <16 x i8> %resC, i8 %valD, i32 13
962  %resF = insertelement <16 x i8> %resD, i8 %valF, i32 15
963  ret <16 x i8> %resF
964}
965
; Builds a <16 x i8> with lanes 0, 1 and 3 loaded from bytes 0, 1 and 3 of
; %ptr, lanes 6, 7, 13, 14 and 15 set to constant zero, and every other lane
; left undef.
; The checks expect a single 32-bit load that zeroes the rest of the register
; (movss/vmovss) on the vector-returning runs. On the i686 run that only has
; SSE1 the checks show scalar integer moves plus explicit zero stores.
966define <16 x i8> @merge_16i8_i8_01u3uuzzuuuuuzzz(i8* %ptr) nounwind uwtable noinline ssp {
967; SSE-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
968; SSE:       # %bb.0:
969; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
970; SSE-NEXT:    retq
971;
972; AVX-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
973; AVX:       # %bb.0:
974; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
975; AVX-NEXT:    retq
976;
977; X86-SSE1-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
978; X86-SSE1:       # %bb.0:
979; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
980; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
981; X86-SSE1-NEXT:    movzwl (%ecx), %edx
982; X86-SSE1-NEXT:    movb 3(%ecx), %cl
983; X86-SSE1-NEXT:    movb %cl, 3(%eax)
984; X86-SSE1-NEXT:    movw %dx, (%eax)
985; X86-SSE1-NEXT:    movb $0, 15(%eax)
986; X86-SSE1-NEXT:    movw $0, 13(%eax)
987; X86-SSE1-NEXT:    movw $0, 6(%eax)
988; X86-SSE1-NEXT:    retl $4
989;
990; X86-SSE41-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
991; X86-SSE41:       # %bb.0:
992; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
993; X86-SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
994; X86-SSE41-NEXT:    retl
995  %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
996  %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
997  %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 3
998  %val0 = load i8, i8* %ptr0
999  %val1 = load i8, i8* %ptr1
1000  %val3 = load i8, i8* %ptr3
1001  %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
1002  %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
1003  %res3 = insertelement <16 x i8> %res1, i8 %val3, i32 3
1004  %res6 = insertelement <16 x i8> %res3, i8     0, i32 6
1005  %res7 = insertelement <16 x i8> %res6, i8     0, i32 7
1006  %resD = insertelement <16 x i8> %res7, i8     0, i32 13
1007  %resE = insertelement <16 x i8> %resD, i8     0, i32 14
1008  %resF = insertelement <16 x i8> %resE, i8     0, i32 15
1009  ret <16 x i8> %resF
1010}
1011
; Builds a <16 x i8> with lanes 0-3, 6 and 7 loaded from the same-numbered
; bytes of %ptr, lanes 13-15 set to constant zero, and every other lane left
; undef.
; The checks expect a single 64-bit load that zeroes the upper half of the
; register (movsd/vmovsd) on the vector-returning runs. On the i686 run that
; only has SSE1 the checks show scalar integer moves plus explicit zero stores
; at byte offsets 13-15.
1012define <16 x i8> @merge_16i8_i8_0123uu67uuuuuzzz(i8* %ptr) nounwind uwtable noinline ssp {
1013; SSE-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
1014; SSE:       # %bb.0:
1015; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
1016; SSE-NEXT:    retq
1017;
1018; AVX-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
1019; AVX:       # %bb.0:
1020; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
1021; AVX-NEXT:    retq
1022;
1023; X86-SSE1-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
1024; X86-SSE1:       # %bb.0:
1025; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
1026; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1027; X86-SSE1-NEXT:    movl (%ecx), %edx
1028; X86-SSE1-NEXT:    movzwl 6(%ecx), %ecx
1029; X86-SSE1-NEXT:    movw %cx, 6(%eax)
1030; X86-SSE1-NEXT:    movl %edx, (%eax)
1031; X86-SSE1-NEXT:    movb $0, 15(%eax)
1032; X86-SSE1-NEXT:    movw $0, 13(%eax)
1033; X86-SSE1-NEXT:    retl $4
1034;
1035; X86-SSE41-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
1036; X86-SSE41:       # %bb.0:
1037; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1038; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
1039; X86-SSE41-NEXT:    retl
1040  %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
1041  %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
1042  %ptr2 = getelementptr inbounds i8, i8* %ptr, i64 2
1043  %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 3
1044  %ptr6 = getelementptr inbounds i8, i8* %ptr, i64 6
1045  %ptr7 = getelementptr inbounds i8, i8* %ptr, i64 7
1046  %val0 = load i8, i8* %ptr0
1047  %val1 = load i8, i8* %ptr1
1048  %val2 = load i8, i8* %ptr2
1049  %val3 = load i8, i8* %ptr3
1050  %val6 = load i8, i8* %ptr6
1051  %val7 = load i8, i8* %ptr7
1052  %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
1053  %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
1054  %res2 = insertelement <16 x i8> %res1, i8 %val2, i32 2
1055  %res3 = insertelement <16 x i8> %res2, i8 %val3, i32 3
1056  %res6 = insertelement <16 x i8> %res3, i8 %val6, i32 6
1057  %res7 = insertelement <16 x i8> %res6, i8 %val7, i32 7
1058  %resD = insertelement <16 x i8> %res7, i8     0, i32 13
1059  %resE = insertelement <16 x i8> %resD, i8     0, i32 14
1060  %resF = insertelement <16 x i8> %resE, i8     0, i32 15
1061  ret <16 x i8> %resF
1062}
1063
; Loads one i32 from %src, splats it across a <4 x i32>, shifts it right by a
; vector whose lane 0 is 0 and whose other lanes are undef, masks it with
; <-1, 0, 0, 0>, and stores the result to %dst.
; The checks expect this whole chain to collapse to one zero-extending 32-bit
; load (movss/vmovss) followed by an aligned vector store on every run except
; the i686 run that only has SSE1, where the checks show two movss loads
; combined with andps before the store.
1064define void @merge_4i32_i32_combine(<4 x i32>* %dst, i32* %src) {
1065; SSE-LABEL: merge_4i32_i32_combine:
1066; SSE:       # %bb.0:
1067; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1068; SSE-NEXT:    movaps %xmm0, (%rdi)
1069; SSE-NEXT:    retq
1070;
1071; AVX-LABEL: merge_4i32_i32_combine:
1072; AVX:       # %bb.0:
1073; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1074; AVX-NEXT:    vmovaps %xmm0, (%rdi)
1075; AVX-NEXT:    retq
1076;
1077; X86-SSE1-LABEL: merge_4i32_i32_combine:
1078; X86-SSE1:       # %bb.0:
1079; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
1080; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1081; X86-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1082; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1083; X86-SSE1-NEXT:    andps %xmm0, %xmm1
1084; X86-SSE1-NEXT:    movaps %xmm1, (%eax)
1085; X86-SSE1-NEXT:    retl
1086;
1087; X86-SSE41-LABEL: merge_4i32_i32_combine:
1088; X86-SSE41:       # %bb.0:
1089; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1090; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1091; X86-SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1092; X86-SSE41-NEXT:    movaps %xmm0, (%eax)
1093; X86-SSE41-NEXT:    retl
1094 %1 = getelementptr i32, i32* %src, i32 0
1095 %2 = load i32, i32* %1
1096 %3 = insertelement <4 x i32> undef, i32 %2, i32 0
1097 %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
1098 %5 = lshr <4 x i32> %4, <i32 0, i32 undef, i32 undef, i32 undef>
1099 %6 = and <4 x i32> %5, <i32 -1, i32 0, i32 0, i32 0>
1100 store <4 x i32> %6, <4 x i32>* %dst
1101 ret void
1102}
1103
1104;
1105; Consecutive load sequences that include any volatile load must not be merged into a single wide load.
1106;
1107
; Builds a <2 x i64> from i64 elements 1 and 2 of %ptr, where BOTH loads are
; volatile.
; The checks expect the two loads to stay separate rather than becoming one
; 16-byte load: the 64-bit runs show two movsd/vmovsd loads joined with
; movlhps/vmovlhps, the i686 SSE4.1 run shows a movd followed by three pinsrd
; from memory, and the i686 run that only has SSE1 shows four scalar movl
; loads.
1108define <2 x i64> @merge_2i64_i64_12_volatile(i64* %ptr) nounwind uwtable noinline ssp {
1109; SSE-LABEL: merge_2i64_i64_12_volatile:
1110; SSE:       # %bb.0:
1111; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
1112; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
1113; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1114; SSE-NEXT:    retq
1115;
1116; AVX-LABEL: merge_2i64_i64_12_volatile:
1117; AVX:       # %bb.0:
1118; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
1119; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
1120; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1121; AVX-NEXT:    retq
1122;
1123; X86-SSE1-LABEL: merge_2i64_i64_12_volatile:
1124; X86-SSE1:       # %bb.0:
1125; X86-SSE1-NEXT:    pushl %edi
1126; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
1127; X86-SSE1-NEXT:    pushl %esi
1128; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
1129; X86-SSE1-NEXT:    .cfi_offset %esi, -12
1130; X86-SSE1-NEXT:    .cfi_offset %edi, -8
1131; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
1132; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1133; X86-SSE1-NEXT:    movl 8(%ecx), %edx
1134; X86-SSE1-NEXT:    movl 12(%ecx), %esi
1135; X86-SSE1-NEXT:    movl 16(%ecx), %edi
1136; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
1137; X86-SSE1-NEXT:    movl %ecx, 12(%eax)
1138; X86-SSE1-NEXT:    movl %edi, 8(%eax)
1139; X86-SSE1-NEXT:    movl %esi, 4(%eax)
1140; X86-SSE1-NEXT:    movl %edx, (%eax)
1141; X86-SSE1-NEXT:    popl %esi
1142; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
1143; X86-SSE1-NEXT:    popl %edi
1144; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
1145; X86-SSE1-NEXT:    retl $4
1146;
1147; X86-SSE41-LABEL: merge_2i64_i64_12_volatile:
1148; X86-SSE41:       # %bb.0:
1149; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1150; X86-SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1151; X86-SSE41-NEXT:    pinsrd $1, 12(%eax), %xmm0
1152; X86-SSE41-NEXT:    pinsrd $2, 16(%eax), %xmm0
1153; X86-SSE41-NEXT:    pinsrd $3, 20(%eax), %xmm0
1154; X86-SSE41-NEXT:    retl
1155  %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
1156  %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 2
1157  %val0 = load volatile i64, i64* %ptr0
1158  %val1 = load volatile i64, i64* %ptr1
1159  %res0 = insertelement <2 x i64> undef, i64 %val0, i32 0
1160  %res1 = insertelement <2 x i64> %res0, i64 %val1, i32 1
1161  ret <2 x i64> %res1
1162}
1163
; Builds a <4 x float> from float elements 2, 3, 4 and 5 of %ptr, where only
; the first load (element 2) is volatile and the other three are ordinary
; loads.
; The checks expect that no single 16-byte load is formed. Runs without
; SSE4.1 show two movss loads, an unpcklps, and a movhps that loads the upper
; two lanes from memory; runs with SSE4.1 or AVX show one movss/vmovss load
; followed by three insertps/vinsertps from memory.
1164define <4 x float> @merge_4f32_f32_2345_volatile(float* %ptr) nounwind uwtable noinline ssp {
1165; SSE2-LABEL: merge_4f32_f32_2345_volatile:
1166; SSE2:       # %bb.0:
1167; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1168; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1169; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1170; SSE2-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
1171; SSE2-NEXT:    retq
1172;
1173; SSE41-LABEL: merge_4f32_f32_2345_volatile:
1174; SSE41:       # %bb.0:
1175; SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1176; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
1177; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
1178; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
1179; SSE41-NEXT:    retq
1180;
1181; AVX-LABEL: merge_4f32_f32_2345_volatile:
1182; AVX:       # %bb.0:
1183; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1184; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
1185; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
1186; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
1187; AVX-NEXT:    retq
1188;
1189; X86-SSE1-LABEL: merge_4f32_f32_2345_volatile:
1190; X86-SSE1:       # %bb.0:
1191; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
1192; X86-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1193; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1194; X86-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1195; X86-SSE1-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
1196; X86-SSE1-NEXT:    retl
1197;
1198; X86-SSE41-LABEL: merge_4f32_f32_2345_volatile:
1199; X86-SSE41:       # %bb.0:
1200; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1201; X86-SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1202; X86-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
1203; X86-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
1204; X86-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
1205; X86-SSE41-NEXT:    retl
1206  %ptr0 = getelementptr inbounds float, float* %ptr, i64 2
1207  %ptr1 = getelementptr inbounds float, float* %ptr, i64 3
1208  %ptr2 = getelementptr inbounds float, float* %ptr, i64 4
1209  %ptr3 = getelementptr inbounds float, float* %ptr, i64 5
1210  %val0 = load volatile float, float* %ptr0
1211  %val1 = load float, float* %ptr1
1212  %val2 = load float, float* %ptr2
1213  %val3 = load float, float* %ptr3
1214  %res0 = insertelement <4 x float> undef, float %val0, i32 0
1215  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
1216  %res2 = insertelement <4 x float> %res1, float %val2, i32 2
1217  %res3 = insertelement <4 x float> %res2, float %val3, i32 3
1218  ret <4 x float> %res3
1219}
1220
1221;
1222; Non-consecutive test.
1223;
1224
; Builds a <4 x float> from two separate pointer arguments: lane 0 is the
; float loaded from %ptr0, lane 1 is the constant 0.0, and lanes 2 and 3 are
; both the float loaded from %ptr1.
; The checks expect two scalar movss/vmovss loads combined by a single
; shufps/vshufps on every run.
1225define <4 x float> @merge_4f32_f32_X0YY(float* %ptr0, float* %ptr1) nounwind uwtable noinline ssp {
1226; SSE-LABEL: merge_4f32_f32_X0YY:
1227; SSE:       # %bb.0:
1228; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1229; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1230; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
1231; SSE-NEXT:    retq
1232;
1233; AVX-LABEL: merge_4f32_f32_X0YY:
1234; AVX:       # %bb.0:
1235; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1236; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1237; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
1238; AVX-NEXT:    retq
1239;
1240; X86-SSE-LABEL: merge_4f32_f32_X0YY:
1241; X86-SSE:       # %bb.0:
1242; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1243; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1244; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1245; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1246; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
1247; X86-SSE-NEXT:    retl
1248  %val0 = load float, float* %ptr0, align 4
1249  %val1 = load float, float* %ptr1, align 4
1250  %res0 = insertelement <4 x float> undef, float %val0, i32 0
1251  %res1 = insertelement <4 x float> %res0, float 0.000000e+00, i32 1
1252  %res2 = insertelement <4 x float> %res1, float %val1, i32 2
1253  %res3 = insertelement <4 x float> %res2, float %val1, i32 3
1254  ret <4 x float> %res3
1255}
1256
1257;
1258; Extension tests.
1259;
1260
1261; PR31309
; Loads one i32 from %ptr, zero-extends it to i128, and bitcasts that to
; <4 x i32>, so lane 0 holds the loaded value and lanes 1-3 are zero.
; The checks expect a single zero-extending 32-bit vector load (movss/vmovss)
; on the vector-returning runs. On the i686 run that only has SSE1 the checks
; show one scalar movl load followed by stores of the value and three zero
; words through a pointer.
1262define <4 x i32> @load_i32_zext_i128_v4i32(i32* %ptr) {
1263; SSE-LABEL: load_i32_zext_i128_v4i32:
1264; SSE:       # %bb.0:
1265; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1266; SSE-NEXT:    retq
1267;
1268; AVX-LABEL: load_i32_zext_i128_v4i32:
1269; AVX:       # %bb.0:
1270; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1271; AVX-NEXT:    retq
1272;
1273; X86-SSE1-LABEL: load_i32_zext_i128_v4i32:
1274; X86-SSE1:       # %bb.0:
1275; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
1276; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1277; X86-SSE1-NEXT:    movl (%ecx), %ecx
1278; X86-SSE1-NEXT:    movl %ecx, (%eax)
1279; X86-SSE1-NEXT:    movl $0, 12(%eax)
1280; X86-SSE1-NEXT:    movl $0, 8(%eax)
1281; X86-SSE1-NEXT:    movl $0, 4(%eax)
1282; X86-SSE1-NEXT:    retl $4
1283;
1284; X86-SSE41-LABEL: load_i32_zext_i128_v4i32:
1285; X86-SSE41:       # %bb.0:
1286; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1287; X86-SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1288; X86-SSE41-NEXT:    retl
1289  %1 = load i32, i32* %ptr
1290  %2 = zext i32 %1 to i128
1291  %3 = bitcast i128 %2 to <4 x i32>
1292  ret <4 x i32> %3
1293}
1294