1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSE3
3; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3,+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
4; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
5; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
6
; Horizontal add of two <4 x float> vectors, spelled out as scalar IR:
;   result = <A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3]>
; The autogenerated assertions below expect the whole extract/fadd/insert
; chain to be emitted as one haddps (vhaddps for the AVX runs).
7define <4 x float> @hadd_ps_test1(<4 x float> %A, <4 x float> %B) {
8; SSE-LABEL: hadd_ps_test1:
9; SSE:       # BB#0:
10; SSE-NEXT:    haddps %xmm1, %xmm0
11; SSE-NEXT:    retq
12;
13; AVX-LABEL: hadd_ps_test1:
14; AVX:       # BB#0:
15; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
16; AVX-NEXT:    retq
  ; lane 0 = A[0] + A[1]
17  %vecext = extractelement <4 x float> %A, i32 0
18  %vecext1 = extractelement <4 x float> %A, i32 1
19  %add = fadd float %vecext, %vecext1
20  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  ; lane 1 = A[2] + A[3]
21  %vecext2 = extractelement <4 x float> %A, i32 2
22  %vecext3 = extractelement <4 x float> %A, i32 3
23  %add4 = fadd float %vecext2, %vecext3
24  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  ; lane 2 = B[0] + B[1]
25  %vecext6 = extractelement <4 x float> %B, i32 0
26  %vecext7 = extractelement <4 x float> %B, i32 1
27  %add8 = fadd float %vecext6, %vecext7
28  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
  ; lane 3 = B[2] + B[3]
29  %vecext10 = extractelement <4 x float> %B, i32 2
30  %vecext11 = extractelement <4 x float> %B, i32 3
31  %add12 = fadd float %vecext10, %vecext11
32  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
33  ret <4 x float> %vecinit13
34}
35
; Horizontal add of two <4 x float> vectors where the insertelement chain
; fills the result lanes out of order (1, 0, 3, 2). The final value is still
;   <A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3]>
; and the assertions below expect a single haddps / vhaddps regardless of the
; order in which the lanes were written.
36define <4 x float> @hadd_ps_test2(<4 x float> %A, <4 x float> %B) {
37; SSE-LABEL: hadd_ps_test2:
38; SSE:       # BB#0:
39; SSE-NEXT:    haddps %xmm1, %xmm0
40; SSE-NEXT:    retq
41;
42; AVX-LABEL: hadd_ps_test2:
43; AVX:       # BB#0:
44; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
45; AVX-NEXT:    retq
  ; lane 1 = A[2] + A[3] (written first)
46  %vecext = extractelement <4 x float> %A, i32 2
47  %vecext1 = extractelement <4 x float> %A, i32 3
48  %add = fadd float %vecext, %vecext1
49  %vecinit = insertelement <4 x float> undef, float %add, i32 1
  ; lane 0 = A[0] + A[1]
50  %vecext2 = extractelement <4 x float> %A, i32 0
51  %vecext3 = extractelement <4 x float> %A, i32 1
52  %add4 = fadd float %vecext2, %vecext3
53  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 0
  ; lane 3 = B[2] + B[3]
54  %vecext6 = extractelement <4 x float> %B, i32 2
55  %vecext7 = extractelement <4 x float> %B, i32 3
56  %add8 = fadd float %vecext6, %vecext7
57  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 3
  ; lane 2 = B[0] + B[1]
58  %vecext10 = extractelement <4 x float> %B, i32 0
59  %vecext11 = extractelement <4 x float> %B, i32 1
60  %add12 = fadd float %vecext10, %vecext11
61  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 2
62  ret <4 x float> %vecinit13
63}
64
; Horizontal subtract of two <4 x float> vectors, spelled out as scalar IR:
;   result = <A[0]-A[1], A[2]-A[3], B[0]-B[1], B[2]-B[3]>
; The assertions below expect the chain to be emitted as one hsubps
; (vhsubps for the AVX runs).
65define <4 x float> @hsub_ps_test1(<4 x float> %A, <4 x float> %B) {
66; SSE-LABEL: hsub_ps_test1:
67; SSE:       # BB#0:
68; SSE-NEXT:    hsubps %xmm1, %xmm0
69; SSE-NEXT:    retq
70;
71; AVX-LABEL: hsub_ps_test1:
72; AVX:       # BB#0:
73; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
74; AVX-NEXT:    retq
  ; lane 0 = A[0] - A[1]
75  %vecext = extractelement <4 x float> %A, i32 0
76  %vecext1 = extractelement <4 x float> %A, i32 1
77  %sub = fsub float %vecext, %vecext1
78  %vecinit = insertelement <4 x float> undef, float %sub, i32 0
  ; lane 1 = A[2] - A[3]
79  %vecext2 = extractelement <4 x float> %A, i32 2
80  %vecext3 = extractelement <4 x float> %A, i32 3
81  %sub4 = fsub float %vecext2, %vecext3
82  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 1
  ; lane 2 = B[0] - B[1]
83  %vecext6 = extractelement <4 x float> %B, i32 0
84  %vecext7 = extractelement <4 x float> %B, i32 1
85  %sub8 = fsub float %vecext6, %vecext7
86  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 2
  ; lane 3 = B[2] - B[3]
87  %vecext10 = extractelement <4 x float> %B, i32 2
88  %vecext11 = extractelement <4 x float> %B, i32 3
89  %sub12 = fsub float %vecext10, %vecext11
90  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 3
91  ret <4 x float> %vecinit13
92}
93
; Horizontal subtract of two <4 x float> vectors where the insertelement
; chain fills the result lanes out of order (1, 0, 3, 2). The final value is
;   <A[0]-A[1], A[2]-A[3], B[0]-B[1], B[2]-B[3]>
; and the assertions below still expect a single hsubps / vhsubps.
94define <4 x float> @hsub_ps_test2(<4 x float> %A, <4 x float> %B) {
95; SSE-LABEL: hsub_ps_test2:
96; SSE:       # BB#0:
97; SSE-NEXT:    hsubps %xmm1, %xmm0
98; SSE-NEXT:    retq
99;
100; AVX-LABEL: hsub_ps_test2:
101; AVX:       # BB#0:
102; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
103; AVX-NEXT:    retq
  ; lane 1 = A[2] - A[3] (written first)
104  %vecext = extractelement <4 x float> %A, i32 2
105  %vecext1 = extractelement <4 x float> %A, i32 3
106  %sub = fsub float %vecext, %vecext1
107  %vecinit = insertelement <4 x float> undef, float %sub, i32 1
  ; lane 0 = A[0] - A[1]
108  %vecext2 = extractelement <4 x float> %A, i32 0
109  %vecext3 = extractelement <4 x float> %A, i32 1
110  %sub4 = fsub float %vecext2, %vecext3
111  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
  ; lane 3 = B[2] - B[3]
112  %vecext6 = extractelement <4 x float> %B, i32 2
113  %vecext7 = extractelement <4 x float> %B, i32 3
114  %sub8 = fsub float %vecext6, %vecext7
115  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
  ; lane 2 = B[0] - B[1]
116  %vecext10 = extractelement <4 x float> %B, i32 0
117  %vecext11 = extractelement <4 x float> %B, i32 1
118  %sub12 = fsub float %vecext10, %vecext11
119  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 3
120  ret <4 x float> %vecinit13
121}
122
; Integer horizontal add of two <4 x i32> vectors, spelled out as scalar IR:
;   result = <A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3]>
; Expected codegen differs per RUN line: the run with only +sse2,+sse3 is
; expected to stay scalarized (movd/pshufd/addl, rebuilt with punpckldq),
; while the +ssse3 run expects a single phaddd and the AVX runs a vphaddd.
123define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
124; SSE3-LABEL: phadd_d_test1:
125; SSE3:       # BB#0:
126; SSE3-NEXT:    movd %xmm0, %eax
127; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
128; SSE3-NEXT:    movd %xmm2, %ecx
129; SSE3-NEXT:    addl %eax, %ecx
130; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
131; SSE3-NEXT:    movd %xmm2, %eax
132; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
133; SSE3-NEXT:    movd %xmm0, %edx
134; SSE3-NEXT:    addl %eax, %edx
135; SSE3-NEXT:    movd %xmm1, %eax
136; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
137; SSE3-NEXT:    movd %xmm0, %esi
138; SSE3-NEXT:    addl %eax, %esi
139; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
140; SSE3-NEXT:    movd %xmm0, %eax
141; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
142; SSE3-NEXT:    movd %xmm0, %edi
143; SSE3-NEXT:    addl %eax, %edi
144; SSE3-NEXT:    movd %edi, %xmm0
145; SSE3-NEXT:    movd %edx, %xmm1
146; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
147; SSE3-NEXT:    movd %esi, %xmm2
148; SSE3-NEXT:    movd %ecx, %xmm0
149; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
150; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
151; SSE3-NEXT:    retq
152;
153; SSSE3-LABEL: phadd_d_test1:
154; SSSE3:       # BB#0:
155; SSSE3-NEXT:    phaddd %xmm1, %xmm0
156; SSSE3-NEXT:    retq
157;
158; AVX-LABEL: phadd_d_test1:
159; AVX:       # BB#0:
160; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
161; AVX-NEXT:    retq
  ; lane 0 = A[0] + A[1]
162  %vecext = extractelement <4 x i32> %A, i32 0
163  %vecext1 = extractelement <4 x i32> %A, i32 1
164  %add = add i32 %vecext, %vecext1
165  %vecinit = insertelement <4 x i32> undef, i32 %add, i32 0
  ; lane 1 = A[2] + A[3]
166  %vecext2 = extractelement <4 x i32> %A, i32 2
167  %vecext3 = extractelement <4 x i32> %A, i32 3
168  %add4 = add i32 %vecext2, %vecext3
169  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 1
  ; lane 2 = B[0] + B[1]
170  %vecext6 = extractelement <4 x i32> %B, i32 0
171  %vecext7 = extractelement <4 x i32> %B, i32 1
172  %add8 = add i32 %vecext6, %vecext7
173  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 2
  ; lane 3 = B[2] + B[3]
174  %vecext10 = extractelement <4 x i32> %B, i32 2
175  %vecext11 = extractelement <4 x i32> %B, i32 3
176  %add12 = add i32 %vecext10, %vecext11
177  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 3
178  ret <4 x i32> %vecinit13
179}
180
; Integer horizontal add of two <4 x i32> vectors with two twists:
;   * the result lanes are written out of order (1, 0, 3, 2), and
;   * for %B the add operands are extracted in swapped order (B[3]+B[2],
;     B[1]+B[0]).
; The final value is <A[0]+A[1], A[2]+A[3], B[1]+B[0], B[3]+B[2]>.
; The +ssse3 run still expects a single phaddd and the AVX runs a vphaddd;
; the run with only +sse2,+sse3 is expected to stay scalarized.
181define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
182; SSE3-LABEL: phadd_d_test2:
183; SSE3:       # BB#0:
184; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
185; SSE3-NEXT:    movd %xmm2, %eax
186; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
187; SSE3-NEXT:    movd %xmm2, %ecx
188; SSE3-NEXT:    addl %eax, %ecx
189; SSE3-NEXT:    movd %xmm0, %eax
190; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
191; SSE3-NEXT:    movd %xmm0, %edx
192; SSE3-NEXT:    addl %eax, %edx
193; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
194; SSE3-NEXT:    movd %xmm0, %eax
195; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
196; SSE3-NEXT:    movd %xmm0, %esi
197; SSE3-NEXT:    addl %eax, %esi
198; SSE3-NEXT:    movd %esi, %xmm0
199; SSE3-NEXT:    movd %ecx, %xmm2
200; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
201; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
202; SSE3-NEXT:    movd %xmm0, %eax
203; SSE3-NEXT:    movd %xmm1, %ecx
204; SSE3-NEXT:    addl %eax, %ecx
205; SSE3-NEXT:    movd %ecx, %xmm1
206; SSE3-NEXT:    movd %edx, %xmm0
207; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
208; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
209; SSE3-NEXT:    retq
210;
211; SSSE3-LABEL: phadd_d_test2:
212; SSSE3:       # BB#0:
213; SSSE3-NEXT:    phaddd %xmm1, %xmm0
214; SSSE3-NEXT:    retq
215;
216; AVX-LABEL: phadd_d_test2:
217; AVX:       # BB#0:
218; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
219; AVX-NEXT:    retq
  ; lane 1 = A[2] + A[3] (written first)
220  %vecext = extractelement <4 x i32> %A, i32 2
221  %vecext1 = extractelement <4 x i32> %A, i32 3
222  %add = add i32 %vecext, %vecext1
223  %vecinit = insertelement <4 x i32> undef, i32 %add, i32 1
  ; lane 0 = A[0] + A[1]
224  %vecext2 = extractelement <4 x i32> %A, i32 0
225  %vecext3 = extractelement <4 x i32> %A, i32 1
226  %add4 = add i32 %vecext2, %vecext3
227  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 0
  ; lane 3 = B[3] + B[2] (operands in swapped order)
228  %vecext6 = extractelement <4 x i32> %B, i32 3
229  %vecext7 = extractelement <4 x i32> %B, i32 2
230  %add8 = add i32 %vecext6, %vecext7
231  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 3
  ; lane 2 = B[1] + B[0] (operands in swapped order)
232  %vecext10 = extractelement <4 x i32> %B, i32 1
233  %vecext11 = extractelement <4 x i32> %B, i32 0
234  %add12 = add i32 %vecext10, %vecext11
235  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 2
236  ret <4 x i32> %vecinit13
237}
238
; Integer horizontal subtract of two <4 x i32> vectors, spelled out as
; scalar IR:
;   result = <A[0]-A[1], A[2]-A[3], B[0]-B[1], B[2]-B[3]>
; Expected codegen differs per RUN line: the run with only +sse2,+sse3 is
; expected to stay scalarized (movd/pshufd/subl, rebuilt with punpckldq),
; while the +ssse3 run expects a single phsubd and the AVX runs a vphsubd.
239define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
240; SSE3-LABEL: phsub_d_test1:
241; SSE3:       # BB#0:
242; SSE3-NEXT:    movd %xmm0, %eax
243; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
244; SSE3-NEXT:    movd %xmm2, %ecx
245; SSE3-NEXT:    subl %ecx, %eax
246; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
247; SSE3-NEXT:    movd %xmm2, %ecx
248; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
249; SSE3-NEXT:    movd %xmm0, %edx
250; SSE3-NEXT:    subl %edx, %ecx
251; SSE3-NEXT:    movd %xmm1, %edx
252; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
253; SSE3-NEXT:    movd %xmm0, %esi
254; SSE3-NEXT:    subl %esi, %edx
255; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
256; SSE3-NEXT:    movd %xmm0, %esi
257; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
258; SSE3-NEXT:    movd %xmm0, %edi
259; SSE3-NEXT:    subl %edi, %esi
260; SSE3-NEXT:    movd %esi, %xmm0
261; SSE3-NEXT:    movd %ecx, %xmm1
262; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
263; SSE3-NEXT:    movd %edx, %xmm2
264; SSE3-NEXT:    movd %eax, %xmm0
265; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
266; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
267; SSE3-NEXT:    retq
268;
269; SSSE3-LABEL: phsub_d_test1:
270; SSSE3:       # BB#0:
271; SSSE3-NEXT:    phsubd %xmm1, %xmm0
272; SSSE3-NEXT:    retq
273;
274; AVX-LABEL: phsub_d_test1:
275; AVX:       # BB#0:
276; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
277; AVX-NEXT:    retq
  ; lane 0 = A[0] - A[1]
278  %vecext = extractelement <4 x i32> %A, i32 0
279  %vecext1 = extractelement <4 x i32> %A, i32 1
280  %sub = sub i32 %vecext, %vecext1
281  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
  ; lane 1 = A[2] - A[3]
282  %vecext2 = extractelement <4 x i32> %A, i32 2
283  %vecext3 = extractelement <4 x i32> %A, i32 3
284  %sub4 = sub i32 %vecext2, %vecext3
285  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
  ; lane 2 = B[0] - B[1]
286  %vecext6 = extractelement <4 x i32> %B, i32 0
287  %vecext7 = extractelement <4 x i32> %B, i32 1
288  %sub8 = sub i32 %vecext6, %vecext7
289  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
  ; lane 3 = B[2] - B[3]
290  %vecext10 = extractelement <4 x i32> %B, i32 2
291  %vecext11 = extractelement <4 x i32> %B, i32 3
292  %sub12 = sub i32 %vecext10, %vecext11
293  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
294  ret <4 x i32> %vecinit13
295}
296
; Integer horizontal subtract of two <4 x i32> vectors where the
; insertelement chain fills the result lanes out of order (1, 0, 3, 2).
; The final value is <A[0]-A[1], A[2]-A[3], B[0]-B[1], B[2]-B[3]>.
; The +ssse3 run still expects a single phsubd and the AVX runs a vphsubd;
; the run with only +sse2,+sse3 is expected to stay scalarized.
297define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
298; SSE3-LABEL: phsub_d_test2:
299; SSE3:       # BB#0:
300; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
301; SSE3-NEXT:    movd %xmm2, %eax
302; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
303; SSE3-NEXT:    movd %xmm2, %ecx
304; SSE3-NEXT:    subl %ecx, %eax
305; SSE3-NEXT:    movd %xmm0, %ecx
306; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
307; SSE3-NEXT:    movd %xmm0, %edx
308; SSE3-NEXT:    subl %edx, %ecx
309; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
310; SSE3-NEXT:    movd %xmm0, %edx
311; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
312; SSE3-NEXT:    movd %xmm0, %esi
313; SSE3-NEXT:    subl %esi, %edx
314; SSE3-NEXT:    movd %edx, %xmm0
315; SSE3-NEXT:    movd %eax, %xmm2
316; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
317; SSE3-NEXT:    movd %xmm1, %eax
318; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
319; SSE3-NEXT:    movd %xmm0, %edx
320; SSE3-NEXT:    subl %edx, %eax
321; SSE3-NEXT:    movd %eax, %xmm1
322; SSE3-NEXT:    movd %ecx, %xmm0
323; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
324; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
325; SSE3-NEXT:    retq
326;
327; SSSE3-LABEL: phsub_d_test2:
328; SSSE3:       # BB#0:
329; SSSE3-NEXT:    phsubd %xmm1, %xmm0
330; SSSE3-NEXT:    retq
331;
332; AVX-LABEL: phsub_d_test2:
333; AVX:       # BB#0:
334; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
335; AVX-NEXT:    retq
  ; lane 1 = A[2] - A[3] (written first)
336  %vecext = extractelement <4 x i32> %A, i32 2
337  %vecext1 = extractelement <4 x i32> %A, i32 3
338  %sub = sub i32 %vecext, %vecext1
339  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 1
  ; lane 0 = A[0] - A[1]
340  %vecext2 = extractelement <4 x i32> %A, i32 0
341  %vecext3 = extractelement <4 x i32> %A, i32 1
342  %sub4 = sub i32 %vecext2, %vecext3
343  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 0
  ; lane 3 = B[2] - B[3]
344  %vecext6 = extractelement <4 x i32> %B, i32 2
345  %vecext7 = extractelement <4 x i32> %B, i32 3
346  %sub8 = sub i32 %vecext6, %vecext7
347  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 3
  ; lane 2 = B[0] - B[1]
348  %vecext10 = extractelement <4 x i32> %B, i32 0
349  %vecext11 = extractelement <4 x i32> %B, i32 1
350  %sub12 = sub i32 %vecext10, %vecext11
351  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 2
352  ret <4 x i32> %vecinit13
353}
354
; Horizontal add of two <2 x double> vectors, spelled out as scalar IR:
;   result = <A[0]+A[1], B[0]+B[1]>
; The assertions below expect a single haddpd (vhaddpd for the AVX runs).
355define <2 x double> @hadd_pd_test1(<2 x double> %A, <2 x double> %B) {
356; SSE-LABEL: hadd_pd_test1:
357; SSE:       # BB#0:
358; SSE-NEXT:    haddpd %xmm1, %xmm0
359; SSE-NEXT:    retq
360;
361; AVX-LABEL: hadd_pd_test1:
362; AVX:       # BB#0:
363; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
364; AVX-NEXT:    retq
  ; lane 0 = A[0] + A[1]
365  %vecext = extractelement <2 x double> %A, i32 0
366  %vecext1 = extractelement <2 x double> %A, i32 1
367  %add = fadd double %vecext, %vecext1
368  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  ; lane 1 = B[0] + B[1]
369  %vecext2 = extractelement <2 x double> %B, i32 0
370  %vecext3 = extractelement <2 x double> %B, i32 1
371  %add2 = fadd double %vecext2, %vecext3
372  %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
373  ret <2 x double> %vecinit2
374}
375
; Horizontal add of two <2 x double> vectors with the fadd operands taken in
; swapped order (element 1 first, then element 0):
;   result = <A[1]+A[0], B[1]+B[0]>
; The assertions below still expect a single haddpd (vhaddpd for the AVX
; runs) despite the swapped operand order.
376define <2 x double> @hadd_pd_test2(<2 x double> %A, <2 x double> %B) {
377; SSE-LABEL: hadd_pd_test2:
378; SSE:       # BB#0:
379; SSE-NEXT:    haddpd %xmm1, %xmm0
380; SSE-NEXT:    retq
381;
382; AVX-LABEL: hadd_pd_test2:
383; AVX:       # BB#0:
384; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
385; AVX-NEXT:    retq
  ; lane 0 = A[1] + A[0]
386  %vecext = extractelement <2 x double> %A, i32 1
387  %vecext1 = extractelement <2 x double> %A, i32 0
388  %add = fadd double %vecext, %vecext1
389  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  ; lane 1 = B[1] + B[0]
390  %vecext2 = extractelement <2 x double> %B, i32 1
391  %vecext3 = extractelement <2 x double> %B, i32 0
392  %add2 = fadd double %vecext2, %vecext3
393  %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
394  ret <2 x double> %vecinit2
395}
396
; Horizontal subtract of two <2 x double> vectors, spelled out as scalar IR:
;   result = <A[0]-A[1], B[0]-B[1]>
; The assertions below expect a single hsubpd (vhsubpd for the AVX runs).
397define <2 x double> @hsub_pd_test1(<2 x double> %A, <2 x double> %B) {
398; SSE-LABEL: hsub_pd_test1:
399; SSE:       # BB#0:
400; SSE-NEXT:    hsubpd %xmm1, %xmm0
401; SSE-NEXT:    retq
402;
403; AVX-LABEL: hsub_pd_test1:
404; AVX:       # BB#0:
405; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
406; AVX-NEXT:    retq
  ; lane 0 = A[0] - A[1]
407  %vecext = extractelement <2 x double> %A, i32 0
408  %vecext1 = extractelement <2 x double> %A, i32 1
409  %sub = fsub double %vecext, %vecext1
410  %vecinit = insertelement <2 x double> undef, double %sub, i32 0
  ; lane 1 = B[0] - B[1]
411  %vecext2 = extractelement <2 x double> %B, i32 0
412  %vecext3 = extractelement <2 x double> %B, i32 1
413  %sub2 = fsub double %vecext2, %vecext3
414  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 1
415  ret <2 x double> %vecinit2
416}
417
; Horizontal subtract of two <2 x double> vectors where the %B difference is
; computed and inserted (into lane 1) before the %A difference (lane 0):
;   result = <A[0]-A[1], B[0]-B[1]>
; The assertions below still expect a single hsubpd (vhsubpd for the AVX
; runs) with %A as the first source.
418define <2 x double> @hsub_pd_test2(<2 x double> %A, <2 x double> %B) {
419; SSE-LABEL: hsub_pd_test2:
420; SSE:       # BB#0:
421; SSE-NEXT:    hsubpd %xmm1, %xmm0
422; SSE-NEXT:    retq
423;
424; AVX-LABEL: hsub_pd_test2:
425; AVX:       # BB#0:
426; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
427; AVX-NEXT:    retq
  ; lane 1 = B[0] - B[1] (written first)
428  %vecext = extractelement <2 x double> %B, i32 0
429  %vecext1 = extractelement <2 x double> %B, i32 1
430  %sub = fsub double %vecext, %vecext1
431  %vecinit = insertelement <2 x double> undef, double %sub, i32 1
  ; lane 0 = A[0] - A[1]
432  %vecext2 = extractelement <2 x double> %A, i32 0
433  %vecext3 = extractelement <2 x double> %A, i32 1
434  %sub2 = fsub double %vecext2, %vecext3
435  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
436  ret <2 x double> %vecinit2
437}
438
; Horizontal add over 256-bit <4 x double> inputs, spelled out as scalar IR:
;   result = <A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3]>
; Expected codegen per the assertions below:
;   * SSE runs: two haddpd instructions on the 128-bit register halves,
;     followed by a movapd to place the upper half of the result.
;   * AVX runs: each input is split with vextractf128, reduced with a
;     128-bit vhaddpd, and the halves are recombined with vinsertf128.
439define <4 x double> @avx_vhadd_pd_test(<4 x double> %A, <4 x double> %B) {
440; SSE-LABEL: avx_vhadd_pd_test:
441; SSE:       # BB#0:
442; SSE-NEXT:    haddpd %xmm1, %xmm0
443; SSE-NEXT:    haddpd %xmm3, %xmm2
444; SSE-NEXT:    movapd %xmm2, %xmm1
445; SSE-NEXT:    retq
446;
447; AVX-LABEL: avx_vhadd_pd_test:
448; AVX:       # BB#0:
449; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
450; AVX-NEXT:    vhaddpd %xmm2, %xmm1, %xmm1
451; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
452; AVX-NEXT:    vhaddpd %xmm2, %xmm0, %xmm0
453; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
454; AVX-NEXT:    retq
  ; lane 0 = A[0] + A[1]
455  %vecext = extractelement <4 x double> %A, i32 0
456  %vecext1 = extractelement <4 x double> %A, i32 1
457  %add = fadd double %vecext, %vecext1
458  %vecinit = insertelement <4 x double> undef, double %add, i32 0
  ; lane 1 = A[2] + A[3]
459  %vecext2 = extractelement <4 x double> %A, i32 2
460  %vecext3 = extractelement <4 x double> %A, i32 3
461  %add4 = fadd double %vecext2, %vecext3
462  %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
  ; lane 2 = B[0] + B[1]
463  %vecext6 = extractelement <4 x double> %B, i32 0
464  %vecext7 = extractelement <4 x double> %B, i32 1
465  %add8 = fadd double %vecext6, %vecext7
466  %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
  ; lane 3 = B[2] + B[3]
467  %vecext10 = extractelement <4 x double> %B, i32 2
468  %vecext11 = extractelement <4 x double> %B, i32 3
469  %add12 = fadd double %vecext10, %vecext11
470  %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
471  ret <4 x double> %vecinit13
472}
473
; Horizontal subtract over 256-bit <4 x double> inputs, spelled out as
; scalar IR:
;   result = <A[0]-A[1], A[2]-A[3], B[0]-B[1], B[2]-B[3]>
; Expected codegen per the assertions below:
;   * SSE runs: two hsubpd instructions on the 128-bit register halves,
;     followed by a movapd to place the upper half of the result.
;   * AVX runs: each input is split with vextractf128, reduced with a
;     128-bit vhsubpd, and the halves are recombined with vinsertf128.
474define <4 x double> @avx_vhsub_pd_test(<4 x double> %A, <4 x double> %B) {
475; SSE-LABEL: avx_vhsub_pd_test:
476; SSE:       # BB#0:
477; SSE-NEXT:    hsubpd %xmm1, %xmm0
478; SSE-NEXT:    hsubpd %xmm3, %xmm2
479; SSE-NEXT:    movapd %xmm2, %xmm1
480; SSE-NEXT:    retq
481;
482; AVX-LABEL: avx_vhsub_pd_test:
483; AVX:       # BB#0:
484; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
485; AVX-NEXT:    vhsubpd %xmm2, %xmm1, %xmm1
486; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
487; AVX-NEXT:    vhsubpd %xmm2, %xmm0, %xmm0
488; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
489; AVX-NEXT:    retq
  ; lane 0 = A[0] - A[1]
490  %vecext = extractelement <4 x double> %A, i32 0
491  %vecext1 = extractelement <4 x double> %A, i32 1
492  %sub = fsub double %vecext, %vecext1
493  %vecinit = insertelement <4 x double> undef, double %sub, i32 0
  ; lane 1 = A[2] - A[3]
494  %vecext2 = extractelement <4 x double> %A, i32 2
495  %vecext3 = extractelement <4 x double> %A, i32 3
496  %sub4 = fsub double %vecext2, %vecext3
497  %vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
  ; lane 2 = B[0] - B[1]
498  %vecext6 = extractelement <4 x double> %B, i32 0
499  %vecext7 = extractelement <4 x double> %B, i32 1
500  %sub8 = fsub double %vecext6, %vecext7
501  %vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
  ; lane 3 = B[2] - B[3]
502  %vecext10 = extractelement <4 x double> %B, i32 2
503  %vecext11 = extractelement <4 x double> %B, i32 3
504  %sub12 = fsub double %vecext10, %vecext11
505  %vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
506  ret <4 x double> %vecinit13
507}
508
; Integer horizontal add over 256-bit <8 x i32> inputs, spelled out as
; scalar IR:
;   result = <A[0]+A[1], A[2]+A[3], A[4]+A[5], A[6]+A[7],
;             B[0]+B[1], B[2]+B[3], B[4]+B[5], B[6]+B[7]>
; Expected codegen per the assertions below:
;   * +sse2,+sse3 run: fully scalarized (movd/pshufd/addl, rebuilt with
;     punpckldq).
;   * +ssse3 run: two phaddd instructions plus a movdqa.
;   * +avx run: vextractf128 / 128-bit vphaddd / vinsertf128.
;   * +avx2 run: same shape, using vextracti128 / vinserti128 instead.
509define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
510; SSE3-LABEL: avx2_vphadd_d_test:
511; SSE3:       # BB#0:
512; SSE3-NEXT:    movd %xmm0, %ecx
513; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
514; SSE3-NEXT:    movd %xmm4, %r8d
515; SSE3-NEXT:    addl %ecx, %r8d
516; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
517; SSE3-NEXT:    movd %xmm4, %edx
518; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
519; SSE3-NEXT:    movd %xmm0, %r9d
520; SSE3-NEXT:    addl %edx, %r9d
521; SSE3-NEXT:    movd %xmm1, %esi
522; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
523; SSE3-NEXT:    movd %xmm0, %r10d
524; SSE3-NEXT:    addl %esi, %r10d
525; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
526; SSE3-NEXT:    movd %xmm0, %esi
527; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
528; SSE3-NEXT:    movd %xmm0, %edi
529; SSE3-NEXT:    addl %esi, %edi
530; SSE3-NEXT:    movd %xmm2, %eax
531; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
532; SSE3-NEXT:    movd %xmm0, %r11d
533; SSE3-NEXT:    addl %eax, %r11d
534; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
535; SSE3-NEXT:    movd %xmm0, %eax
536; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
537; SSE3-NEXT:    movd %xmm0, %ecx
538; SSE3-NEXT:    addl %eax, %ecx
539; SSE3-NEXT:    movd %xmm3, %eax
540; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
541; SSE3-NEXT:    movd %xmm0, %edx
542; SSE3-NEXT:    addl %eax, %edx
543; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
544; SSE3-NEXT:    movd %xmm0, %eax
545; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
546; SSE3-NEXT:    movd %xmm0, %esi
547; SSE3-NEXT:    addl %eax, %esi
548; SSE3-NEXT:    movd %edi, %xmm0
549; SSE3-NEXT:    movd %r9d, %xmm1
550; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
551; SSE3-NEXT:    movd %r10d, %xmm2
552; SSE3-NEXT:    movd %r8d, %xmm0
553; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
554; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
555; SSE3-NEXT:    movd %esi, %xmm1
556; SSE3-NEXT:    movd %ecx, %xmm2
557; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
558; SSE3-NEXT:    movd %edx, %xmm3
559; SSE3-NEXT:    movd %r11d, %xmm1
560; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
561; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
562; SSE3-NEXT:    retq
563;
564; SSSE3-LABEL: avx2_vphadd_d_test:
565; SSSE3:       # BB#0:
566; SSSE3-NEXT:    phaddd %xmm1, %xmm0
567; SSSE3-NEXT:    phaddd %xmm3, %xmm2
568; SSSE3-NEXT:    movdqa %xmm2, %xmm1
569; SSSE3-NEXT:    retq
570;
571; AVX1-LABEL: avx2_vphadd_d_test:
572; AVX1:       # BB#0:
573; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
574; AVX1-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
575; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
576; AVX1-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
577; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
578; AVX1-NEXT:    retq
579;
580; AVX2-LABEL: avx2_vphadd_d_test:
581; AVX2:       # BB#0:
582; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
583; AVX2-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
584; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
585; AVX2-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
586; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
587; AVX2-NEXT:    retq
  ; lanes 0-3 hold the pairwise sums of %A
588  %vecext = extractelement <8 x i32> %A, i32 0
589  %vecext1 = extractelement <8 x i32> %A, i32 1
590  %add = add i32 %vecext, %vecext1
591  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
592  %vecext2 = extractelement <8 x i32> %A, i32 2
593  %vecext3 = extractelement <8 x i32> %A, i32 3
594  %add4 = add i32 %vecext2, %vecext3
595  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
596  %vecext6 = extractelement <8 x i32> %A, i32 4
597  %vecext7 = extractelement <8 x i32> %A, i32 5
598  %add8 = add i32 %vecext6, %vecext7
599  %vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
600  %vecext10 = extractelement <8 x i32> %A, i32 6
601  %vecext11 = extractelement <8 x i32> %A, i32 7
602  %add12 = add i32 %vecext10, %vecext11
603  %vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
  ; lanes 4-7 hold the pairwise sums of %B
604  %vecext14 = extractelement <8 x i32> %B, i32 0
605  %vecext15 = extractelement <8 x i32> %B, i32 1
606  %add16 = add i32 %vecext14, %vecext15
607  %vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
608  %vecext18 = extractelement <8 x i32> %B, i32 2
609  %vecext19 = extractelement <8 x i32> %B, i32 3
610  %add20 = add i32 %vecext18, %vecext19
611  %vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
612  %vecext22 = extractelement <8 x i32> %B, i32 4
613  %vecext23 = extractelement <8 x i32> %B, i32 5
614  %add24 = add i32 %vecext22, %vecext23
615  %vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
616  %vecext26 = extractelement <8 x i32> %B, i32 6
617  %vecext27 = extractelement <8 x i32> %B, i32 7
618  %add28 = add i32 %vecext26, %vecext27
619  %vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
620  ret <8 x i32> %vecinit29
621}
622
623define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) {
624; SSE3-LABEL: avx2_vphadd_w_test:
625; SSE3:       # BB#0:
626; SSE3-NEXT:    pushq %rbp
627; SSE3-NEXT:  .Ltmp0:
628; SSE3-NEXT:    .cfi_def_cfa_offset 16
629; SSE3-NEXT:    pushq %r15
630; SSE3-NEXT:  .Ltmp1:
631; SSE3-NEXT:    .cfi_def_cfa_offset 24
632; SSE3-NEXT:    pushq %r14
633; SSE3-NEXT:  .Ltmp2:
634; SSE3-NEXT:    .cfi_def_cfa_offset 32
635; SSE3-NEXT:    pushq %r13
636; SSE3-NEXT:  .Ltmp3:
637; SSE3-NEXT:    .cfi_def_cfa_offset 40
638; SSE3-NEXT:    pushq %r12
639; SSE3-NEXT:  .Ltmp4:
640; SSE3-NEXT:    .cfi_def_cfa_offset 48
641; SSE3-NEXT:    pushq %rbx
642; SSE3-NEXT:  .Ltmp5:
643; SSE3-NEXT:    .cfi_def_cfa_offset 56
644; SSE3-NEXT:  .Ltmp6:
645; SSE3-NEXT:    .cfi_offset %rbx, -56
646; SSE3-NEXT:  .Ltmp7:
647; SSE3-NEXT:    .cfi_offset %r12, -48
648; SSE3-NEXT:  .Ltmp8:
649; SSE3-NEXT:    .cfi_offset %r13, -40
650; SSE3-NEXT:  .Ltmp9:
651; SSE3-NEXT:    .cfi_offset %r14, -32
652; SSE3-NEXT:  .Ltmp10:
653; SSE3-NEXT:    .cfi_offset %r15, -24
654; SSE3-NEXT:  .Ltmp11:
655; SSE3-NEXT:    .cfi_offset %rbp, -16
656; SSE3-NEXT:    movd %xmm0, %eax
657; SSE3-NEXT:    pextrw $1, %xmm0, %ecx
658; SSE3-NEXT:    addl %eax, %ecx
659; SSE3-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
660; SSE3-NEXT:    pextrw $2, %xmm0, %eax
661; SSE3-NEXT:    pextrw $3, %xmm0, %r11d
662; SSE3-NEXT:    addl %eax, %r11d
663; SSE3-NEXT:    pextrw $4, %xmm0, %eax
664; SSE3-NEXT:    pextrw $5, %xmm0, %r10d
665; SSE3-NEXT:    addl %eax, %r10d
666; SSE3-NEXT:    pextrw $6, %xmm0, %eax
667; SSE3-NEXT:    pextrw $7, %xmm0, %r13d
668; SSE3-NEXT:    addl %eax, %r13d
669; SSE3-NEXT:    movd %xmm1, %eax
670; SSE3-NEXT:    pextrw $1, %xmm1, %r14d
671; SSE3-NEXT:    addl %eax, %r14d
672; SSE3-NEXT:    pextrw $2, %xmm1, %eax
673; SSE3-NEXT:    pextrw $3, %xmm1, %ebp
674; SSE3-NEXT:    addl %eax, %ebp
675; SSE3-NEXT:    pextrw $4, %xmm1, %eax
676; SSE3-NEXT:    pextrw $5, %xmm1, %ebx
677; SSE3-NEXT:    addl %eax, %ebx
678; SSE3-NEXT:    pextrw $6, %xmm1, %eax
679; SSE3-NEXT:    pextrw $7, %xmm1, %edx
680; SSE3-NEXT:    addl %eax, %edx
681; SSE3-NEXT:    movd %xmm2, %eax
682; SSE3-NEXT:    pextrw $1, %xmm2, %ecx
683; SSE3-NEXT:    addl %eax, %ecx
684; SSE3-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
685; SSE3-NEXT:    pextrw $2, %xmm2, %eax
686; SSE3-NEXT:    pextrw $3, %xmm2, %r12d
687; SSE3-NEXT:    addl %eax, %r12d
688; SSE3-NEXT:    pextrw $4, %xmm2, %eax
689; SSE3-NEXT:    pextrw $5, %xmm2, %r15d
690; SSE3-NEXT:    addl %eax, %r15d
691; SSE3-NEXT:    pextrw $6, %xmm2, %eax
692; SSE3-NEXT:    pextrw $7, %xmm2, %r8d
693; SSE3-NEXT:    addl %eax, %r8d
694; SSE3-NEXT:    movd %xmm3, %eax
695; SSE3-NEXT:    pextrw $1, %xmm3, %r9d
696; SSE3-NEXT:    addl %eax, %r9d
697; SSE3-NEXT:    pextrw $2, %xmm3, %eax
698; SSE3-NEXT:    pextrw $3, %xmm3, %esi
699; SSE3-NEXT:    addl %eax, %esi
700; SSE3-NEXT:    pextrw $4, %xmm3, %eax
701; SSE3-NEXT:    pextrw $5, %xmm3, %edi
702; SSE3-NEXT:    addl %eax, %edi
703; SSE3-NEXT:    pextrw $6, %xmm3, %ecx
704; SSE3-NEXT:    pextrw $7, %xmm3, %eax
705; SSE3-NEXT:    addl %ecx, %eax
706; SSE3-NEXT:    movd %edx, %xmm8
707; SSE3-NEXT:    movd %r13d, %xmm3
708; SSE3-NEXT:    movd %ebp, %xmm9
709; SSE3-NEXT:    movd %r11d, %xmm4
710; SSE3-NEXT:    movd %ebx, %xmm10
711; SSE3-NEXT:    movd %r10d, %xmm7
712; SSE3-NEXT:    movd %r14d, %xmm11
713; SSE3-NEXT:    movd -{{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload
714; SSE3-NEXT:    # xmm0 = mem[0],zero,zero,zero
715; SSE3-NEXT:    movd %eax, %xmm12
716; SSE3-NEXT:    movd %r8d, %xmm6
717; SSE3-NEXT:    movd %esi, %xmm13
718; SSE3-NEXT:    movd %r12d, %xmm5
719; SSE3-NEXT:    movd %edi, %xmm14
720; SSE3-NEXT:    movd %r15d, %xmm2
721; SSE3-NEXT:    movd %r9d, %xmm15
722; SSE3-NEXT:    movd -{{[0-9]+}}(%rsp), %xmm1 # 4-byte Folded Reload
723; SSE3-NEXT:    # xmm1 = mem[0],zero,zero,zero
724; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
725; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
726; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
727; SSE3-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
728; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
729; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
730; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
731; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
732; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
733; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
734; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
735; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
736; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
737; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
738; SSE3-NEXT:    popq %rbx
739; SSE3-NEXT:    popq %r12
740; SSE3-NEXT:    popq %r13
741; SSE3-NEXT:    popq %r14
742; SSE3-NEXT:    popq %r15
743; SSE3-NEXT:    popq %rbp
744; SSE3-NEXT:    retq
745;
746; SSSE3-LABEL: avx2_vphadd_w_test:
747; SSSE3:       # BB#0:
748; SSSE3-NEXT:    phaddw %xmm1, %xmm0
749; SSSE3-NEXT:    phaddw %xmm3, %xmm2
750; SSSE3-NEXT:    movdqa %xmm2, %xmm1
751; SSSE3-NEXT:    retq
752;
753; AVX1-LABEL: avx2_vphadd_w_test:
754; AVX1:       # BB#0:
755; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
756; AVX1-NEXT:    vphaddw %xmm2, %xmm1, %xmm1
757; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
758; AVX1-NEXT:    vphaddw %xmm2, %xmm0, %xmm0
759; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
760; AVX1-NEXT:    retq
761;
762; AVX2-LABEL: avx2_vphadd_w_test:
763; AVX2:       # BB#0:
764; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
765; AVX2-NEXT:    vphaddw %xmm2, %xmm1, %xmm1
766; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
767; AVX2-NEXT:    vphaddw %xmm2, %xmm0, %xmm0
768; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
769; AVX2-NEXT:    retq
770  %vecext = extractelement <16 x i16> %a, i32 0
771  %vecext1 = extractelement <16 x i16> %a, i32 1
772  %add = add i16 %vecext, %vecext1
773  %vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
774  %vecext4 = extractelement <16 x i16> %a, i32 2
775  %vecext6 = extractelement <16 x i16> %a, i32 3
776  %add8 = add i16 %vecext4, %vecext6
777  %vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
778  %vecext11 = extractelement <16 x i16> %a, i32 4
779  %vecext13 = extractelement <16 x i16> %a, i32 5
780  %add15 = add i16 %vecext11, %vecext13
781  %vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
782  %vecext18 = extractelement <16 x i16> %a, i32 6
783  %vecext20 = extractelement <16 x i16> %a, i32 7
784  %add22 = add i16 %vecext18, %vecext20
785  %vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
786  %vecext25 = extractelement <16 x i16> %a, i32 8
787  %vecext27 = extractelement <16 x i16> %a, i32 9
788  %add29 = add i16 %vecext25, %vecext27
789  %vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 4
790  %vecext32 = extractelement <16 x i16> %a, i32 10
791  %vecext34 = extractelement <16 x i16> %a, i32 11
792  %add36 = add i16 %vecext32, %vecext34
793  %vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 5
794  %vecext39 = extractelement <16 x i16> %a, i32 12
795  %vecext41 = extractelement <16 x i16> %a, i32 13
796  %add43 = add i16 %vecext39, %vecext41
797  %vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 6
798  %vecext46 = extractelement <16 x i16> %a, i32 14
799  %vecext48 = extractelement <16 x i16> %a, i32 15
800  %add50 = add i16 %vecext46, %vecext48
801  %vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 7
802  %vecext53 = extractelement <16 x i16> %b, i32 0
803  %vecext55 = extractelement <16 x i16> %b, i32 1
804  %add57 = add i16 %vecext53, %vecext55
805  %vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 8
806  %vecext60 = extractelement <16 x i16> %b, i32 2
807  %vecext62 = extractelement <16 x i16> %b, i32 3
808  %add64 = add i16 %vecext60, %vecext62
809  %vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 9
810  %vecext67 = extractelement <16 x i16> %b, i32 4
811  %vecext69 = extractelement <16 x i16> %b, i32 5
812  %add71 = add i16 %vecext67, %vecext69
813  %vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 10
814  %vecext74 = extractelement <16 x i16> %b, i32 6
815  %vecext76 = extractelement <16 x i16> %b, i32 7
816  %add78 = add i16 %vecext74, %vecext76
817  %vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 11
818  %vecext81 = extractelement <16 x i16> %b, i32 8
819  %vecext83 = extractelement <16 x i16> %b, i32 9
820  %add85 = add i16 %vecext81, %vecext83
821  %vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
822  %vecext88 = extractelement <16 x i16> %b, i32 10
823  %vecext90 = extractelement <16 x i16> %b, i32 11
824  %add92 = add i16 %vecext88, %vecext90
825  %vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
826  %vecext95 = extractelement <16 x i16> %b, i32 12
827  %vecext97 = extractelement <16 x i16> %b, i32 13
828  %add99 = add i16 %vecext95, %vecext97
829  %vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
830  %vecext102 = extractelement <16 x i16> %b, i32 14
831  %vecext104 = extractelement <16 x i16> %b, i32 15
832  %add106 = add i16 %vecext102, %vecext104
833  %vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
834  ret <16 x i16> %vecinit108
835}
836
837; Verify that we don't select horizontal subs in the following functions.
838
; not_a_hsub_1 - negative test on <4 x i32>.
; Result elements 0 and 1 are A[0]-A[1] and A[2]-A[3], but elements 2 and 3
; are B[1]-B[0] and B[3]-B[2], i.e. the B pairs are subtracted in swapped
; order. The expected code below is scalar subl plus re-insertion on every
; checked configuration; no phsubd appears.
839define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
840; SSE-LABEL: not_a_hsub_1:
841; SSE:       # BB#0:
842; SSE-NEXT:    movd %xmm0, %eax
843; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
844; SSE-NEXT:    movd %xmm2, %ecx
845; SSE-NEXT:    subl %ecx, %eax
846; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
847; SSE-NEXT:    movd %xmm2, %ecx
848; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
849; SSE-NEXT:    movd %xmm0, %edx
850; SSE-NEXT:    subl %edx, %ecx
851; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
852; SSE-NEXT:    movd %xmm0, %edx
853; SSE-NEXT:    movd %xmm1, %esi
854; SSE-NEXT:    subl %esi, %edx
855; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
856; SSE-NEXT:    movd %xmm0, %esi
857; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
858; SSE-NEXT:    movd %xmm0, %edi
859; SSE-NEXT:    subl %edi, %esi
860; SSE-NEXT:    movd %esi, %xmm0
861; SSE-NEXT:    movd %ecx, %xmm1
862; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
863; SSE-NEXT:    movd %edx, %xmm2
864; SSE-NEXT:    movd %eax, %xmm0
865; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
866; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
867; SSE-NEXT:    retq
868;
869; AVX-LABEL: not_a_hsub_1:
870; AVX:       # BB#0:
871; AVX-NEXT:    vmovd %xmm0, %eax
872; AVX-NEXT:    vpextrd $1, %xmm0, %ecx
873; AVX-NEXT:    subl %ecx, %eax
874; AVX-NEXT:    vpextrd $2, %xmm0, %ecx
875; AVX-NEXT:    vpextrd $3, %xmm0, %edx
876; AVX-NEXT:    subl %edx, %ecx
877; AVX-NEXT:    vpextrd $1, %xmm1, %edx
878; AVX-NEXT:    vmovd %xmm1, %esi
879; AVX-NEXT:    subl %esi, %edx
880; AVX-NEXT:    vpextrd $3, %xmm1, %esi
881; AVX-NEXT:    vpextrd $2, %xmm1, %edi
882; AVX-NEXT:    subl %edi, %esi
883; AVX-NEXT:    vmovd %eax, %xmm0
884; AVX-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
885; AVX-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
886; AVX-NEXT:    vpinsrd $3, %esi, %xmm0, %xmm0
887; AVX-NEXT:    retq
  ; Pairs taken from %A subtract the odd element from the even one.
888  %vecext = extractelement <4 x i32> %A, i32 0
889  %vecext1 = extractelement <4 x i32> %A, i32 1
890  %sub = sub i32 %vecext, %vecext1
891  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
892  %vecext2 = extractelement <4 x i32> %A, i32 2
893  %vecext3 = extractelement <4 x i32> %A, i32 3
894  %sub4 = sub i32 %vecext2, %vecext3
895  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
  ; Pairs taken from %B are reversed (odd element minus even element).
896  %vecext6 = extractelement <4 x i32> %B, i32 1
897  %vecext7 = extractelement <4 x i32> %B, i32 0
898  %sub8 = sub i32 %vecext6, %vecext7
899  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
900  %vecext10 = extractelement <4 x i32> %B, i32 3
901  %vecext11 = extractelement <4 x i32> %B, i32 2
902  %sub12 = sub i32 %vecext10, %vecext11
903  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
904  ret <4 x i32> %vecinit13
905}
906
; not_a_hsub_2 - negative test on <4 x float>.
; The result is built out of index order: element 1 = A[2]-A[3],
; element 0 = A[0]-A[1], element 3 = B[3]-B[2], element 2 = B[0]-B[1].
; Only the element-3 pair has its operands swapped. The expected code below
; uses scalar subss with shuffles/inserts; no hsubps appears.
907define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
908; SSE-LABEL: not_a_hsub_2:
909; SSE:       # BB#0:
910; SSE-NEXT:    movapd %xmm0, %xmm2
911; SSE-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[1,0]
912; SSE-NEXT:    movapd %xmm0, %xmm3
913; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
914; SSE-NEXT:    subss %xmm3, %xmm2
915; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
916; SSE-NEXT:    subss %xmm3, %xmm0
917; SSE-NEXT:    movaps %xmm1, %xmm3
918; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
919; SSE-NEXT:    movaps %xmm1, %xmm4
920; SSE-NEXT:    shufpd {{.*#+}} xmm4 = xmm4[1,0]
921; SSE-NEXT:    subss %xmm4, %xmm3
922; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
923; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
924; SSE-NEXT:    subss %xmm3, %xmm1
925; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
926; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
927; SSE-NEXT:    retq
928;
929; AVX-LABEL: not_a_hsub_2:
930; AVX:       # BB#0:
931; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
932; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
933; AVX-NEXT:    vsubss %xmm3, %xmm2, %xmm2
934; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
935; AVX-NEXT:    vsubss %xmm3, %xmm0, %xmm0
936; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
937; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
938; AVX-NEXT:    vsubss %xmm4, %xmm3, %xmm3
939; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
940; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
941; AVX-NEXT:    vsubss %xmm2, %xmm1, %xmm1
942; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
943; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
944; AVX-NEXT:    retq
  ; Element 1 is written before element 0; both use even-minus-odd order.
945  %vecext = extractelement <4 x float> %A, i32 2
946  %vecext1 = extractelement <4 x float> %A, i32 3
947  %sub = fsub float %vecext, %vecext1
948  %vecinit = insertelement <4 x float> undef, float %sub, i32 1
949  %vecext2 = extractelement <4 x float> %A, i32 0
950  %vecext3 = extractelement <4 x float> %A, i32 1
951  %sub4 = fsub float %vecext2, %vecext3
952  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
  ; Element 3 is the reversed pair: B[3] - B[2].
953  %vecext6 = extractelement <4 x float> %B, i32 3
954  %vecext7 = extractelement <4 x float> %B, i32 2
955  %sub8 = fsub float %vecext6, %vecext7
956  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
957  %vecext10 = extractelement <4 x float> %B, i32 0
958  %vecext11 = extractelement <4 x float> %B, i32 1
959  %sub12 = fsub float %vecext10, %vecext11
960  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
961  ret <4 x float> %vecinit13
962}
963
; not_a_hsub_3 - negative test on <2 x double>.
; Element 1 = B[0]-B[1] and element 0 = A[1]-A[0]; the %A pair is subtracted
; in swapped order. The expected code below uses two scalar subsd plus an
; unpack; no hsubpd appears.
964define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) {
965; SSE-LABEL: not_a_hsub_3:
966; SSE:       # BB#0:
967; SSE-NEXT:    movapd %xmm1, %xmm2
968; SSE-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[1,0]
969; SSE-NEXT:    subsd %xmm2, %xmm1
970; SSE-NEXT:    movapd %xmm0, %xmm2
971; SSE-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[1,0]
972; SSE-NEXT:    subsd %xmm0, %xmm2
973; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
974; SSE-NEXT:    movapd %xmm2, %xmm0
975; SSE-NEXT:    retq
976;
977; AVX-LABEL: not_a_hsub_3:
978; AVX:       # BB#0:
979; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
980; AVX-NEXT:    vsubsd %xmm2, %xmm1, %xmm1
981; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
982; AVX-NEXT:    vsubsd %xmm0, %xmm2, %xmm0
983; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
984; AVX-NEXT:    retq
  ; Upper result element: B[0] - B[1].
985  %vecext = extractelement <2 x double> %B, i32 0
986  %vecext1 = extractelement <2 x double> %B, i32 1
987  %sub = fsub double %vecext, %vecext1
988  %vecinit = insertelement <2 x double> undef, double %sub, i32 1
  ; Lower result element: A[1] - A[0] (reversed operands).
989  %vecext2 = extractelement <2 x double> %A, i32 1
990  %vecext3 = extractelement <2 x double> %A, i32 0
991  %sub2 = fsub double %vecext2, %vecext3
992  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
993  ret <2 x double> %vecinit2
994}
995
996; Test AVX horizontal add/sub of packed single/double precision
997; floating point values from 256-bit vectors.
998
; avx_vhadd_ps - adjacent-pair fadd over two <8 x float> inputs.
; Result layout: elements 0-1 from a[0..3], 2-3 from b[0..3],
; 4-5 from a[4..7], 6-7 from b[4..7].
; Expected code: two 128-bit haddps for the SSE configurations and a single
; 256-bit vhaddps for the AVX configurations.
999define <8 x float> @avx_vhadd_ps(<8 x float> %a, <8 x float> %b) {
1000; SSE-LABEL: avx_vhadd_ps:
1001; SSE:       # BB#0:
1002; SSE-NEXT:    haddps %xmm2, %xmm0
1003; SSE-NEXT:    haddps %xmm3, %xmm1
1004; SSE-NEXT:    retq
1005;
1006; AVX-LABEL: avx_vhadd_ps:
1007; AVX:       # BB#0:
1008; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
1009; AVX-NEXT:    retq
  ; Lower four result elements: pairs from a[0..3], then b[0..3].
1010  %vecext = extractelement <8 x float> %a, i32 0
1011  %vecext1 = extractelement <8 x float> %a, i32 1
1012  %add = fadd float %vecext, %vecext1
1013  %vecinit = insertelement <8 x float> undef, float %add, i32 0
1014  %vecext2 = extractelement <8 x float> %a, i32 2
1015  %vecext3 = extractelement <8 x float> %a, i32 3
1016  %add4 = fadd float %vecext2, %vecext3
1017  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
1018  %vecext6 = extractelement <8 x float> %b, i32 0
1019  %vecext7 = extractelement <8 x float> %b, i32 1
1020  %add8 = fadd float %vecext6, %vecext7
1021  %vecinit9 = insertelement <8 x float> %vecinit5, float %add8, i32 2
1022  %vecext10 = extractelement <8 x float> %b, i32 2
1023  %vecext11 = extractelement <8 x float> %b, i32 3
1024  %add12 = fadd float %vecext10, %vecext11
1025  %vecinit13 = insertelement <8 x float> %vecinit9, float %add12, i32 3
  ; Upper four result elements: pairs from a[4..7], then b[4..7].
1026  %vecext14 = extractelement <8 x float> %a, i32 4
1027  %vecext15 = extractelement <8 x float> %a, i32 5
1028  %add16 = fadd float %vecext14, %vecext15
1029  %vecinit17 = insertelement <8 x float> %vecinit13, float %add16, i32 4
1030  %vecext18 = extractelement <8 x float> %a, i32 6
1031  %vecext19 = extractelement <8 x float> %a, i32 7
1032  %add20 = fadd float %vecext18, %vecext19
1033  %vecinit21 = insertelement <8 x float> %vecinit17, float %add20, i32 5
1034  %vecext22 = extractelement <8 x float> %b, i32 4
1035  %vecext23 = extractelement <8 x float> %b, i32 5
1036  %add24 = fadd float %vecext22, %vecext23
1037  %vecinit25 = insertelement <8 x float> %vecinit21, float %add24, i32 6
1038  %vecext26 = extractelement <8 x float> %b, i32 6
1039  %vecext27 = extractelement <8 x float> %b, i32 7
1040  %add28 = fadd float %vecext26, %vecext27
1041  %vecinit29 = insertelement <8 x float> %vecinit25, float %add28, i32 7
1042  ret <8 x float> %vecinit29
1043}
1044
; avx_vhsub_ps - adjacent-pair fsub (even element minus odd element) over
; two <8 x float> inputs.
; Result layout: elements 0-1 from a[0..3], 2-3 from b[0..3],
; 4-5 from a[4..7], 6-7 from b[4..7].
; Expected code: two 128-bit hsubps for the SSE configurations and a single
; 256-bit vhsubps for the AVX configurations.
1045define <8 x float> @avx_vhsub_ps(<8 x float> %a, <8 x float> %b) {
1046; SSE-LABEL: avx_vhsub_ps:
1047; SSE:       # BB#0:
1048; SSE-NEXT:    hsubps %xmm2, %xmm0
1049; SSE-NEXT:    hsubps %xmm3, %xmm1
1050; SSE-NEXT:    retq
1051;
1052; AVX-LABEL: avx_vhsub_ps:
1053; AVX:       # BB#0:
1054; AVX-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
1055; AVX-NEXT:    retq
  ; Lower four result elements: pairs from a[0..3], then b[0..3].
1056  %vecext = extractelement <8 x float> %a, i32 0
1057  %vecext1 = extractelement <8 x float> %a, i32 1
1058  %sub = fsub float %vecext, %vecext1
1059  %vecinit = insertelement <8 x float> undef, float %sub, i32 0
1060  %vecext2 = extractelement <8 x float> %a, i32 2
1061  %vecext3 = extractelement <8 x float> %a, i32 3
1062  %sub4 = fsub float %vecext2, %vecext3
1063  %vecinit5 = insertelement <8 x float> %vecinit, float %sub4, i32 1
1064  %vecext6 = extractelement <8 x float> %b, i32 0
1065  %vecext7 = extractelement <8 x float> %b, i32 1
1066  %sub8 = fsub float %vecext6, %vecext7
1067  %vecinit9 = insertelement <8 x float> %vecinit5, float %sub8, i32 2
1068  %vecext10 = extractelement <8 x float> %b, i32 2
1069  %vecext11 = extractelement <8 x float> %b, i32 3
1070  %sub12 = fsub float %vecext10, %vecext11
1071  %vecinit13 = insertelement <8 x float> %vecinit9, float %sub12, i32 3
  ; Upper four result elements: pairs from a[4..7], then b[4..7].
1072  %vecext14 = extractelement <8 x float> %a, i32 4
1073  %vecext15 = extractelement <8 x float> %a, i32 5
1074  %sub16 = fsub float %vecext14, %vecext15
1075  %vecinit17 = insertelement <8 x float> %vecinit13, float %sub16, i32 4
1076  %vecext18 = extractelement <8 x float> %a, i32 6
1077  %vecext19 = extractelement <8 x float> %a, i32 7
1078  %sub20 = fsub float %vecext18, %vecext19
1079  %vecinit21 = insertelement <8 x float> %vecinit17, float %sub20, i32 5
1080  %vecext22 = extractelement <8 x float> %b, i32 4
1081  %vecext23 = extractelement <8 x float> %b, i32 5
1082  %sub24 = fsub float %vecext22, %vecext23
1083  %vecinit25 = insertelement <8 x float> %vecinit21, float %sub24, i32 6
1084  %vecext26 = extractelement <8 x float> %b, i32 6
1085  %vecext27 = extractelement <8 x float> %b, i32 7
1086  %sub28 = fsub float %vecext26, %vecext27
1087  %vecinit29 = insertelement <8 x float> %vecinit25, float %sub28, i32 7
1088  ret <8 x float> %vecinit29
1089}
1090
; avx_hadd_pd - adjacent-pair fadd over two <4 x double> inputs.
; Result layout: [a0+a1, b0+b1, a2+a3, b2+b3].
; Expected code: two 128-bit haddpd for the SSE configurations and a single
; 256-bit vhaddpd for the AVX configurations.
1091define <4 x double> @avx_hadd_pd(<4 x double> %a, <4 x double> %b) {
1092; SSE-LABEL: avx_hadd_pd:
1093; SSE:       # BB#0:
1094; SSE-NEXT:    haddpd %xmm2, %xmm0
1095; SSE-NEXT:    haddpd %xmm3, %xmm1
1096; SSE-NEXT:    retq
1097;
1098; AVX-LABEL: avx_hadd_pd:
1099; AVX:       # BB#0:
1100; AVX-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
1101; AVX-NEXT:    retq
  ; Lower two result elements: the pair a[0..1], then the pair b[0..1].
1102  %vecext = extractelement <4 x double> %a, i32 0
1103  %vecext1 = extractelement <4 x double> %a, i32 1
1104  %add = fadd double %vecext, %vecext1
1105  %vecinit = insertelement <4 x double> undef, double %add, i32 0
1106  %vecext2 = extractelement <4 x double> %b, i32 0
1107  %vecext3 = extractelement <4 x double> %b, i32 1
1108  %add4 = fadd double %vecext2, %vecext3
1109  %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
  ; Upper two result elements: the pair a[2..3], then the pair b[2..3].
1110  %vecext6 = extractelement <4 x double> %a, i32 2
1111  %vecext7 = extractelement <4 x double> %a, i32 3
1112  %add8 = fadd double %vecext6, %vecext7
1113  %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
1114  %vecext10 = extractelement <4 x double> %b, i32 2
1115  %vecext11 = extractelement <4 x double> %b, i32 3
1116  %add12 = fadd double %vecext10, %vecext11
1117  %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
1118  ret <4 x double> %vecinit13
1119}
1120
; avx_hsub_pd - adjacent-pair fsub (even element minus odd element) over
; two <4 x double> inputs.
; Result layout: [a0-a1, b0-b1, a2-a3, b2-b3].
; Expected code: two 128-bit hsubpd for the SSE configurations and a single
; 256-bit vhsubpd for the AVX configurations.
1121define <4 x double> @avx_hsub_pd(<4 x double> %a, <4 x double> %b) {
1122; SSE-LABEL: avx_hsub_pd:
1123; SSE:       # BB#0:
1124; SSE-NEXT:    hsubpd %xmm2, %xmm0
1125; SSE-NEXT:    hsubpd %xmm3, %xmm1
1126; SSE-NEXT:    retq
1127;
1128; AVX-LABEL: avx_hsub_pd:
1129; AVX:       # BB#0:
1130; AVX-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
1131; AVX-NEXT:    retq
  ; Lower two result elements: the pair a[0..1], then the pair b[0..1].
1132  %vecext = extractelement <4 x double> %a, i32 0
1133  %vecext1 = extractelement <4 x double> %a, i32 1
1134  %sub = fsub double %vecext, %vecext1
1135  %vecinit = insertelement <4 x double> undef, double %sub, i32 0
1136  %vecext2 = extractelement <4 x double> %b, i32 0
1137  %vecext3 = extractelement <4 x double> %b, i32 1
1138  %sub4 = fsub double %vecext2, %vecext3
1139  %vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
  ; Upper two result elements: the pair a[2..3], then the pair b[2..3].
1140  %vecext6 = extractelement <4 x double> %a, i32 2
1141  %vecext7 = extractelement <4 x double> %a, i32 3
1142  %sub8 = fsub double %vecext6, %vecext7
1143  %vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
1144  %vecext10 = extractelement <4 x double> %b, i32 2
1145  %vecext11 = extractelement <4 x double> %b, i32 3
1146  %sub12 = fsub double %vecext10, %vecext11
1147  %vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
1148  ret <4 x double> %vecinit13
1149}
1150
1151; Test AVX2 horizontal add of packed integer values from 256-bit vectors.
1152
; avx2_hadd_d - adjacent-pair integer add over two <8 x i32> inputs.
; Result layout: elements 0-1 from a[0..3], 2-3 from b[0..3],
; 4-5 from a[4..7], 6-7 from b[4..7].
; Expected code per configuration, as recorded in the checks below:
;   - SSE3 without SSSE3 is fully scalarized (movd/pshufd/addl/punpckldq);
;   - SSSE3 uses two 128-bit phaddd;
;   - AVX1 uses two 128-bit vphaddd on the extracted halves + vinsertf128;
;   - AVX2 uses a single 256-bit vphaddd.
1153define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
1154; SSE3-LABEL: avx2_hadd_d:
1155; SSE3:       # BB#0:
1156; SSE3-NEXT:    movd %xmm0, %ecx
1157; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
1158; SSE3-NEXT:    movd %xmm4, %r8d
1159; SSE3-NEXT:    addl %ecx, %r8d
1160; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
1161; SSE3-NEXT:    movd %xmm4, %edx
1162; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
1163; SSE3-NEXT:    movd %xmm0, %r9d
1164; SSE3-NEXT:    addl %edx, %r9d
1165; SSE3-NEXT:    movd %xmm2, %esi
1166; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
1167; SSE3-NEXT:    movd %xmm0, %r10d
1168; SSE3-NEXT:    addl %esi, %r10d
1169; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
1170; SSE3-NEXT:    movd %xmm0, %esi
1171; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
1172; SSE3-NEXT:    movd %xmm0, %edi
1173; SSE3-NEXT:    addl %esi, %edi
1174; SSE3-NEXT:    movd %xmm1, %eax
1175; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
1176; SSE3-NEXT:    movd %xmm0, %r11d
1177; SSE3-NEXT:    addl %eax, %r11d
1178; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
1179; SSE3-NEXT:    movd %xmm0, %eax
1180; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
1181; SSE3-NEXT:    movd %xmm0, %ecx
1182; SSE3-NEXT:    addl %eax, %ecx
1183; SSE3-NEXT:    movd %xmm3, %eax
1184; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
1185; SSE3-NEXT:    movd %xmm0, %edx
1186; SSE3-NEXT:    addl %eax, %edx
1187; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
1188; SSE3-NEXT:    movd %xmm0, %eax
1189; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
1190; SSE3-NEXT:    movd %xmm0, %esi
1191; SSE3-NEXT:    addl %eax, %esi
1192; SSE3-NEXT:    movd %edi, %xmm0
1193; SSE3-NEXT:    movd %r9d, %xmm1
1194; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1195; SSE3-NEXT:    movd %r10d, %xmm2
1196; SSE3-NEXT:    movd %r8d, %xmm0
1197; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1198; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1199; SSE3-NEXT:    movd %esi, %xmm1
1200; SSE3-NEXT:    movd %ecx, %xmm2
1201; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1202; SSE3-NEXT:    movd %edx, %xmm3
1203; SSE3-NEXT:    movd %r11d, %xmm1
1204; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
1205; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1206; SSE3-NEXT:    retq
1207;
1208; SSSE3-LABEL: avx2_hadd_d:
1209; SSSE3:       # BB#0:
1210; SSSE3-NEXT:    phaddd %xmm2, %xmm0
1211; SSSE3-NEXT:    phaddd %xmm3, %xmm1
1212; SSSE3-NEXT:    retq
1213;
1214; AVX1-LABEL: avx2_hadd_d:
1215; AVX1:       # BB#0:
1216; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1217; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1218; AVX1-NEXT:    vphaddd %xmm2, %xmm3, %xmm2
1219; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
1220; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1221; AVX1-NEXT:    retq
1222;
1223; AVX2-LABEL: avx2_hadd_d:
1224; AVX2:       # BB#0:
1225; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
1226; AVX2-NEXT:    retq
  ; Lower four result elements: pairs from a[0..3], then b[0..3].
1227  %vecext = extractelement <8 x i32> %a, i32 0
1228  %vecext1 = extractelement <8 x i32> %a, i32 1
1229  %add = add i32 %vecext, %vecext1
1230  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
1231  %vecext2 = extractelement <8 x i32> %a, i32 2
1232  %vecext3 = extractelement <8 x i32> %a, i32 3
1233  %add4 = add i32 %vecext2, %vecext3
1234  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
1235  %vecext6 = extractelement <8 x i32> %b, i32 0
1236  %vecext7 = extractelement <8 x i32> %b, i32 1
1237  %add8 = add i32 %vecext6, %vecext7
1238  %vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
1239  %vecext10 = extractelement <8 x i32> %b, i32 2
1240  %vecext11 = extractelement <8 x i32> %b, i32 3
1241  %add12 = add i32 %vecext10, %vecext11
1242  %vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
  ; Upper four result elements: pairs from a[4..7], then b[4..7].
1243  %vecext14 = extractelement <8 x i32> %a, i32 4
1244  %vecext15 = extractelement <8 x i32> %a, i32 5
1245  %add16 = add i32 %vecext14, %vecext15
1246  %vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
1247  %vecext18 = extractelement <8 x i32> %a, i32 6
1248  %vecext19 = extractelement <8 x i32> %a, i32 7
1249  %add20 = add i32 %vecext18, %vecext19
1250  %vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
1251  %vecext22 = extractelement <8 x i32> %b, i32 4
1252  %vecext23 = extractelement <8 x i32> %b, i32 5
1253  %add24 = add i32 %vecext22, %vecext23
1254  %vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
1255  %vecext26 = extractelement <8 x i32> %b, i32 6
1256  %vecext27 = extractelement <8 x i32> %b, i32 7
1257  %add28 = add i32 %vecext26, %vecext27
1258  %vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
1259  ret <8 x i32> %vecinit29
1260}
1261
; avx2_hadd_w - adjacent-pair integer add over two <16 x i16> inputs.
; Result layout: elements 0-3 from a[0..7], 4-7 from b[0..7],
; 8-11 from a[8..15], 12-15 from b[8..15]. The insertelement chain does not
; fill the result in index order (8-11 are written before 4-7).
; Expected code per configuration, as recorded in the checks below:
;   - SSE3 without SSSE3 is fully scalarized (pextrw/addl/movd/punpcklwd),
;     with two spills and six pushed registers;
;   - SSSE3 uses two 128-bit phaddw;
;   - AVX1 uses two 128-bit vphaddw on the extracted halves + vinsertf128;
;   - AVX2 uses a single 256-bit vphaddw.
1262define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) {
1263; SSE3-LABEL: avx2_hadd_w:
1264; SSE3:       # BB#0:
1265; SSE3-NEXT:    pushq %rbp
1266; SSE3-NEXT:  .Ltmp12:
1267; SSE3-NEXT:    .cfi_def_cfa_offset 16
1268; SSE3-NEXT:    pushq %r15
1269; SSE3-NEXT:  .Ltmp13:
1270; SSE3-NEXT:    .cfi_def_cfa_offset 24
1271; SSE3-NEXT:    pushq %r14
1272; SSE3-NEXT:  .Ltmp14:
1273; SSE3-NEXT:    .cfi_def_cfa_offset 32
1274; SSE3-NEXT:    pushq %r13
1275; SSE3-NEXT:  .Ltmp15:
1276; SSE3-NEXT:    .cfi_def_cfa_offset 40
1277; SSE3-NEXT:    pushq %r12
1278; SSE3-NEXT:  .Ltmp16:
1279; SSE3-NEXT:    .cfi_def_cfa_offset 48
1280; SSE3-NEXT:    pushq %rbx
1281; SSE3-NEXT:  .Ltmp17:
1282; SSE3-NEXT:    .cfi_def_cfa_offset 56
1283; SSE3-NEXT:  .Ltmp18:
1284; SSE3-NEXT:    .cfi_offset %rbx, -56
1285; SSE3-NEXT:  .Ltmp19:
1286; SSE3-NEXT:    .cfi_offset %r12, -48
1287; SSE3-NEXT:  .Ltmp20:
1288; SSE3-NEXT:    .cfi_offset %r13, -40
1289; SSE3-NEXT:  .Ltmp21:
1290; SSE3-NEXT:    .cfi_offset %r14, -32
1291; SSE3-NEXT:  .Ltmp22:
1292; SSE3-NEXT:    .cfi_offset %r15, -24
1293; SSE3-NEXT:  .Ltmp23:
1294; SSE3-NEXT:    .cfi_offset %rbp, -16
1295; SSE3-NEXT:    movd %xmm0, %eax
1296; SSE3-NEXT:    pextrw $1, %xmm0, %ecx
1297; SSE3-NEXT:    addl %eax, %ecx
1298; SSE3-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
1299; SSE3-NEXT:    pextrw $2, %xmm0, %eax
1300; SSE3-NEXT:    pextrw $3, %xmm0, %r15d
1301; SSE3-NEXT:    addl %eax, %r15d
1302; SSE3-NEXT:    pextrw $4, %xmm0, %eax
1303; SSE3-NEXT:    pextrw $5, %xmm0, %r14d
1304; SSE3-NEXT:    addl %eax, %r14d
1305; SSE3-NEXT:    pextrw $6, %xmm0, %eax
1306; SSE3-NEXT:    pextrw $7, %xmm0, %r13d
1307; SSE3-NEXT:    addl %eax, %r13d
1308; SSE3-NEXT:    movd %xmm1, %eax
1309; SSE3-NEXT:    pextrw $1, %xmm1, %ecx
1310; SSE3-NEXT:    addl %eax, %ecx
1311; SSE3-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
1312; SSE3-NEXT:    pextrw $2, %xmm1, %eax
1313; SSE3-NEXT:    pextrw $3, %xmm1, %r11d
1314; SSE3-NEXT:    addl %eax, %r11d
1315; SSE3-NEXT:    pextrw $4, %xmm1, %eax
1316; SSE3-NEXT:    pextrw $5, %xmm1, %r10d
1317; SSE3-NEXT:    addl %eax, %r10d
1318; SSE3-NEXT:    pextrw $6, %xmm1, %eax
1319; SSE3-NEXT:    pextrw $7, %xmm1, %r12d
1320; SSE3-NEXT:    addl %eax, %r12d
1321; SSE3-NEXT:    movd %xmm2, %eax
1322; SSE3-NEXT:    pextrw $1, %xmm2, %ebx
1323; SSE3-NEXT:    addl %eax, %ebx
1324; SSE3-NEXT:    pextrw $2, %xmm2, %eax
1325; SSE3-NEXT:    pextrw $3, %xmm2, %ecx
1326; SSE3-NEXT:    addl %eax, %ecx
1327; SSE3-NEXT:    pextrw $4, %xmm2, %esi
1328; SSE3-NEXT:    pextrw $5, %xmm2, %r8d
1329; SSE3-NEXT:    addl %esi, %r8d
1330; SSE3-NEXT:    pextrw $6, %xmm2, %esi
1331; SSE3-NEXT:    pextrw $7, %xmm2, %edx
1332; SSE3-NEXT:    addl %esi, %edx
1333; SSE3-NEXT:    movd %xmm3, %edi
1334; SSE3-NEXT:    pextrw $1, %xmm3, %r9d
1335; SSE3-NEXT:    addl %edi, %r9d
1336; SSE3-NEXT:    pextrw $2, %xmm3, %ebp
1337; SSE3-NEXT:    pextrw $3, %xmm3, %edi
1338; SSE3-NEXT:    addl %ebp, %edi
1339; SSE3-NEXT:    pextrw $4, %xmm3, %eax
1340; SSE3-NEXT:    pextrw $5, %xmm3, %ebp
1341; SSE3-NEXT:    addl %eax, %ebp
1342; SSE3-NEXT:    pextrw $6, %xmm3, %esi
1343; SSE3-NEXT:    pextrw $7, %xmm3, %eax
1344; SSE3-NEXT:    addl %esi, %eax
1345; SSE3-NEXT:    movd %edx, %xmm8
1346; SSE3-NEXT:    movd %r13d, %xmm3
1347; SSE3-NEXT:    movd %ecx, %xmm9
1348; SSE3-NEXT:    movd %r15d, %xmm4
1349; SSE3-NEXT:    movd %r8d, %xmm10
1350; SSE3-NEXT:    movd %r14d, %xmm7
1351; SSE3-NEXT:    movd %ebx, %xmm11
1352; SSE3-NEXT:    movd -{{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload
1353; SSE3-NEXT:    # xmm0 = mem[0],zero,zero,zero
1354; SSE3-NEXT:    movd %eax, %xmm12
1355; SSE3-NEXT:    movd %r12d, %xmm6
1356; SSE3-NEXT:    movd %edi, %xmm13
1357; SSE3-NEXT:    movd %r11d, %xmm5
1358; SSE3-NEXT:    movd %ebp, %xmm14
1359; SSE3-NEXT:    movd %r10d, %xmm2
1360; SSE3-NEXT:    movd %r9d, %xmm15
1361; SSE3-NEXT:    movd -{{[0-9]+}}(%rsp), %xmm1 # 4-byte Folded Reload
1362; SSE3-NEXT:    # xmm1 = mem[0],zero,zero,zero
1363; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
1364; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
1365; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
1366; SSE3-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
1367; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
1368; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
1369; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
1370; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
1371; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
1372; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
1373; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
1374; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
1375; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1376; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
1377; SSE3-NEXT:    popq %rbx
1378; SSE3-NEXT:    popq %r12
1379; SSE3-NEXT:    popq %r13
1380; SSE3-NEXT:    popq %r14
1381; SSE3-NEXT:    popq %r15
1382; SSE3-NEXT:    popq %rbp
1383; SSE3-NEXT:    retq
1384;
1385; SSSE3-LABEL: avx2_hadd_w:
1386; SSSE3:       # BB#0:
1387; SSSE3-NEXT:    phaddw %xmm2, %xmm0
1388; SSSE3-NEXT:    phaddw %xmm3, %xmm1
1389; SSSE3-NEXT:    retq
1390;
1391; AVX1-LABEL: avx2_hadd_w:
1392; AVX1:       # BB#0:
1393; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1394; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1395; AVX1-NEXT:    vphaddw %xmm2, %xmm3, %xmm2
1396; AVX1-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
1397; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1398; AVX1-NEXT:    retq
1399;
1400; AVX2-LABEL: avx2_hadd_w:
1401; AVX2:       # BB#0:
1402; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
1403; AVX2-NEXT:    retq
  ; Pairs from a[0..7] fill result elements 0-3.
1404  %vecext = extractelement <16 x i16> %a, i32 0
1405  %vecext1 = extractelement <16 x i16> %a, i32 1
1406  %add = add i16 %vecext, %vecext1
1407  %vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
1408  %vecext4 = extractelement <16 x i16> %a, i32 2
1409  %vecext6 = extractelement <16 x i16> %a, i32 3
1410  %add8 = add i16 %vecext4, %vecext6
1411  %vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
1412  %vecext11 = extractelement <16 x i16> %a, i32 4
1413  %vecext13 = extractelement <16 x i16> %a, i32 5
1414  %add15 = add i16 %vecext11, %vecext13
1415  %vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
1416  %vecext18 = extractelement <16 x i16> %a, i32 6
1417  %vecext20 = extractelement <16 x i16> %a, i32 7
1418  %add22 = add i16 %vecext18, %vecext20
1419  %vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
  ; Pairs from a[8..15] fill result elements 8-11.
1420  %vecext25 = extractelement <16 x i16> %a, i32 8
1421  %vecext27 = extractelement <16 x i16> %a, i32 9
1422  %add29 = add i16 %vecext25, %vecext27
1423  %vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 8
1424  %vecext32 = extractelement <16 x i16> %a, i32 10
1425  %vecext34 = extractelement <16 x i16> %a, i32 11
1426  %add36 = add i16 %vecext32, %vecext34
1427  %vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 9
1428  %vecext39 = extractelement <16 x i16> %a, i32 12
1429  %vecext41 = extractelement <16 x i16> %a, i32 13
1430  %add43 = add i16 %vecext39, %vecext41
1431  %vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 10
1432  %vecext46 = extractelement <16 x i16> %a, i32 14
1433  %vecext48 = extractelement <16 x i16> %a, i32 15
1434  %add50 = add i16 %vecext46, %vecext48
1435  %vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 11
  ; Pairs from b[0..7] fill result elements 4-7.
1436  %vecext53 = extractelement <16 x i16> %b, i32 0
1437  %vecext55 = extractelement <16 x i16> %b, i32 1
1438  %add57 = add i16 %vecext53, %vecext55
1439  %vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 4
1440  %vecext60 = extractelement <16 x i16> %b, i32 2
1441  %vecext62 = extractelement <16 x i16> %b, i32 3
1442  %add64 = add i16 %vecext60, %vecext62
1443  %vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 5
1444  %vecext67 = extractelement <16 x i16> %b, i32 4
1445  %vecext69 = extractelement <16 x i16> %b, i32 5
1446  %add71 = add i16 %vecext67, %vecext69
1447  %vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 6
1448  %vecext74 = extractelement <16 x i16> %b, i32 6
1449  %vecext76 = extractelement <16 x i16> %b, i32 7
1450  %add78 = add i16 %vecext74, %vecext76
1451  %vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 7
  ; Pairs from b[8..15] fill result elements 12-15.
1452  %vecext81 = extractelement <16 x i16> %b, i32 8
1453  %vecext83 = extractelement <16 x i16> %b, i32 9
1454  %add85 = add i16 %vecext81, %vecext83
1455  %vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
1456  %vecext88 = extractelement <16 x i16> %b, i32 10
1457  %vecext90 = extractelement <16 x i16> %b, i32 11
1458  %add92 = add i16 %vecext88, %vecext90
1459  %vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
1460  %vecext95 = extractelement <16 x i16> %b, i32 12
1461  %vecext97 = extractelement <16 x i16> %b, i32 13
1462  %add99 = add i16 %vecext95, %vecext97
1463  %vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
1464  %vecext102 = extractelement <16 x i16> %b, i32 14
1465  %vecext104 = extractelement <16 x i16> %b, i32 15
1466  %add106 = add i16 %vecext102, %vecext104
1467  %vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
1468  ret <16 x i16> %vecinit108
1469}
1470