; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3,+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

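; Each function below builds a horizontal add/sub by hand: extract the two
; elements of a pair, combine them as scalars, and insert the result into the
; destination lane. The checks verify that llc folds the whole sequence into
; a single HADD/HSUB-family instruction. As a sketch, one lane of the pattern
; looks like this (illustrative only; the autogenerated assertions apply to
; the real functions that follow):
;
;   %lo  = extractelement <4 x float> %A, i32 0
;   %hi  = extractelement <4 x float> %A, i32 1
;   %sum = fadd float %lo, %hi
;   %res = insertelement <4 x float> undef, float %sum, i32 0
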
define <4 x float> @hadd_ps_test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hadd_ps_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_ps_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 0
  %vecext1 = extractelement <4 x float> %A, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %A, i32 2
  %vecext3 = extractelement <4 x float> %A, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <4 x float> %B, i32 0
  %vecext7 = extractelement <4 x float> %B, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
  %vecext10 = extractelement <4 x float> %B, i32 2
  %vecext11 = extractelement <4 x float> %B, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
  ret <4 x float> %vecinit13
}

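; Same computation as hadd_ps_test1, but the pairs are built in a permuted
; order (lane 1 before lane 0, lane 3 before lane 2). The final vector is
; identical, so the same single haddps/vhaddps should still be selected.
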
define <4 x float> @hadd_ps_test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hadd_ps_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_ps_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 2
  %vecext7 = extractelement <4 x float> %B, i32 3
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 2
  ret <4 x float> %vecinit13
}

define <4 x float> @hsub_ps_test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hsub_ps_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_ps_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 0
  %vecext1 = extractelement <4 x float> %A, i32 1
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 0
  %vecext2 = extractelement <4 x float> %A, i32 2
  %vecext3 = extractelement <4 x float> %A, i32 3
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 1
  %vecext6 = extractelement <4 x float> %B, i32 0
  %vecext7 = extractelement <4 x float> %B, i32 1
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 2
  %vecext10 = extractelement <4 x float> %B, i32 2
  %vecext11 = extractelement <4 x float> %B, i32 3
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @hsub_ps_test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hsub_ps_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_ps_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 2
  %vecext7 = extractelement <4 x float> %B, i32 3
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
  ret <4 x float> %vecinit13
}

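; Integer horizontal adds need SSSE3 (phaddd); with only SSE3 available the
; pattern is scalarized into movd/pshufd/addl sequences, which is why the
; SSE3 and SSSE3 prefixes diverge below.
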
define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    addl %eax, %edi
; SSE3-NEXT:    movd %edi, %xmm0
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %edx, %xmm2
; SSE3-NEXT:    movd %ecx, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phadd_d_test1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phadd_d_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 0
  %vecext7 = extractelement <4 x i32> %B, i32 1
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 2
  %vecext11 = extractelement <4 x i32> %B, i32 3
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 3
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test2:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    movd %esi, %xmm0
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    movd %xmm1, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %ecx, %xmm2
; SSE3-NEXT:    movd %edx, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phadd_d_test2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phadd_d_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 2
  %vecext1 = extractelement <4 x i32> %A, i32 3
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %add, i32 1
  %vecext2 = extractelement <4 x i32> %A, i32 0
  %vecext3 = extractelement <4 x i32> %A, i32 1
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 0
  %vecext6 = extractelement <4 x i32> %B, i32 3
  %vecext7 = extractelement <4 x i32> %B, i32 2
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 3
  %vecext10 = extractelement <4 x i32> %B, i32 1
  %vecext11 = extractelement <4 x i32> %B, i32 0
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 2
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    subl %ecx, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    subl %edx, %ecx
; SSE3-NEXT:    movd %xmm1, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    subl %esi, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    subl %edi, %esi
; SSE3-NEXT:    movd %esi, %xmm0
; SSE3-NEXT:    movd %edx, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %ecx, %xmm2
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phsub_d_test1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsub_d_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 0
  %vecext7 = extractelement <4 x i32> %B, i32 1
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 2
  %vecext11 = extractelement <4 x i32> %B, i32 3
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test2:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    subl %ecx, %eax
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    subl %edx, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    subl %esi, %edx
; SSE3-NEXT:    movd %edx, %xmm0
; SSE3-NEXT:    movd %xmm1, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm1, %esi
; SSE3-NEXT:    subl %esi, %edx
; SSE3-NEXT:    movd %edx, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %eax, %xmm2
; SSE3-NEXT:    movd %ecx, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phsub_d_test2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsub_d_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 2
  %vecext1 = extractelement <4 x i32> %A, i32 3
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 1
  %vecext2 = extractelement <4 x i32> %A, i32 0
  %vecext3 = extractelement <4 x i32> %A, i32 1
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 0
  %vecext6 = extractelement <4 x i32> %B, i32 2
  %vecext7 = extractelement <4 x i32> %B, i32 3
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 3
  %vecext10 = extractelement <4 x i32> %B, i32 0
  %vecext11 = extractelement <4 x i32> %B, i32 1
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 2
  ret <4 x i32> %vecinit13
}

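; The <2 x double> variants have only one pair per input, so a single
; haddpd/hsubpd consumes both operands directly.
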
define <2 x double> @hadd_pd_test1(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hadd_pd_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_pd_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %A, i32 0
  %vecext1 = extractelement <2 x double> %A, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 0
  %vecext3 = extractelement <2 x double> %B, i32 1
  %add2 = fadd double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hadd_pd_test2(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hadd_pd_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_pd_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %A, i32 1
  %vecext1 = extractelement <2 x double> %A, i32 0
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 1
  %vecext3 = extractelement <2 x double> %B, i32 0
  %add2 = fadd double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hsub_pd_test1(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hsub_pd_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_pd_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %A, i32 0
  %vecext1 = extractelement <2 x double> %A, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 0
  %vecext3 = extractelement <2 x double> %B, i32 1
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hsub_pd_test2(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hsub_pd_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_pd_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %B, i32 0
  %vecext1 = extractelement <2 x double> %B, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 1
  %vecext2 = extractelement <2 x double> %A, i32 0
  %vecext3 = extractelement <2 x double> %A, i32 1
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
  ret <2 x double> %vecinit2
}

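; Note the expected AVX codegen here: vhaddpd/vhsubpd %ymm operate within
; each 128-bit lane, but this IR puts both sums of %A in the low half and
; both sums of %B in the high half, so the inputs must first be reshuffled
; with vperm2f128/vinsertf128 before a single ymm instruction can produce
; the result.
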
define <4 x double> @avx_vhadd_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: avx_vhadd_pd_test:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm1, %xmm0
; SSE-NEXT:    haddpd %xmm3, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhadd_pd_test:
; AVX:       # %bb.0:
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    vhaddpd %ymm2, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %A, i32 0
  %vecext1 = extractelement <4 x double> %A, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %add, i32 0
  %vecext2 = extractelement <4 x double> %A, i32 2
  %vecext3 = extractelement <4 x double> %A, i32 3
  %add4 = fadd double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
  %vecext6 = extractelement <4 x double> %B, i32 0
  %vecext7 = extractelement <4 x double> %B, i32 1
  %add8 = fadd double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
  %vecext10 = extractelement <4 x double> %B, i32 2
  %vecext11 = extractelement <4 x double> %B, i32 3
  %add12 = fadd double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
  ret <4 x double> %vecinit13
}

define <4 x double> @avx_vhsub_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: avx_vhsub_pd_test:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubpd %xmm1, %xmm0
; SSE-NEXT:    hsubpd %xmm3, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhsub_pd_test:
; AVX:       # %bb.0:
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    vhsubpd %ymm2, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %A, i32 0
  %vecext1 = extractelement <4 x double> %A, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <4 x double> %A, i32 2
  %vecext3 = extractelement <4 x double> %A, i32 3
  %sub4 = fsub double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
  %vecext6 = extractelement <4 x double> %B, i32 0
  %vecext7 = extractelement <4 x double> %B, i32 1
  %sub8 = fsub double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
  %vecext10 = extractelement <4 x double> %B, i32 2
  %vecext11 = extractelement <4 x double> %B, i32 3
  %sub12 = fsub double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
  ret <4 x double> %vecinit13
}

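; 256-bit integer version of the same layout. AVX1 has no 256-bit integer
; phaddd, so the vector is split into two 128-bit vphaddd ops; AVX2 can use
; one ymm vphaddd after reshuffling the lanes, mirroring the FP case above.
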
define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
; SSE3-LABEL: avx2_vphadd_d_test:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; SSE3-NEXT:    movd %xmm4, %r8d
; SSE3-NEXT:    addl %ecx, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSE3-NEXT:    movd %xmm4, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %r9d
; SSE3-NEXT:    addl %edx, %r9d
; SSE3-NEXT:    movd %xmm1, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %edx, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    addl %edx, %edi
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %r10d
; SSE3-NEXT:    addl %eax, %r10d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movd %xmm3, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %r11d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    addl %r11d, %eax
; SSE3-NEXT:    movd %edi, %xmm0
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %r9d, %xmm2
; SSE3-NEXT:    movd %r8d, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    movd %eax, %xmm1
; SSE3-NEXT:    movd %edx, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT:    movd %ecx, %xmm3
; SSE3-NEXT:    movd %r10d, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_vphadd_d_test:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    phaddd %xmm3, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_vphadd_d_test:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_vphadd_d_test:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vphaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <8 x i32> %A, i32 0
  %vecext1 = extractelement <8 x i32> %A, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %A, i32 2
  %vecext3 = extractelement <8 x i32> %A, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <8 x i32> %A, i32 4
  %vecext7 = extractelement <8 x i32> %A, i32 5
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <8 x i32> %A, i32 6
  %vecext11 = extractelement <8 x i32> %A, i32 7
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
  %vecext14 = extractelement <8 x i32> %B, i32 0
  %vecext15 = extractelement <8 x i32> %B, i32 1
  %add16 = add i32 %vecext14, %vecext15
  %vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
  %vecext18 = extractelement <8 x i32> %B, i32 2
  %vecext19 = extractelement <8 x i32> %B, i32 3
  %add20 = add i32 %vecext18, %vecext19
  %vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
  %vecext22 = extractelement <8 x i32> %B, i32 4
  %vecext23 = extractelement <8 x i32> %B, i32 5
  %add24 = add i32 %vecext22, %vecext23
  %vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
  %vecext26 = extractelement <8 x i32> %B, i32 6
  %vecext27 = extractelement <8 x i32> %B, i32 7
  %add28 = add i32 %vecext26, %vecext27
  %vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
  ret <8 x i32> %vecinit29
}

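; The i16 variant stresses register pressure in the scalarized SSE3 lowering:
; sixteen pextrw/addl pairs need more than the volatile GPRs, hence the
; callee-saved pushes/pops and the 4-byte spills in the SSE3 output.
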
define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) nounwind {
; SSE3-LABEL: avx2_vphadd_w_test:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pushq %rbp
; SSE3-NEXT:    pushq %r15
; SSE3-NEXT:    pushq %r14
; SSE3-NEXT:    pushq %r13
; SSE3-NEXT:    pushq %r12
; SSE3-NEXT:    pushq %rbx
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pextrw $1, %xmm0, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT:    pextrw $2, %xmm0, %eax
; SSE3-NEXT:    pextrw $3, %xmm0, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT:    pextrw $4, %xmm0, %eax
; SSE3-NEXT:    pextrw $5, %xmm0, %r11d
; SSE3-NEXT:    addl %eax, %r11d
; SSE3-NEXT:    pextrw $6, %xmm0, %eax
; SSE3-NEXT:    pextrw $7, %xmm0, %r15d
; SSE3-NEXT:    addl %eax, %r15d
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pextrw $1, %xmm1, %r13d
; SSE3-NEXT:    addl %eax, %r13d
; SSE3-NEXT:    pextrw $2, %xmm1, %eax
; SSE3-NEXT:    pextrw $3, %xmm1, %ebx
; SSE3-NEXT:    addl %eax, %ebx
; SSE3-NEXT:    pextrw $4, %xmm1, %eax
; SSE3-NEXT:    pextrw $5, %xmm1, %r8d
; SSE3-NEXT:    addl %eax, %r8d
; SSE3-NEXT:    pextrw $6, %xmm1, %eax
; SSE3-NEXT:    pextrw $7, %xmm1, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pextrw $1, %xmm2, %r10d
; SSE3-NEXT:    addl %eax, %r10d
; SSE3-NEXT:    pextrw $2, %xmm2, %eax
; SSE3-NEXT:    pextrw $3, %xmm2, %r14d
; SSE3-NEXT:    addl %eax, %r14d
; SSE3-NEXT:    pextrw $4, %xmm2, %eax
; SSE3-NEXT:    pextrw $5, %xmm2, %r12d
; SSE3-NEXT:    addl %eax, %r12d
; SSE3-NEXT:    pextrw $6, %xmm2, %eax
; SSE3-NEXT:    pextrw $7, %xmm2, %r9d
; SSE3-NEXT:    addl %eax, %r9d
; SSE3-NEXT:    movd %xmm3, %eax
; SSE3-NEXT:    pextrw $1, %xmm3, %ebp
; SSE3-NEXT:    addl %eax, %ebp
; SSE3-NEXT:    pextrw $2, %xmm3, %edx
; SSE3-NEXT:    pextrw $3, %xmm3, %edi
; SSE3-NEXT:    addl %edx, %edi
; SSE3-NEXT:    pextrw $4, %xmm3, %edx
; SSE3-NEXT:    pextrw $5, %xmm3, %ecx
; SSE3-NEXT:    addl %edx, %ecx
; SSE3-NEXT:    pextrw $6, %xmm3, %edx
; SSE3-NEXT:    pextrw $7, %xmm3, %eax
; SSE3-NEXT:    addl %edx, %eax
; SSE3-NEXT:    movd %esi, %xmm8
; SSE3-NEXT:    movd %r8d, %xmm3
; SSE3-NEXT:    movd %ebx, %xmm9
; SSE3-NEXT:    movd %r13d, %xmm4
; SSE3-NEXT:    movd %r15d, %xmm10
; SSE3-NEXT:    movd %r11d, %xmm7
; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm11 = mem[0],zero,zero,zero
; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT:    movd %eax, %xmm12
; SSE3-NEXT:    movd %ecx, %xmm6
; SSE3-NEXT:    movd %edi, %xmm13
; SSE3-NEXT:    movd %ebp, %xmm5
; SSE3-NEXT:    movd %r9d, %xmm14
; SSE3-NEXT:    movd %r12d, %xmm2
; SSE3-NEXT:    movd %r14d, %xmm15
; SSE3-NEXT:    movd %r10d, %xmm1
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE3-NEXT:    popq %rbx
; SSE3-NEXT:    popq %r12
; SSE3-NEXT:    popq %r13
; SSE3-NEXT:    popq %r14
; SSE3-NEXT:    popq %r15
; SSE3-NEXT:    popq %rbp
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_vphadd_w_test:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm1, %xmm0
; SSSE3-NEXT:    phaddw %xmm3, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_vphadd_w_test:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vphaddw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vphaddw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_vphadd_w_test:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vphaddw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <16 x i16> %a, i32 0
  %vecext1 = extractelement <16 x i16> %a, i32 1
  %add = add i16 %vecext, %vecext1
  %vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
  %vecext4 = extractelement <16 x i16> %a, i32 2
  %vecext6 = extractelement <16 x i16> %a, i32 3
  %add8 = add i16 %vecext4, %vecext6
  %vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
  %vecext11 = extractelement <16 x i16> %a, i32 4
  %vecext13 = extractelement <16 x i16> %a, i32 5
  %add15 = add i16 %vecext11, %vecext13
  %vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
  %vecext18 = extractelement <16 x i16> %a, i32 6
  %vecext20 = extractelement <16 x i16> %a, i32 7
  %add22 = add i16 %vecext18, %vecext20
  %vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
  %vecext25 = extractelement <16 x i16> %a, i32 8
  %vecext27 = extractelement <16 x i16> %a, i32 9
  %add29 = add i16 %vecext25, %vecext27
  %vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 4
  %vecext32 = extractelement <16 x i16> %a, i32 10
  %vecext34 = extractelement <16 x i16> %a, i32 11
  %add36 = add i16 %vecext32, %vecext34
  %vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 5
  %vecext39 = extractelement <16 x i16> %a, i32 12
  %vecext41 = extractelement <16 x i16> %a, i32 13
  %add43 = add i16 %vecext39, %vecext41
  %vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 6
  %vecext46 = extractelement <16 x i16> %a, i32 14
  %vecext48 = extractelement <16 x i16> %a, i32 15
  %add50 = add i16 %vecext46, %vecext48
  %vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 7
  %vecext53 = extractelement <16 x i16> %b, i32 0
  %vecext55 = extractelement <16 x i16> %b, i32 1
  %add57 = add i16 %vecext53, %vecext55
  %vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 8
  %vecext60 = extractelement <16 x i16> %b, i32 2
  %vecext62 = extractelement <16 x i16> %b, i32 3
  %add64 = add i16 %vecext60, %vecext62
  %vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 9
  %vecext67 = extractelement <16 x i16> %b, i32 4
  %vecext69 = extractelement <16 x i16> %b, i32 5
  %add71 = add i16 %vecext67, %vecext69
  %vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 10
  %vecext74 = extractelement <16 x i16> %b, i32 6
  %vecext76 = extractelement <16 x i16> %b, i32 7
  %add78 = add i16 %vecext74, %vecext76
  %vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 11
  %vecext81 = extractelement <16 x i16> %b, i32 8
  %vecext83 = extractelement <16 x i16> %b, i32 9
  %add85 = add i16 %vecext81, %vecext83
  %vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
  %vecext88 = extractelement <16 x i16> %b, i32 10
  %vecext90 = extractelement <16 x i16> %b, i32 11
  %add92 = add i16 %vecext88, %vecext90
  %vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
  %vecext95 = extractelement <16 x i16> %b, i32 12
  %vecext97 = extractelement <16 x i16> %b, i32 13
  %add99 = add i16 %vecext95, %vecext97
  %vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
  %vecext102 = extractelement <16 x i16> %b, i32 14
  %vecext104 = extractelement <16 x i16> %b, i32 15
  %add106 = add i16 %vecext102, %vecext104
  %vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
  ret <16 x i16> %vecinit108
}

; Verify that we don't select horizontal subs in the following functions.

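; In not_a_hsub_1 both %B pairs are subtracted in reversed order (B1-B0 and
; B3-B2), in not_a_hsub_2 the high %B pair is reversed (B3-B2), and in
; not_a_hsub_3 the %A pair is reversed (A1-A0). Subtraction is not
; commutative (unlike the fadd in hadd_pd_test2 above), so none of these
; match the fixed element order of hsubps/hsubpd.
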
define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: not_a_hsub_1:
; SSE:       # %bb.0:
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; SSE-NEXT:    movd %xmm2, %ecx
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE-NEXT:    movd %xmm2, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT:    movd %xmm0, %edx
; SSE-NEXT:    subl %edx, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT:    movd %xmm0, %edx
; SSE-NEXT:    movd %xmm1, %esi
; SSE-NEXT:    subl %esi, %edx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE-NEXT:    movd %xmm0, %esi
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT:    movd %xmm0, %edi
; SSE-NEXT:    subl %edi, %esi
; SSE-NEXT:    movd %esi, %xmm0
; SSE-NEXT:    movd %edx, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movd %ecx, %xmm2
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: not_a_hsub_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    vpextrd $1, %xmm0, %ecx
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    vpextrd $2, %xmm0, %ecx
; AVX-NEXT:    vpextrd $3, %xmm0, %edx
; AVX-NEXT:    subl %edx, %ecx
; AVX-NEXT:    vpextrd $1, %xmm1, %edx
; AVX-NEXT:    vmovd %xmm1, %esi
; AVX-NEXT:    subl %esi, %edx
; AVX-NEXT:    vpextrd $3, %xmm1, %esi
; AVX-NEXT:    vpextrd $2, %xmm1, %edi
; AVX-NEXT:    subl %edi, %esi
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
; AVX-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
; AVX-NEXT:    vpinsrd $3, %esi, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 1
  %vecext7 = extractelement <4 x i32> %B, i32 0
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 3
  %vecext11 = extractelement <4 x i32> %B, i32 2
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
  ret <4 x i32> %vecinit13
}

define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: not_a_hsub_2:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3]
; SSE-NEXT:    subss %xmm3, %xmm2
; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT:    subss %xmm3, %xmm0
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3]
; SSE-NEXT:    movaps %xmm1, %xmm3
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSE-NEXT:    subss %xmm3, %xmm2
; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE-NEXT:    subss %xmm3, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: not_a_hsub_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX-NEXT:    vsubss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT:    vsubss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT:    vsubss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX-NEXT:    vsubss %xmm3, %xmm1, %xmm1
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 3
  %vecext7 = extractelement <4 x float> %B, i32 2
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
  ret <4 x float> %vecinit13
}

define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: not_a_hsub_3:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm1, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT:    subsd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm0, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT:    subsd %xmm0, %xmm2
; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT:    movapd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: not_a_hsub_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vsubsd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vsubsd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %B, i32 0
  %vecext1 = extractelement <2 x double> %B, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 1
  %vecext2 = extractelement <2 x double> %A, i32 1
  %vecext3 = extractelement <2 x double> %A, i32 0
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
  ret <2 x double> %vecinit2
}

; Test AVX horizontal add/sub of packed single/double precision
; floating point values from 256-bit vectors.

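; Unlike avx_vhadd_pd_test above, these functions interleave the %a and %b
; pairs per 128-bit lane exactly as vhaddps/vhsubps/vhaddpd/vhsubpd ymm
; produce them, so a single ymm instruction is selected with no extra
; shuffles.
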
define <8 x float> @avx_vhadd_ps(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: avx_vhadd_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm2, %xmm0
; SSE-NEXT:    haddps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhadd_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <8 x float> %b, i32 0
  %vecext7 = extractelement <8 x float> %b, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <8 x float> %vecinit5, float %add8, i32 2
  %vecext10 = extractelement <8 x float> %b, i32 2
  %vecext11 = extractelement <8 x float> %b, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <8 x float> %vecinit9, float %add12, i32 3
  %vecext14 = extractelement <8 x float> %a, i32 4
  %vecext15 = extractelement <8 x float> %a, i32 5
  %add16 = fadd float %vecext14, %vecext15
  %vecinit17 = insertelement <8 x float> %vecinit13, float %add16, i32 4
  %vecext18 = extractelement <8 x float> %a, i32 6
  %vecext19 = extractelement <8 x float> %a, i32 7
  %add20 = fadd float %vecext18, %vecext19
  %vecinit21 = insertelement <8 x float> %vecinit17, float %add20, i32 5
  %vecext22 = extractelement <8 x float> %b, i32 4
  %vecext23 = extractelement <8 x float> %b, i32 5
  %add24 = fadd float %vecext22, %vecext23
  %vecinit25 = insertelement <8 x float> %vecinit21, float %add24, i32 6
  %vecext26 = extractelement <8 x float> %b, i32 6
  %vecext27 = extractelement <8 x float> %b, i32 7
  %add28 = fadd float %vecext26, %vecext27
  %vecinit29 = insertelement <8 x float> %vecinit25, float %add28, i32 7
  ret <8 x float> %vecinit29
}

define <8 x float> @avx_vhsub_ps(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: avx_vhsub_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubps %xmm2, %xmm0
; SSE-NEXT:    hsubps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhsub_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %sub, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %sub4, i32 1
  %vecext6 = extractelement <8 x float> %b, i32 0
  %vecext7 = extractelement <8 x float> %b, i32 1
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <8 x float> %vecinit5, float %sub8, i32 2
  %vecext10 = extractelement <8 x float> %b, i32 2
  %vecext11 = extractelement <8 x float> %b, i32 3
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <8 x float> %vecinit9, float %sub12, i32 3
  %vecext14 = extractelement <8 x float> %a, i32 4
  %vecext15 = extractelement <8 x float> %a, i32 5
  %sub16 = fsub float %vecext14, %vecext15
  %vecinit17 = insertelement <8 x float> %vecinit13, float %sub16, i32 4
  %vecext18 = extractelement <8 x float> %a, i32 6
  %vecext19 = extractelement <8 x float> %a, i32 7
  %sub20 = fsub float %vecext18, %vecext19
  %vecinit21 = insertelement <8 x float> %vecinit17, float %sub20, i32 5
  %vecext22 = extractelement <8 x float> %b, i32 4
  %vecext23 = extractelement <8 x float> %b, i32 5
  %sub24 = fsub float %vecext22, %vecext23
  %vecinit25 = insertelement <8 x float> %vecinit21, float %sub24, i32 6
  %vecext26 = extractelement <8 x float> %b, i32 6
  %vecext27 = extractelement <8 x float> %b, i32 7
  %sub28 = fsub float %vecext26, %vecext27
  %vecinit29 = insertelement <8 x float> %vecinit25, float %sub28, i32 7
  ret <8 x float> %vecinit29
}

define <4 x double> @avx_hadd_pd(<4 x double> %a, <4 x double> %b) {
; SSE-LABEL: avx_hadd_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm2, %xmm0
; SSE-NEXT:    haddpd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_hadd_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %a, i32 0
  %vecext1 = extractelement <4 x double> %a, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %add, i32 0
  %vecext2 = extractelement <4 x double> %b, i32 0
  %vecext3 = extractelement <4 x double> %b, i32 1
  %add4 = fadd double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
  %vecext6 = extractelement <4 x double> %a, i32 2
  %vecext7 = extractelement <4 x double> %a, i32 3
  %add8 = fadd double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
  %vecext10 = extractelement <4 x double> %b, i32 2
  %vecext11 = extractelement <4 x double> %b, i32 3
  %add12 = fadd double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
  ret <4 x double> %vecinit13
}

define <4 x double> @avx_hsub_pd(<4 x double> %a, <4 x double> %b) {
; SSE-LABEL: avx_hsub_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubpd %xmm2, %xmm0
; SSE-NEXT:    hsubpd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_hsub_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %a, i32 0
  %vecext1 = extractelement <4 x double> %a, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <4 x double> %b, i32 0
  %vecext3 = extractelement <4 x double> %b, i32 1
  %sub4 = fsub double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
  %vecext6 = extractelement <4 x double> %a, i32 2
  %vecext7 = extractelement <4 x double> %a, i32 3
  %sub8 = fsub double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
  %vecext10 = extractelement <4 x double> %b, i32 2
  %vecext11 = extractelement <4 x double> %b, i32 3
  %sub12 = fsub double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
  ret <4 x double> %vecinit13
}

; Test AVX2 horizontal add of packed integer values from 256-bit vectors.

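; As with the FP cases, the lane-interleaved element order lets AVX2 use one
; ymm vphaddd/vphaddw directly; AVX1 still splits into two xmm ops and
; recombines with vinsertf128.
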
define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
; SSE3-LABEL: avx2_hadd_d:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; SSE3-NEXT:    movd %xmm4, %r8d
; SSE3-NEXT:    addl %ecx, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSE3-NEXT:    movd %xmm4, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %r9d
; SSE3-NEXT:    addl %edx, %r9d
; SSE3-NEXT:    movd %xmm2, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %edx, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    addl %edx, %edi
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %r10d
; SSE3-NEXT:    addl %eax, %r10d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movd %xmm3, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %r11d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    addl %r11d, %eax
; SSE3-NEXT:    movd %edi, %xmm0
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %r9d, %xmm2
; SSE3-NEXT:    movd %r8d, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    movd %eax, %xmm1
; SSE3-NEXT:    movd %edx, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT:    movd %ecx, %xmm3
; SSE3-NEXT:    movd %r10d, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_hadd_d:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm2, %xmm0
; SSSE3-NEXT:    phaddd %xmm3, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_hadd_d:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vphaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_hadd_d:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <8 x i32> %b, i32 0
  %vecext7 = extractelement <8 x i32> %b, i32 1
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <8 x i32> %b, i32 2
  %vecext11 = extractelement <8 x i32> %b, i32 3
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
  %vecext14 = extractelement <8 x i32> %a, i32 4
  %vecext15 = extractelement <8 x i32> %a, i32 5
  %add16 = add i32 %vecext14, %vecext15
  %vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
  %vecext18 = extractelement <8 x i32> %a, i32 6
  %vecext19 = extractelement <8 x i32> %a, i32 7
  %add20 = add i32 %vecext18, %vecext19
  %vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
  %vecext22 = extractelement <8 x i32> %b, i32 4
  %vecext23 = extractelement <8 x i32> %b, i32 5
  %add24 = add i32 %vecext22, %vecext23
  %vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
  %vecext26 = extractelement <8 x i32> %b, i32 6
  %vecext27 = extractelement <8 x i32> %b, i32 7
  %add28 = add i32 %vecext26, %vecext27
  %vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
  ret <8 x i32> %vecinit29
}

define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) nounwind {
; SSE3-LABEL: avx2_hadd_w:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pushq %rbp
; SSE3-NEXT:    pushq %r15
; SSE3-NEXT:    pushq %r14
; SSE3-NEXT:    pushq %r13
; SSE3-NEXT:    pushq %r12
; SSE3-NEXT:    pushq %rbx
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pextrw $1, %xmm0, %r10d
; SSE3-NEXT:    addl %eax, %r10d
; SSE3-NEXT:    pextrw $2, %xmm0, %eax
; SSE3-NEXT:    pextrw $3, %xmm0, %r11d
; SSE3-NEXT:    addl %eax, %r11d
; SSE3-NEXT:    pextrw $4, %xmm0, %eax
; SSE3-NEXT:    pextrw $5, %xmm0, %r12d
; SSE3-NEXT:    addl %eax, %r12d
; SSE3-NEXT:    pextrw $6, %xmm0, %eax
; SSE3-NEXT:    pextrw $7, %xmm0, %r13d
; SSE3-NEXT:    addl %eax, %r13d
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pextrw $1, %xmm1, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT:    pextrw $2, %xmm1, %eax
; SSE3-NEXT:    pextrw $3, %xmm1, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT:    pextrw $4, %xmm1, %eax
; SSE3-NEXT:    pextrw $5, %xmm1, %r14d
; SSE3-NEXT:    addl %eax, %r14d
; SSE3-NEXT:    pextrw $6, %xmm1, %esi
; SSE3-NEXT:    pextrw $7, %xmm1, %r15d
; SSE3-NEXT:    addl %esi, %r15d
; SSE3-NEXT:    movd %xmm2, %esi
; SSE3-NEXT:    pextrw $1, %xmm2, %ebp
; SSE3-NEXT:    addl %esi, %ebp
; SSE3-NEXT:    pextrw $2, %xmm2, %esi
; SSE3-NEXT:    pextrw $3, %xmm2, %edi
; SSE3-NEXT:    addl %esi, %edi
; SSE3-NEXT:    pextrw $4, %xmm2, %esi
; SSE3-NEXT:    pextrw $5, %xmm2, %eax
; SSE3-NEXT:    addl %esi, %eax
; SSE3-NEXT:    pextrw $6, %xmm2, %esi
; SSE3-NEXT:    pextrw $7, %xmm2, %ecx
; SSE3-NEXT:    addl %esi, %ecx
; SSE3-NEXT:    movd %xmm3, %ebx
; SSE3-NEXT:    pextrw $1, %xmm3, %r9d
; SSE3-NEXT:    addl %ebx, %r9d
; SSE3-NEXT:    pextrw $2, %xmm3, %edx
; SSE3-NEXT:    pextrw $3, %xmm3, %ebx
; SSE3-NEXT:    addl %edx, %ebx
; SSE3-NEXT:    pextrw $4, %xmm3, %edx
; SSE3-NEXT:    pextrw $5, %xmm3, %esi
; SSE3-NEXT:    addl %edx, %esi
; SSE3-NEXT:    pextrw $6, %xmm3, %r8d
; SSE3-NEXT:    pextrw $7, %xmm3, %edx
; SSE3-NEXT:    addl %r8d, %edx
; SSE3-NEXT:    movd %ecx, %xmm8
; SSE3-NEXT:    movd %eax, %xmm3
; SSE3-NEXT:    movd %edi, %xmm9
; SSE3-NEXT:    movd %ebp, %xmm4
; SSE3-NEXT:    movd %r13d, %xmm10
; SSE3-NEXT:    movd %r12d, %xmm7
; SSE3-NEXT:    movd %r11d, %xmm11
; SSE3-NEXT:    movd %r10d, %xmm0
; SSE3-NEXT:    movd %edx, %xmm12
; SSE3-NEXT:    movd %esi, %xmm6
; SSE3-NEXT:    movd %ebx, %xmm13
; SSE3-NEXT:    movd %r9d, %xmm5
; SSE3-NEXT:    movd %r15d, %xmm14
; SSE3-NEXT:    movd %r14d, %xmm2
; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm15 = mem[0],zero,zero,zero
; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE3-NEXT:    popq %rbx
; SSE3-NEXT:    popq %r12
; SSE3-NEXT:    popq %r13
; SSE3-NEXT:    popq %r14
; SSE3-NEXT:    popq %r15
; SSE3-NEXT:    popq %rbp
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_hadd_w:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm2, %xmm0
; SSSE3-NEXT:    phaddw %xmm3, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_hadd_w:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vphaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_hadd_w:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <16 x i16> %a, i32 0
  %vecext1 = extractelement <16 x i16> %a, i32 1
  %add = add i16 %vecext, %vecext1
  %vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
  %vecext4 = extractelement <16 x i16> %a, i32 2
  %vecext6 = extractelement <16 x i16> %a, i32 3
  %add8 = add i16 %vecext4, %vecext6
  %vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
  %vecext11 = extractelement <16 x i16> %a, i32 4
  %vecext13 = extractelement <16 x i16> %a, i32 5
  %add15 = add i16 %vecext11, %vecext13
  %vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
  %vecext18 = extractelement <16 x i16> %a, i32 6
  %vecext20 = extractelement <16 x i16> %a, i32 7
  %add22 = add i16 %vecext18, %vecext20
  %vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
  %vecext25 = extractelement <16 x i16> %a, i32 8
  %vecext27 = extractelement <16 x i16> %a, i32 9
  %add29 = add i16 %vecext25, %vecext27
  %vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 8
  %vecext32 = extractelement <16 x i16> %a, i32 10
  %vecext34 = extractelement <16 x i16> %a, i32 11
  %add36 = add i16 %vecext32, %vecext34
  %vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 9
  %vecext39 = extractelement <16 x i16> %a, i32 12
  %vecext41 = extractelement <16 x i16> %a, i32 13
  %add43 = add i16 %vecext39, %vecext41
  %vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 10
  %vecext46 = extractelement <16 x i16> %a, i32 14
  %vecext48 = extractelement <16 x i16> %a, i32 15
  %add50 = add i16 %vecext46, %vecext48
  %vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 11
  %vecext53 = extractelement <16 x i16> %b, i32 0
  %vecext55 = extractelement <16 x i16> %b, i32 1
  %add57 = add i16 %vecext53, %vecext55
  %vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 4
  %vecext60 = extractelement <16 x i16> %b, i32 2
  %vecext62 = extractelement <16 x i16> %b, i32 3
  %add64 = add i16 %vecext60, %vecext62
  %vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 5
  %vecext67 = extractelement <16 x i16> %b, i32 4
  %vecext69 = extractelement <16 x i16> %b, i32 5
  %add71 = add i16 %vecext67, %vecext69
  %vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 6
  %vecext74 = extractelement <16 x i16> %b, i32 6
  %vecext76 = extractelement <16 x i16> %b, i32 7
  %add78 = add i16 %vecext74, %vecext76
  %vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 7
  %vecext81 = extractelement <16 x i16> %b, i32 8
  %vecext83 = extractelement <16 x i16> %b, i32 9
  %add85 = add i16 %vecext81, %vecext83
  %vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
  %vecext88 = extractelement <16 x i16> %b, i32 10
  %vecext90 = extractelement <16 x i16> %b, i32 11
  %add92 = add i16 %vecext88, %vecext90
  %vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
  %vecext95 = extractelement <16 x i16> %b, i32 12
  %vecext97 = extractelement <16 x i16> %b, i32 13
  %add99 = add i16 %vecext95, %vecext97
  %vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
  %vecext102 = extractelement <16 x i16> %b, i32 14
  %vecext104 = extractelement <16 x i16> %b, i32 15
  %add106 = add i16 %vecext102, %vecext104
  %vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
  ret <16 x i16> %vecinit108
}
