; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3              | FileCheck %s --check-prefixes=SSE,SSE-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops    | FileCheck %s --check-prefixes=SSE,SSE-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx                | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops      | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2               | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops     | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl           | FileCheck %s --check-prefixes=AVX,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl,fast-hops | FileCheck %s --check-prefixes=AVX,AVX512

; Verify that we correctly fold horizontal binop even in the presence of UNDEFs.

define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test14_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test14_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %b, i32 2
  %vecext3 = extractelement <8 x i32> %b, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 3
  ret <8 x i32> %vecinit5
}

; On AVX2, the following sequence can be folded into a single horizontal add.
; If the Subtarget doesn't support AVX2, then we avoid emitting two packed
; integer horizontal adds instead of two scalar adds followed by vector inserts.
define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-SLOW-LABEL: test15_undef:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movd %xmm0, %eax
; SSE-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-SLOW-NEXT:    movd %xmm0, %ecx
; SSE-SLOW-NEXT:    addl %eax, %ecx
; SSE-SLOW-NEXT:    movd %xmm3, %eax
; SSE-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
; SSE-SLOW-NEXT:    movd %xmm0, %edx
; SSE-SLOW-NEXT:    addl %eax, %edx
; SSE-SLOW-NEXT:    movd %ecx, %xmm0
; SSE-SLOW-NEXT:    movd %edx, %xmm1
; SSE-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: test15_undef:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    movdqa %xmm3, %xmm1
; SSE-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSE-FAST-NEXT:    phaddd %xmm3, %xmm1
; SSE-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: test15_undef:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vpextrd $1, %xmm0, %ecx
; AVX1-SLOW-NEXT:    addl %eax, %ecx
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vpextrd $1, %xmm0, %edx
; AVX1-SLOW-NEXT:    addl %eax, %edx
; AVX1-SLOW-NEXT:    vmovd %ecx, %xmm0
; AVX1-SLOW-NEXT:    vmovd %edx, %xmm1
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test15_undef:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT:    vphaddd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test15_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test15_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %b, i32 4
  %vecext3 = extractelement <8 x i32> %b, i32 5
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 6
  ret <8 x i32> %vecinit5
}

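; Both sums land in the upper 128-bit half of the result: a[4]+a[5] in lane 4
; and b[6]+b[7] in lane 7, so AVX2/AVX512 can use a single 256-bit vphaddd
; while AVX1 has to extract the high halves first.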
define <8 x i32> @PR40243_alt(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: PR40243_alt:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: PR40243_alt:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR40243_alt:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: PR40243_alt:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %a4 = extractelement <8 x i32> %a, i32 4
  %a5 = extractelement <8 x i32> %a, i32 5
  %add4 = add i32 %a4, %a5
  %b6 = extractelement <8 x i32> %b, i32 6
  %b7 = extractelement <8 x i32> %b, i32 7
  %add7 = add i32 %b6, %b7
  %r4 = insertelement <8 x i32> undef, i32 %add4, i32 4
  %r = insertelement <8 x i32> %r4, i32 %add7, i32 7
  ret <8 x i32> %r
}

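; Only the low two result lanes are used (a[0]+a[1] and a[2]+a[3]), so a single
; 128-bit phaddd of the low half of %a with itself suffices and the rest of the
; result is left undef.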
define <8 x i32> @test16_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test16_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test16_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  ret <8 x i32> %vecinit5
}

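; Same pattern as test16_undef, widened to v16i32: the demanded lanes still fit
; in the low 128 bits, so the generated code is identical.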
define <16 x i32> @test16_v16i32_undef(<16 x i32> %a, <16 x i32> %b) {
; SSE-LABEL: test16_v16i32_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test16_v16i32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <16 x i32> %a, i32 0
  %vecext1 = extractelement <16 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <16 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <16 x i32> %a, i32 2
  %vecext3 = extractelement <16 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <16 x i32> %vecinit, i32 %add4, i32 1
  ret <16 x i32> %vecinit5
}

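; All four sums come from adjacent pairs of %a (a[0]+a[1] ... a[6]+a[7]) and
; fill result lanes 0-3, i.e. a plain 128-bit horizontal add of the two halves
; of %a.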
define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test17_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test17_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test17_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test17_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add1 = add i32 %vecext, %vecext1
  %vecinit1 = insertelement <8 x i32> undef, i32 %add1, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add2 = add i32 %vecext2, %vecext3
  %vecinit2 = insertelement <8 x i32> %vecinit1, i32 %add2, i32 1
  %vecext4 = extractelement <8 x i32> %a, i32 4
  %vecext5 = extractelement <8 x i32> %a, i32 5
  %add3 = add i32 %vecext4, %vecext5
  %vecinit3 = insertelement <8 x i32> %vecinit2, i32 %add3, i32 2
  %vecext6 = extractelement <8 x i32> %a, i32 6
  %vecext7 = extractelement <8 x i32> %a, i32 7
  %add4 = add i32 %vecext6, %vecext7
  %vecinit4 = insertelement <8 x i32> %vecinit3, i32 %add4, i32 3
  ret <8 x i32> %vecinit4
}

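; v16i32 version of test17_undef: only elements 0-7 of %a are summed, so the
; same 128-bit phaddd of the two low 128-bit halves is produced.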
define <16 x i32> @test17_v16i32_undef(<16 x i32> %a, <16 x i32> %b) {
; SSE-LABEL: test17_v16i32_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test17_v16i32_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test17_v16i32_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test17_v16i32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %vecext = extractelement <16 x i32> %a, i32 0
  %vecext1 = extractelement <16 x i32> %a, i32 1
  %add1 = add i32 %vecext, %vecext1
  %vecinit1 = insertelement <16 x i32> undef, i32 %add1, i32 0
  %vecext2 = extractelement <16 x i32> %a, i32 2
  %vecext3 = extractelement <16 x i32> %a, i32 3
  %add2 = add i32 %vecext2, %vecext3
  %vecinit2 = insertelement <16 x i32> %vecinit1, i32 %add2, i32 1
  %vecext4 = extractelement <16 x i32> %a, i32 4
  %vecext5 = extractelement <16 x i32> %a, i32 5
  %add3 = add i32 %vecext4, %vecext5
  %vecinit3 = insertelement <16 x i32> %vecinit2, i32 %add3, i32 2
  %vecext6 = extractelement <16 x i32> %a, i32 6
  %vecext7 = extractelement <16 x i32> %a, i32 7
  %add4 = add i32 %vecext6, %vecext7
  %vecinit4 = insertelement <16 x i32> %vecinit3, i32 %add4, i32 3
  ret <16 x i32> %vecinit4
}