; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512
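;
; Lowering of the llvm.vector.reduce.add.* intrinsics to x86 shuffle/add
; reduction sequences at each SSE/AVX/AVX512 feature level.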

;
; vXi64
;
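; The checks below reduce by repeated halving: 256/512-bit inputs are added
; down to a single XMM register, a pshufd lines up the upper i64 lane with
; the lower one for a final paddq, and movq extracts the scalar result.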

define i64 @test_v2i64(<2 x i64> %a0) {
; SSE-LABEL: test_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT:    paddq %xmm0, %xmm1
; SSE-NEXT:    movq %xmm1, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovq %xmm0, %rax
; AVX512-NEXT:    retq
  %1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a0)
  ret i64 %1
}

define i64 @test_v4i64(<4 x i64> %a0) {
; SSE-LABEL: test_v4i64:
; SSE:       # %bb.0:
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT:    paddq %xmm0, %xmm1
; SSE-NEXT:    movq %xmm1, %rax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovq %xmm0, %rax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a0)
  ret i64 %1
}

define i64 @test_v8i64(<8 x i64> %a0) {
; SSE-LABEL: test_v8i64:
; SSE:       # %bb.0:
; SSE-NEXT:    paddq %xmm3, %xmm1
; SSE-NEXT:    paddq %xmm2, %xmm1
; SSE-NEXT:    paddq %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    movq %xmm0, %rax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v8i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovq %xmm0, %rax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a0)
  ret i64 %1
}

define i64 @test_v16i64(<16 x i64> %a0) {
; SSE-LABEL: test_v16i64:
; SSE:       # %bb.0:
; SSE-NEXT:    paddq %xmm6, %xmm2
; SSE-NEXT:    paddq %xmm7, %xmm3
; SSE-NEXT:    paddq %xmm5, %xmm3
; SSE-NEXT:    paddq %xmm1, %xmm3
; SSE-NEXT:    paddq %xmm4, %xmm2
; SSE-NEXT:    paddq %xmm3, %xmm2
; SSE-NEXT:    paddq %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT:    paddq %xmm2, %xmm0
; SSE-NEXT:    movq %xmm0, %rax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v16i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpaddq %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovq %xmm0, %rax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a0)
  ret i64 %1
}

;
; vXi32
;
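; Same halving strategy as the i64 cases, with one extra shuffle+add step
; for the last two i32 lanes. The AVX1-FAST runs (+fast-hops) fold some of
; the in-register steps into vphaddd instead of a vpshufd/vpaddd pair.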

define i32 @test_v2i32(<2 x i32> %a0) {
; SSE-LABEL: test_v2i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2i32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a0)
  ret i32 %1
}

define i32 @test_v4i32(<4 x i32> %a0) {
; SSE-LABEL: test_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4i32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a0)
  ret i32 %1
}

define i32 @test_v8i32(<8 x i32> %a0) {
; SSE-LABEL: test_v8i32:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v8i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v8i32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a0)
  ret i32 %1
}

define i32 @test_v16i32(<16 x i32> %a0) {
; SSE-LABEL: test_v16i32:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm3, %xmm1
; SSE-NEXT:    paddd %xmm2, %xmm1
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v16i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v16i32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a0)
  ret i32 %1
}

define i32 @test_v32i32(<32 x i32> %a0) {
; SSE-LABEL: test_v32i32:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm6, %xmm2
; SSE-NEXT:    paddd %xmm7, %xmm3
; SSE-NEXT:    paddd %xmm5, %xmm3
; SSE-NEXT:    paddd %xmm1, %xmm3
; SSE-NEXT:    paddd %xmm4, %xmm2
; SSE-NEXT:    paddd %xmm3, %xmm2
; SSE-NEXT:    paddd %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT:    paddd %xmm2, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v32i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpaddd %xmm3, %xmm1, %xmm4
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-SLOW-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v32i32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpaddd %xmm3, %xmm1, %xmm4
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v32i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %a0)
  ret i32 %1
}

;
; vXi16
;
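; The final i16 step uses psrld $16 to bring the upper i16 lane of the low
; dword down for the last paddw; the "kill" comments record that only the
; low 16 bits of eax hold the defined result.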

define i16 @test_v2i16(<2 x i16> %a0) {
; SSE-LABEL: test_v2i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $16, %xmm1
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v2i16:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v2i16:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v2i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v2i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    retq
  %1 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a0)
  ret i16 %1
}

define i16 @test_v4i16(<4 x i16> %a0) {
; SSE-LABEL: test_v4i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psrld $16, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v4i16:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v4i16:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v4i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    retq
  %1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a0)
  ret i16 %1
}

define i16 @test_v8i16(<8 x i16> %a0) {
; SSE-LABEL: test_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $16, %xmm1
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v8i16:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v8i16:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    retq
  %1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a0)
  ret i16 %1
}

define i16 @test_v16i16(<16 x i16> %a0) {
; SSE-LABEL: test_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $16, %xmm1
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v16i16:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v16i16:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a0)
  ret i16 %1
}

define i16 @test_v32i16(<32 x i16> %a0) {
; SSE-LABEL: test_v32i16:
; SSE:       # %bb.0:
; SSE-NEXT:    paddw %xmm3, %xmm1
; SSE-NEXT:    paddw %xmm2, %xmm1
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psrld $16, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v32i16:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-SLOW-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
; AVX1-SLOW-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v32i16:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-FAST-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
; AVX1-FAST-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v32i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %a0)
  ret i16 %1
}

define i16 @test_v64i16(<64 x i16> %a0) {
; SSE-LABEL: test_v64i16:
; SSE:       # %bb.0:
; SSE-NEXT:    paddw %xmm6, %xmm2
; SSE-NEXT:    paddw %xmm7, %xmm3
; SSE-NEXT:    paddw %xmm5, %xmm3
; SSE-NEXT:    paddw %xmm1, %xmm3
; SSE-NEXT:    paddw %xmm4, %xmm2
; SSE-NEXT:    paddw %xmm3, %xmm2
; SSE-NEXT:    paddw %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT:    paddw %xmm2, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psrld $16, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v64i16:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpaddw %xmm3, %xmm1, %xmm4
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm4, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v64i16:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpaddw %xmm3, %xmm1, %xmm4
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT:    vpaddw %xmm4, %xmm2, %xmm2
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v64i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v64i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %a0)
  ret i16 %1
}

;
; vXi8
;
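; Byte reductions narrow to 128 bits with paddb, then finish with psadbw
; against zero, which sums the bytes of each 64-bit half in a single
; instruction, avoiding a per-byte shuffle chain.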

define i8 @test_v2i8(<2 x i8> %a0) {
; SSE-LABEL: test_v2i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    paddb %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $al killed $al killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $al killed $al killed $eax
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    retq
  %1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %a0)
  ret i8 %1
}

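; The *_load variants check that small vector loads are widened (movzwl,
; movd or movq into an XMM register) before the same reduction sequence.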
define i8 @test_v2i8_load(<2 x i8>* %p) {
; SSE-LABEL: test_v2i8_load:
; SSE:       # %bb.0:
; SSE-NEXT:    movzwl (%rdi), %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    paddb %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $al killed $al killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2i8_load:
; AVX:       # %bb.0:
; AVX-NEXT:    movzwl (%rdi), %eax
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $al killed $al killed $eax
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2i8_load:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movzwl (%rdi), %eax
; AVX512-NEXT:    vmovd %eax, %xmm0
; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    retq
  %a0 = load <2 x i8>, <2 x i8>* %p
  %1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %a0)
  ret i8 %1
}

define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    psadbw %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT:    psadbw %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $al killed $al killed $eax
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    retq
  %1 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %a0)
  ret i8 %1
}

define i8 @test_v4i8_load(<4 x i8>* %p) {
; SSE-LABEL: test_v4i8_load:
; SSE:       # %bb.0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psadbw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $al killed $al killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4i8_load:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $al killed $al killed $eax
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4i8_load:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    retq
  %a0 = load <4 x i8>, <4 x i8>* %p
  %1 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %a0)
  ret i8 %1
}

define i8 @test_v8i8(<8 x i8> %a0) {
; SSE-LABEL: test_v8i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psadbw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $al killed $al killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $al killed $al killed $eax
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    retq
  %1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a0)
  ret i8 %1
}

define i8 @test_v8i8_load(<8 x i8>* %p) {
; SSE-LABEL: test_v8i8_load:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psadbw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $al killed $al killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i8_load:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $al killed $al killed $eax
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8i8_load:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    retq
  %a0 = load <8 x i8>, <8 x i8>* %p
  %1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a0)
  ret i8 %1
}

define i8 @test_v16i8(<16 x i8> %a0) {
; SSE-LABEL: test_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT:    paddb %xmm0, %xmm1
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    psadbw %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    # kill: def $al killed $al killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $al killed $al killed $eax
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    retq
  %1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a0)
  ret i8 %1
}

define i8 @test_v32i8(<32 x i8> %a0) {
; SSE-LABEL: test_v32i8:
; SSE:       # %bb.0:
; SSE-NEXT:    paddb %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT:    paddb %xmm0, %xmm1
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    psadbw %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    # kill: def $al killed $al killed $eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $al killed $al killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $al killed $al killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %a0)
  ret i8 %1
}

define i8 @test_v64i8(<64 x i8> %a0) {
; SSE-LABEL: test_v64i8:
; SSE:       # %bb.0:
; SSE-NEXT:    paddb %xmm3, %xmm1
; SSE-NEXT:    paddb %xmm2, %xmm1
; SSE-NEXT:    paddb %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT:    paddb %xmm1, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psadbw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $al killed $al killed $eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v64i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $al killed $al killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v64i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $al killed $al killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v64i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %a0)
  ret i8 %1
}

define i8 @test_v128i8(<128 x i8> %a0) {
; SSE-LABEL: test_v128i8:
; SSE:       # %bb.0:
; SSE-NEXT:    paddb %xmm7, %xmm3
; SSE-NEXT:    paddb %xmm5, %xmm3
; SSE-NEXT:    paddb %xmm1, %xmm3
; SSE-NEXT:    paddb %xmm6, %xmm2
; SSE-NEXT:    paddb %xmm4, %xmm2
; SSE-NEXT:    paddb %xmm3, %xmm2
; SSE-NEXT:    paddb %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE-NEXT:    paddb %xmm2, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psadbw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $al killed $al killed $eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v128i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT:    vpaddb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT:    vpaddb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT:    vpaddb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $al killed $al killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v128i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $al killed $al killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v128i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %a0)
  ret i8 %1
}

declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)

declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)

declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)

declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)