1; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
2; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
3; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3
4; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4,SSE41
5; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4,SSE42
6; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=AVX1
7; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
8; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
9; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
10; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
11
12define i32 @reduce_i64(i32 %arg) {
13; SSE2-LABEL: 'reduce_i64'
14; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
15; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
16; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
17; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
18; SSE2-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
19; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
20;
21; SSSE3-LABEL: 'reduce_i64'
22; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
23; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
24; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
25; SSSE3-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
26; SSSE3-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
27; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
28;
29; SSE41-LABEL: 'reduce_i64'
30; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
31; SSE41-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
32; SSE41-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
33; SSE41-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
34; SSE41-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
35; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
36;
37; SSE42-LABEL: 'reduce_i64'
38; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
39; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
40; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
41; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
42; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
43; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
44;
45; AVX1-LABEL: 'reduce_i64'
46; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
47; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
48; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
49; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
50; AVX1-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
51; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
52;
53; AVX2-LABEL: 'reduce_i64'
54; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
55; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
56; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
57; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
58; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
59; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
60;
61; AVX512-LABEL: 'reduce_i64'
62; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
63; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
64; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
65; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
66; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
67; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
68;
69  %V1  = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
70  %V2  = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
71  %V4  = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
72  %V8  = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
73  %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
74  ret i32 undef
75}
76
77define i32 @reduce_i32(i32 %arg) {
78; SSE2-LABEL: 'reduce_i32'
79; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
80; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
81; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
82; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
83; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
84; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
85;
86; SSSE3-LABEL: 'reduce_i32'
87; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
88; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
89; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
90; SSSE3-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
91; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
92; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
93;
94; SSE4-LABEL: 'reduce_i32'
95; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
96; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
97; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
98; SSE4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
99; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
100; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
101;
102; AVX1-LABEL: 'reduce_i32'
103; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
104; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
105; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
106; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
107; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
108; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
109;
110; AVX2-LABEL: 'reduce_i32'
111; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
112; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
113; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
114; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
115; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
116; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
117;
118; AVX512-LABEL: 'reduce_i32'
119; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
120; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
121; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
122; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
123; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
124; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
125;
126  %V2  = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
127  %V4  = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
128  %V8  = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
129  %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
130  %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
131  ret i32 undef
132}
133
134define i32 @reduce_i16(i32 %arg) {
135; SSE2-LABEL: 'reduce_i16'
136; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
137; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
138; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
139; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
140; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
141; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
142; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
143;
144; SSSE3-LABEL: 'reduce_i16'
145; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
146; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
147; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
148; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
149; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
150; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
151; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
152;
153; SSE4-LABEL: 'reduce_i16'
154; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
155; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
156; SSE4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
157; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
158; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
159; SSE4-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
160; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
161;
162; AVX1-LABEL: 'reduce_i16'
163; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
164; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
165; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
166; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
167; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
168; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
169; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
170;
171; AVX2-LABEL: 'reduce_i16'
172; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
173; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
174; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
175; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
176; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
177; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
178; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
179;
180; AVX512F-LABEL: 'reduce_i16'
181; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
182; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
183; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
184; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
185; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
186; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
187; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
188;
189; AVX512BW-LABEL: 'reduce_i16'
190; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
191; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
192; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
193; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
194; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
195; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
196; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
197;
198; AVX512DQ-LABEL: 'reduce_i16'
199; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
200; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
201; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
202; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
203; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
204; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
205; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
206;
207  %V2  = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
208  %V4  = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
209  %V8  = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
210  %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
211  %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
212  %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
213  ret i32 undef
214}
215
216define i32 @reduce_i8(i32 %arg) {
217; SSE2-LABEL: 'reduce_i8'
218; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
219; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
220; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
221; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
222; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
223; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
224; SSE2-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
225; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
226;
227; SSSE3-LABEL: 'reduce_i8'
228; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
229; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
230; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
231; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
232; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
233; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
234; SSSE3-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
235; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
236;
237; SSE4-LABEL: 'reduce_i8'
238; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
239; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
240; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
241; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
242; SSE4-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
243; SSE4-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
244; SSE4-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
245; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
246;
247; AVX1-LABEL: 'reduce_i8'
248; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
249; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
250; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
251; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
252; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
253; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
254; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
255; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
256;
257; AVX2-LABEL: 'reduce_i8'
258; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
259; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
260; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
261; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
262; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
263; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
264; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
265; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
266;
267; AVX512F-LABEL: 'reduce_i8'
268; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
269; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
270; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
271; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
272; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
273; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
274; AVX512F-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
275; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
276;
277; AVX512BW-LABEL: 'reduce_i8'
278; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
279; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
280; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
281; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
282; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
283; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
284; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
285; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
286;
287; AVX512DQ-LABEL: 'reduce_i8'
288; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
289; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
290; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
291; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
292; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
293; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
294; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
295; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
296;
297  %V2   = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
298  %V4   = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
299  %V8   = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
300  %V16  = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
301  %V32  = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
302  %V64  = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
303  %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
304  ret i32 undef
305}
306
307declare i64 @llvm.vector.reduce.smax.v1i64(<1 x i64>)
308declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>)
309declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>)
310declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>)
311declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>)
312
313declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>)
314declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
315declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>)
316declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>)
317declare i32 @llvm.vector.reduce.smax.v32i32(<32 x i32>)
318
319declare i16 @llvm.vector.reduce.smax.v2i16(<2 x i16>)
320declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>)
321declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>)
322declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>)
323declare i16 @llvm.vector.reduce.smax.v32i16(<32 x i16>)
324declare i16 @llvm.vector.reduce.smax.v64i16(<64 x i16>)
325
326declare i8 @llvm.vector.reduce.smax.v2i8(<2 x i8>)
327declare i8 @llvm.vector.reduce.smax.v4i8(<4 x i8>)
328declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>)
329declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>)
330declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>)
331declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>)
332declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>)
333