• Home
  • History
  • Annotate
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
3; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
4; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
5; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
6; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
7; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
8
9;
10; 128-bit vectors
11;
12
; test_v2f64 - pairwise add of adjacent <2 x double> elements, spelled out as
; scalar extractelement/fadd/insertelement so the SLP vectorizer can try to
; turn it into a vector fadd. Result lane 0 is a[0]+a[1], lane 1 is b[0]+b[1].
; Per the assertions below, the SSE, AVX and AVX512 runs expect two
; shufflevectors (even lanes, odd lanes) feeding one <2 x double> fadd, while
; the SLM run expects the scalar sequence to be left unchanged.
13define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b) {
14; SSE-LABEL: @test_v2f64(
15; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
16; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
17; SSE-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
18; SSE-NEXT:    ret <2 x double> [[TMP3]]
19;
20; SLM-LABEL: @test_v2f64(
21; SLM-NEXT:    [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
22; SLM-NEXT:    [[A1:%.*]] = extractelement <2 x double> [[A]], i32 1
23; SLM-NEXT:    [[B0:%.*]] = extractelement <2 x double> [[B:%.*]], i32 0
24; SLM-NEXT:    [[B1:%.*]] = extractelement <2 x double> [[B]], i32 1
25; SLM-NEXT:    [[R0:%.*]] = fadd double [[A0]], [[A1]]
26; SLM-NEXT:    [[R1:%.*]] = fadd double [[B0]], [[B1]]
27; SLM-NEXT:    [[R00:%.*]] = insertelement <2 x double> undef, double [[R0]], i32 0
28; SLM-NEXT:    [[R01:%.*]] = insertelement <2 x double> [[R00]], double [[R1]], i32 1
29; SLM-NEXT:    ret <2 x double> [[R01]]
30;
31; AVX-LABEL: @test_v2f64(
32; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
33; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
34; AVX-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
35; AVX-NEXT:    ret <2 x double> [[TMP3]]
36;
37; AVX512-LABEL: @test_v2f64(
38; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
39; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
40; AVX512-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
41; AVX512-NEXT:    ret <2 x double> [[TMP3]]
42;
  ; Pull every scalar element out of both inputs.
43  %a0 = extractelement <2 x double> %a, i32 0
44  %a1 = extractelement <2 x double> %a, i32 1
45  %b0 = extractelement <2 x double> %b, i32 0
46  %b1 = extractelement <2 x double> %b, i32 1
  ; Sum the adjacent pair of %a, then the adjacent pair of %b.
47  %r0 = fadd double %a0, %a1
48  %r1 = fadd double %b0, %b1
  ; Rebuild the result vector one lane at a time, starting from undef.
49  %r00 = insertelement <2 x double> undef, double %r0, i32 0
50  %r01 = insertelement <2 x double>  %r00, double %r1, i32 1
51  ret <2 x double> %r01
52}
53
; test_v4f32 - pairwise add of adjacent <4 x float> elements written as scalar
; extractelement/fadd/insertelement. Result lanes 0-1 are the pair sums of %a,
; lanes 2-3 the pair sums of %b. The assertions use the shared CHECK prefix, so
; every run is expected to produce the same output, namely even-lane and
; odd-lane shufflevectors feeding one <4 x float> fadd.
54define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b) {
55; CHECK-LABEL: @test_v4f32(
56; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
57; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
58; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
59; CHECK-NEXT:    ret <4 x float> [[TMP3]]
60;
  ; Pull every scalar element out of both inputs.
61  %a0 = extractelement <4 x float> %a, i32 0
62  %a1 = extractelement <4 x float> %a, i32 1
63  %a2 = extractelement <4 x float> %a, i32 2
64  %a3 = extractelement <4 x float> %a, i32 3
65  %b0 = extractelement <4 x float> %b, i32 0
66  %b1 = extractelement <4 x float> %b, i32 1
67  %b2 = extractelement <4 x float> %b, i32 2
68  %b3 = extractelement <4 x float> %b, i32 3
  ; Sum adjacent pairs, both pairs of %a first, then both pairs of %b.
69  %r0 = fadd float %a0, %a1
70  %r1 = fadd float %a2, %a3
71  %r2 = fadd float %b0, %b1
72  %r3 = fadd float %b2, %b3
  ; Rebuild the result vector one lane at a time, starting from undef.
73  %r00 = insertelement <4 x float> undef, float %r0, i32 0
74  %r01 = insertelement <4 x float>  %r00, float %r1, i32 1
75  %r02 = insertelement <4 x float>  %r01, float %r2, i32 2
76  %r03 = insertelement <4 x float>  %r02, float %r3, i32 3
77  ret <4 x float> %r03
78}
79
; test_v2i64 - integer counterpart of the <2 x double> case. Pairwise add of
; adjacent <2 x i64> elements written as scalar extractelement/add/
; insertelement. Result lane 0 is a[0]+a[1], lane 1 is b[0]+b[1].
; Per the assertions below, the SSE, AVX and AVX512 runs expect two
; shufflevectors feeding one <2 x i64> add, while the SLM run expects the
; scalar sequence to be left unchanged.
80define <2 x i64> @test_v2i64(<2 x i64> %a, <2 x i64> %b) {
81; SSE-LABEL: @test_v2i64(
82; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
83; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
84; SSE-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
85; SSE-NEXT:    ret <2 x i64> [[TMP3]]
86;
87; SLM-LABEL: @test_v2i64(
88; SLM-NEXT:    [[A0:%.*]] = extractelement <2 x i64> [[A:%.*]], i32 0
89; SLM-NEXT:    [[A1:%.*]] = extractelement <2 x i64> [[A]], i32 1
90; SLM-NEXT:    [[B0:%.*]] = extractelement <2 x i64> [[B:%.*]], i32 0
91; SLM-NEXT:    [[B1:%.*]] = extractelement <2 x i64> [[B]], i32 1
92; SLM-NEXT:    [[R0:%.*]] = add i64 [[A0]], [[A1]]
93; SLM-NEXT:    [[R1:%.*]] = add i64 [[B0]], [[B1]]
94; SLM-NEXT:    [[R00:%.*]] = insertelement <2 x i64> undef, i64 [[R0]], i32 0
95; SLM-NEXT:    [[R01:%.*]] = insertelement <2 x i64> [[R00]], i64 [[R1]], i32 1
96; SLM-NEXT:    ret <2 x i64> [[R01]]
97;
98; AVX-LABEL: @test_v2i64(
99; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
100; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
101; AVX-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
102; AVX-NEXT:    ret <2 x i64> [[TMP3]]
103;
104; AVX512-LABEL: @test_v2i64(
105; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
106; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
107; AVX512-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
108; AVX512-NEXT:    ret <2 x i64> [[TMP3]]
109;
  ; Pull every scalar element out of both inputs.
110  %a0 = extractelement <2 x i64> %a, i32 0
111  %a1 = extractelement <2 x i64> %a, i32 1
112  %b0 = extractelement <2 x i64> %b, i32 0
113  %b1 = extractelement <2 x i64> %b, i32 1
  ; Sum the adjacent pair of %a, then the adjacent pair of %b.
114  %r0 = add i64 %a0, %a1
115  %r1 = add i64 %b0, %b1
  ; Rebuild the result vector one lane at a time, starting from undef.
116  %r00 = insertelement <2 x i64> undef, i64 %r0, i32 0
117  %r01 = insertelement <2 x i64>  %r00, i64 %r1, i32 1
118  ret <2 x i64> %r01
119}
120
; test_v4i32 - pairwise add of adjacent <4 x i32> elements written as scalar
; extractelement/add/insertelement. Result lanes 0-1 are the pair sums of %a,
; lanes 2-3 the pair sums of %b. The assertions use the shared CHECK prefix, so
; every run is expected to produce even-lane and odd-lane shufflevectors
; feeding one <4 x i32> add.
121define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) {
122; CHECK-LABEL: @test_v4i32(
123; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
124; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
125; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
126; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
127;
  ; Pull every scalar element out of both inputs.
128  %a0 = extractelement <4 x i32> %a, i32 0
129  %a1 = extractelement <4 x i32> %a, i32 1
130  %a2 = extractelement <4 x i32> %a, i32 2
131  %a3 = extractelement <4 x i32> %a, i32 3
132  %b0 = extractelement <4 x i32> %b, i32 0
133  %b1 = extractelement <4 x i32> %b, i32 1
134  %b2 = extractelement <4 x i32> %b, i32 2
135  %b3 = extractelement <4 x i32> %b, i32 3
  ; Sum adjacent pairs, both pairs of %a first, then both pairs of %b.
136  %r0 = add i32 %a0, %a1
137  %r1 = add i32 %a2, %a3
138  %r2 = add i32 %b0, %b1
139  %r3 = add i32 %b2, %b3
  ; Rebuild the result vector one lane at a time, starting from undef.
140  %r00 = insertelement <4 x i32> undef, i32 %r0, i32 0
141  %r01 = insertelement <4 x i32>  %r00, i32 %r1, i32 1
142  %r02 = insertelement <4 x i32>  %r01, i32 %r2, i32 2
143  %r03 = insertelement <4 x i32>  %r02, i32 %r3, i32 3
144  ret <4 x i32> %r03
145}
146
; test_v8i16 - pairwise add of adjacent <8 x i16> elements written as scalar
; extractelement/add/insertelement. Result lanes 0-3 are the four pair sums of
; %a, lanes 4-7 the four pair sums of %b. The assertions use the shared CHECK
; prefix, so every run is expected to produce even-lane and odd-lane
; shufflevectors feeding one <8 x i16> add.
147define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) {
148; CHECK-LABEL: @test_v8i16(
149; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
150; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
151; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
152; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
153;
  ; Pull every scalar element out of both inputs.
154  %a0 = extractelement <8 x i16> %a, i32 0
155  %a1 = extractelement <8 x i16> %a, i32 1
156  %a2 = extractelement <8 x i16> %a, i32 2
157  %a3 = extractelement <8 x i16> %a, i32 3
158  %a4 = extractelement <8 x i16> %a, i32 4
159  %a5 = extractelement <8 x i16> %a, i32 5
160  %a6 = extractelement <8 x i16> %a, i32 6
161  %a7 = extractelement <8 x i16> %a, i32 7
162  %b0 = extractelement <8 x i16> %b, i32 0
163  %b1 = extractelement <8 x i16> %b, i32 1
164  %b2 = extractelement <8 x i16> %b, i32 2
165  %b3 = extractelement <8 x i16> %b, i32 3
166  %b4 = extractelement <8 x i16> %b, i32 4
167  %b5 = extractelement <8 x i16> %b, i32 5
168  %b6 = extractelement <8 x i16> %b, i32 6
169  %b7 = extractelement <8 x i16> %b, i32 7
  ; Sum adjacent pairs, all four pairs of %a first, then all four pairs of %b.
170  %r0 = add i16 %a0, %a1
171  %r1 = add i16 %a2, %a3
172  %r2 = add i16 %a4, %a5
173  %r3 = add i16 %a6, %a7
174  %r4 = add i16 %b0, %b1
175  %r5 = add i16 %b2, %b3
176  %r6 = add i16 %b4, %b5
177  %r7 = add i16 %b6, %b7
  ; Rebuild the result vector one lane at a time, starting from undef.
178  %r00 = insertelement <8 x i16> undef, i16 %r0, i32 0
179  %r01 = insertelement <8 x i16>  %r00, i16 %r1, i32 1
180  %r02 = insertelement <8 x i16>  %r01, i16 %r2, i32 2
181  %r03 = insertelement <8 x i16>  %r02, i16 %r3, i32 3
182  %r04 = insertelement <8 x i16>  %r03, i16 %r4, i32 4
183  %r05 = insertelement <8 x i16>  %r04, i16 %r5, i32 5
184  %r06 = insertelement <8 x i16>  %r05, i16 %r6, i32 6
185  %r07 = insertelement <8 x i16>  %r06, i16 %r7, i32 7
186  ret <8 x i16> %r07
187}
188
189;
190; 256-bit vectors
191;
192
; test_v4f64 - pairwise add of adjacent <4 x double> elements written as scalar
; extractelement/fadd/insertelement. The result order interleaves the inputs
; per half. Lane 0 is a[0]+a[1], lane 1 is b[0]+b[1], lane 2 is a[2]+a[3] and
; lane 3 is b[2]+b[3].
; Per the assertions below, the SSE and SLM runs expect two <2 x double> fadds
; whose results are concatenated by a final shufflevector, while the AVX and
; AVX512 runs expect a single <4 x double> fadd.
193define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
194; SSE-LABEL: @test_v4f64(
195; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
196; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
197; SSE-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
198; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
199; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
200; SSE-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
201; SSE-NEXT:    [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
202; SSE-NEXT:    ret <4 x double> [[R03]]
203;
204; SLM-LABEL: @test_v4f64(
205; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
206; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
207; SLM-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
208; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
209; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
210; SLM-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
211; SLM-NEXT:    [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
212; SLM-NEXT:    ret <4 x double> [[R03]]
213;
214; AVX-LABEL: @test_v4f64(
215; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
216; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
217; AVX-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
218; AVX-NEXT:    ret <4 x double> [[TMP3]]
219;
220; AVX512-LABEL: @test_v4f64(
221; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
222; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
223; AVX512-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
224; AVX512-NEXT:    ret <4 x double> [[TMP3]]
225;
  ; Pull every scalar element out of both inputs.
226  %a0 = extractelement <4 x double> %a, i32 0
227  %a1 = extractelement <4 x double> %a, i32 1
228  %a2 = extractelement <4 x double> %a, i32 2
229  %a3 = extractelement <4 x double> %a, i32 3
230  %b0 = extractelement <4 x double> %b, i32 0
231  %b1 = extractelement <4 x double> %b, i32 1
232  %b2 = extractelement <4 x double> %b, i32 2
233  %b3 = extractelement <4 x double> %b, i32 3
  ; Sum adjacent pairs. %r0/%r1 use elements 0-1 of %a and %b, %r2/%r3 use
  ; elements 2-3 of %a and %b.
234  %r0 = fadd double %a0, %a1
235  %r1 = fadd double %b0, %b1
236  %r2 = fadd double %a2, %a3
237  %r3 = fadd double %b2, %b3
  ; Rebuild the result vector one lane at a time, starting from undef.
238  %r00 = insertelement <4 x double> undef, double %r0, i32 0
239  %r01 = insertelement <4 x double>  %r00, double %r1, i32 1
240  %r02 = insertelement <4 x double>  %r01, double %r2, i32 2
241  %r03 = insertelement <4 x double>  %r02, double %r3, i32 3
242  ret <4 x double> %r03
243}
244
; test_v8f32 - pairwise add of adjacent <8 x float> elements written as scalar
; extractelement/fadd/insertelement. The result order interleaves the inputs
; per half. Lanes 0-1 come from elements 0-3 of %a, lanes 2-3 from elements
; 0-3 of %b, lanes 4-5 from elements 4-7 of %a and lanes 6-7 from elements
; 4-7 of %b.
; Per the assertions below, the SSE and SLM runs expect two <4 x float> fadds
; whose results are concatenated by a final shufflevector, while the AVX and
; AVX512 runs expect a single <8 x float> fadd.
245define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
246; SSE-LABEL: @test_v8f32(
247; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
248; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
249; SSE-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
250; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
251; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
252; SSE-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]]
253; SSE-NEXT:    [[R07:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
254; SSE-NEXT:    ret <8 x float> [[R07]]
255;
256; SLM-LABEL: @test_v8f32(
257; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
258; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
259; SLM-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
260; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
261; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
262; SLM-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]]
263; SLM-NEXT:    [[R07:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
264; SLM-NEXT:    ret <8 x float> [[R07]]
265;
266; AVX-LABEL: @test_v8f32(
267; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
268; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
269; AVX-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
270; AVX-NEXT:    ret <8 x float> [[TMP3]]
271;
272; AVX512-LABEL: @test_v8f32(
273; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
274; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
275; AVX512-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
276; AVX512-NEXT:    ret <8 x float> [[TMP3]]
277;
  ; Pull every scalar element out of both inputs.
278  %a0 = extractelement <8 x float> %a, i32 0
279  %a1 = extractelement <8 x float> %a, i32 1
280  %a2 = extractelement <8 x float> %a, i32 2
281  %a3 = extractelement <8 x float> %a, i32 3
282  %a4 = extractelement <8 x float> %a, i32 4
283  %a5 = extractelement <8 x float> %a, i32 5
284  %a6 = extractelement <8 x float> %a, i32 6
285  %a7 = extractelement <8 x float> %a, i32 7
286  %b0 = extractelement <8 x float> %b, i32 0
287  %b1 = extractelement <8 x float> %b, i32 1
288  %b2 = extractelement <8 x float> %b, i32 2
289  %b3 = extractelement <8 x float> %b, i32 3
290  %b4 = extractelement <8 x float> %b, i32 4
291  %b5 = extractelement <8 x float> %b, i32 5
292  %b6 = extractelement <8 x float> %b, i32 6
293  %b7 = extractelement <8 x float> %b, i32 7
  ; Sum adjacent pairs. %r0-%r3 use elements 0-3 of %a then %b, %r4-%r7 use
  ; elements 4-7 of %a then %b.
294  %r0 = fadd float %a0, %a1
295  %r1 = fadd float %a2, %a3
296  %r2 = fadd float %b0, %b1
297  %r3 = fadd float %b2, %b3
298  %r4 = fadd float %a4, %a5
299  %r5 = fadd float %a6, %a7
300  %r6 = fadd float %b4, %b5
301  %r7 = fadd float %b6, %b7
  ; Rebuild the result vector one lane at a time, starting from undef.
302  %r00 = insertelement <8 x float> undef, float %r0, i32 0
303  %r01 = insertelement <8 x float>  %r00, float %r1, i32 1
304  %r02 = insertelement <8 x float>  %r01, float %r2, i32 2
305  %r03 = insertelement <8 x float>  %r02, float %r3, i32 3
306  %r04 = insertelement <8 x float>  %r03, float %r4, i32 4
307  %r05 = insertelement <8 x float>  %r04, float %r5, i32 5
308  %r06 = insertelement <8 x float>  %r05, float %r6, i32 6
309  %r07 = insertelement <8 x float>  %r06, float %r7, i32 7
310  ret <8 x float> %r07
311}
312
; test_v4i64 - integer counterpart of the <4 x double> case. Pairwise add of
; adjacent <4 x i64> elements written as scalar extractelement/add/
; insertelement. Lane 0 is a[0]+a[1], lane 1 is b[0]+b[1], lane 2 is a[2]+a[3]
; and lane 3 is b[2]+b[3].
; Per the assertions below, the SSE and SLM runs expect two <2 x i64> adds
; whose results are concatenated by a final shufflevector, while the AVX and
; AVX512 runs expect a single <4 x i64> add.
313define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
314; SSE-LABEL: @test_v4i64(
315; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
316; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
317; SSE-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
318; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
319; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
320; SSE-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]]
321; SSE-NEXT:    [[R03:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
322; SSE-NEXT:    ret <4 x i64> [[R03]]
323;
324; SLM-LABEL: @test_v4i64(
325; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
326; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
327; SLM-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
328; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
329; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
330; SLM-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]]
331; SLM-NEXT:    [[R03:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
332; SLM-NEXT:    ret <4 x i64> [[R03]]
333;
334; AVX-LABEL: @test_v4i64(
335; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
336; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
337; AVX-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
338; AVX-NEXT:    ret <4 x i64> [[TMP3]]
339;
340; AVX512-LABEL: @test_v4i64(
341; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
342; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
343; AVX512-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
344; AVX512-NEXT:    ret <4 x i64> [[TMP3]]
345;
  ; Pull every scalar element out of both inputs.
346  %a0 = extractelement <4 x i64> %a, i32 0
347  %a1 = extractelement <4 x i64> %a, i32 1
348  %a2 = extractelement <4 x i64> %a, i32 2
349  %a3 = extractelement <4 x i64> %a, i32 3
350  %b0 = extractelement <4 x i64> %b, i32 0
351  %b1 = extractelement <4 x i64> %b, i32 1
352  %b2 = extractelement <4 x i64> %b, i32 2
353  %b3 = extractelement <4 x i64> %b, i32 3
  ; Sum adjacent pairs. %r0/%r1 use elements 0-1 of %a and %b, %r2/%r3 use
  ; elements 2-3 of %a and %b.
354  %r0 = add i64 %a0, %a1
355  %r1 = add i64 %b0, %b1
356  %r2 = add i64 %a2, %a3
357  %r3 = add i64 %b2, %b3
  ; Rebuild the result vector one lane at a time, starting from undef.
358  %r00 = insertelement <4 x i64> undef, i64 %r0, i32 0
359  %r01 = insertelement <4 x i64>  %r00, i64 %r1, i32 1
360  %r02 = insertelement <4 x i64>  %r01, i64 %r2, i32 2
361  %r03 = insertelement <4 x i64>  %r02, i64 %r3, i32 3
362  ret <4 x i64> %r03
363}
364
; test_v8i32 - integer counterpart of the <8 x float> case. Pairwise add of
; adjacent <8 x i32> elements written as scalar extractelement/add/
; insertelement. Lanes 0-1 come from elements 0-3 of %a, lanes 2-3 from
; elements 0-3 of %b, lanes 4-5 from elements 4-7 of %a and lanes 6-7 from
; elements 4-7 of %b.
; Per the assertions below, the SSE and SLM runs expect two <4 x i32> adds
; whose results are concatenated by a final shufflevector, while the AVX and
; AVX512 runs expect a single <8 x i32> add.
365define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
366; SSE-LABEL: @test_v8i32(
367; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
368; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
369; SSE-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
370; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
371; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
372; SSE-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP4]], [[TMP5]]
373; SSE-NEXT:    [[R07:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
374; SSE-NEXT:    ret <8 x i32> [[R07]]
375;
376; SLM-LABEL: @test_v8i32(
377; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
378; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
379; SLM-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
380; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
381; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
382; SLM-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP4]], [[TMP5]]
383; SLM-NEXT:    [[R07:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
384; SLM-NEXT:    ret <8 x i32> [[R07]]
385;
386; AVX-LABEL: @test_v8i32(
387; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
388; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
389; AVX-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
390; AVX-NEXT:    ret <8 x i32> [[TMP3]]
391;
392; AVX512-LABEL: @test_v8i32(
393; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
394; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
395; AVX512-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
396; AVX512-NEXT:    ret <8 x i32> [[TMP3]]
397;
  ; Pull every scalar element out of both inputs.
398  %a0 = extractelement <8 x i32> %a, i32 0
399  %a1 = extractelement <8 x i32> %a, i32 1
400  %a2 = extractelement <8 x i32> %a, i32 2
401  %a3 = extractelement <8 x i32> %a, i32 3
402  %a4 = extractelement <8 x i32> %a, i32 4
403  %a5 = extractelement <8 x i32> %a, i32 5
404  %a6 = extractelement <8 x i32> %a, i32 6
405  %a7 = extractelement <8 x i32> %a, i32 7
406  %b0 = extractelement <8 x i32> %b, i32 0
407  %b1 = extractelement <8 x i32> %b, i32 1
408  %b2 = extractelement <8 x i32> %b, i32 2
409  %b3 = extractelement <8 x i32> %b, i32 3
410  %b4 = extractelement <8 x i32> %b, i32 4
411  %b5 = extractelement <8 x i32> %b, i32 5
412  %b6 = extractelement <8 x i32> %b, i32 6
413  %b7 = extractelement <8 x i32> %b, i32 7
  ; Sum adjacent pairs. %r0-%r3 use elements 0-3 of %a then %b, %r4-%r7 use
  ; elements 4-7 of %a then %b.
414  %r0 = add i32 %a0, %a1
415  %r1 = add i32 %a2, %a3
416  %r2 = add i32 %b0, %b1
417  %r3 = add i32 %b2, %b3
418  %r4 = add i32 %a4, %a5
419  %r5 = add i32 %a6, %a7
420  %r6 = add i32 %b4, %b5
421  %r7 = add i32 %b6, %b7
  ; Rebuild the result vector one lane at a time, starting from undef.
422  %r00 = insertelement <8 x i32> undef, i32 %r0, i32 0
423  %r01 = insertelement <8 x i32>  %r00, i32 %r1, i32 1
424  %r02 = insertelement <8 x i32>  %r01, i32 %r2, i32 2
425  %r03 = insertelement <8 x i32>  %r02, i32 %r3, i32 3
426  %r04 = insertelement <8 x i32>  %r03, i32 %r4, i32 4
427  %r05 = insertelement <8 x i32>  %r04, i32 %r5, i32 5
428  %r06 = insertelement <8 x i32>  %r05, i32 %r6, i32 6
429  %r07 = insertelement <8 x i32>  %r06, i32 %r7, i32 7
430  ret <8 x i32> %r07
431}
432
; test_v16i16 - pairwise add of adjacent <16 x i16> elements written as scalar
; extractelement/add/insertelement. The result order interleaves the inputs
; per half. Lanes 0-3 come from elements 0-7 of %a, lanes 4-7 from elements
; 0-7 of %b, lanes 8-11 from elements 8-15 of %a and lanes 12-15 from
; elements 8-15 of %b.
; Per the assertions below, only the SSE run expects two <8 x i16> adds
; concatenated by a final shufflevector. The SLM, AVX and AVX512 runs all
; expect a single <16 x i16> add.
433define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
434; SSE-LABEL: @test_v16i16(
435; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
436; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
437; SSE-NEXT:    [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
438; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
439; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
440; SSE-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]]
441; SSE-NEXT:    [[RV15:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
442; SSE-NEXT:    ret <16 x i16> [[RV15]]
443;
444; SLM-LABEL: @test_v16i16(
445; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
446; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
447; SLM-NEXT:    [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
448; SLM-NEXT:    ret <16 x i16> [[TMP3]]
449;
450; AVX-LABEL: @test_v16i16(
451; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
452; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
453; AVX-NEXT:    [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
454; AVX-NEXT:    ret <16 x i16> [[TMP3]]
455;
456; AVX512-LABEL: @test_v16i16(
457; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
458; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
459; AVX512-NEXT:    [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
460; AVX512-NEXT:    ret <16 x i16> [[TMP3]]
461;
  ; Pull every scalar element out of both inputs.
462  %a0  = extractelement <16 x i16> %a, i32 0
463  %a1  = extractelement <16 x i16> %a, i32 1
464  %a2  = extractelement <16 x i16> %a, i32 2
465  %a3  = extractelement <16 x i16> %a, i32 3
466  %a4  = extractelement <16 x i16> %a, i32 4
467  %a5  = extractelement <16 x i16> %a, i32 5
468  %a6  = extractelement <16 x i16> %a, i32 6
469  %a7  = extractelement <16 x i16> %a, i32 7
470  %a8  = extractelement <16 x i16> %a, i32 8
471  %a9  = extractelement <16 x i16> %a, i32 9
472  %a10 = extractelement <16 x i16> %a, i32 10
473  %a11 = extractelement <16 x i16> %a, i32 11
474  %a12 = extractelement <16 x i16> %a, i32 12
475  %a13 = extractelement <16 x i16> %a, i32 13
476  %a14 = extractelement <16 x i16> %a, i32 14
477  %a15 = extractelement <16 x i16> %a, i32 15
478  %b0  = extractelement <16 x i16> %b, i32 0
479  %b1  = extractelement <16 x i16> %b, i32 1
480  %b2  = extractelement <16 x i16> %b, i32 2
481  %b3  = extractelement <16 x i16> %b, i32 3
482  %b4  = extractelement <16 x i16> %b, i32 4
483  %b5  = extractelement <16 x i16> %b, i32 5
484  %b6  = extractelement <16 x i16> %b, i32 6
485  %b7  = extractelement <16 x i16> %b, i32 7
486  %b8  = extractelement <16 x i16> %b, i32 8
487  %b9  = extractelement <16 x i16> %b, i32 9
488  %b10 = extractelement <16 x i16> %b, i32 10
489  %b11 = extractelement <16 x i16> %b, i32 11
490  %b12 = extractelement <16 x i16> %b, i32 12
491  %b13 = extractelement <16 x i16> %b, i32 13
492  %b14 = extractelement <16 x i16> %b, i32 14
493  %b15 = extractelement <16 x i16> %b, i32 15
  ; Sum adjacent pairs. %r0-%r7 use elements 0-7 of %a then %b, %r8-%r15 use
  ; elements 8-15 of %a then %b.
494  %r0  = add i16 %a0 , %a1
495  %r1  = add i16 %a2 , %a3
496  %r2  = add i16 %a4 , %a5
497  %r3  = add i16 %a6 , %a7
498  %r4  = add i16 %b0 , %b1
499  %r5  = add i16 %b2 , %b3
500  %r6  = add i16 %b4 , %b5
501  %r7  = add i16 %b6 , %b7
502  %r8  = add i16 %a8 , %a9
503  %r9  = add i16 %a10, %a11
504  %r10 = add i16 %a12, %a13
505  %r11 = add i16 %a14, %a15
506  %r12 = add i16 %b8 , %b9
507  %r13 = add i16 %b10, %b11
508  %r14 = add i16 %b12, %b13
509  %r15 = add i16 %b14, %b15
  ; Rebuild the result vector one lane at a time, starting from undef.
510  %rv0  = insertelement <16 x i16> undef, i16 %r0 , i32 0
511  %rv1  = insertelement <16 x i16> %rv0 , i16 %r1 , i32 1
512  %rv2  = insertelement <16 x i16> %rv1 , i16 %r2 , i32 2
513  %rv3  = insertelement <16 x i16> %rv2 , i16 %r3 , i32 3
514  %rv4  = insertelement <16 x i16> %rv3 , i16 %r4 , i32 4
515  %rv5  = insertelement <16 x i16> %rv4 , i16 %r5 , i32 5
516  %rv6  = insertelement <16 x i16> %rv5 , i16 %r6 , i32 6
517  %rv7  = insertelement <16 x i16> %rv6 , i16 %r7 , i32 7
518  %rv8  = insertelement <16 x i16> %rv7 , i16 %r8 , i32 8
519  %rv9  = insertelement <16 x i16> %rv8 , i16 %r9 , i32 9
520  %rv10 = insertelement <16 x i16> %rv9 , i16 %r10, i32 10
521  %rv11 = insertelement <16 x i16> %rv10, i16 %r11, i32 11
522  %rv12 = insertelement <16 x i16> %rv11, i16 %r12, i32 12
523  %rv13 = insertelement <16 x i16> %rv12, i16 %r13, i32 13
524  %rv14 = insertelement <16 x i16> %rv13, i16 %r14, i32 14
525  %rv15 = insertelement <16 x i16> %rv14, i16 %r15, i32 15
526  ret <16 x i16> %rv15
527}
528