; Test various representations of pack-like operations.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s

; One way of writing a <4 x i32> -> <8 x i16> pack.
;
; Every 32-bit element of %val0 and %val1 is extracted as a scalar and
; bitcast to <2 x i16>.  Element 1 of each pair is kept (the low-order
; halfword on this big-endian target) and the eight kept halfwords are
; concatenated in source order.  The whole sequence is expected to be
; selected as a single VPKF of the two argument registers.
define <8 x i16> @f1(<4 x i32> %val0, <4 x i32> %val1) {
; CHECK-LABEL: f1:
; CHECK: vpkf %v24, %v24, %v26
; CHECK: br %r14
  ; Split both inputs into their scalar words.
  %elem0 = extractelement <4 x i32> %val0, i32 0
  %elem1 = extractelement <4 x i32> %val0, i32 1
  %elem2 = extractelement <4 x i32> %val0, i32 2
  %elem3 = extractelement <4 x i32> %val0, i32 3
  %elem4 = extractelement <4 x i32> %val1, i32 0
  %elem5 = extractelement <4 x i32> %val1, i32 1
  %elem6 = extractelement <4 x i32> %val1, i32 2
  %elem7 = extractelement <4 x i32> %val1, i32 3
  ; Reinterpret each word as a pair of halfwords.
  %hboth0 = bitcast i32 %elem0 to <2 x i16>
  %hboth1 = bitcast i32 %elem1 to <2 x i16>
  %hboth2 = bitcast i32 %elem2 to <2 x i16>
  %hboth3 = bitcast i32 %elem3 to <2 x i16>
  %hboth4 = bitcast i32 %elem4 to <2 x i16>
  %hboth5 = bitcast i32 %elem5 to <2 x i16>
  %hboth6 = bitcast i32 %elem6 to <2 x i16>
  %hboth7 = bitcast i32 %elem7 to <2 x i16>
  ; Mask <1, 3> selects element 1 of each operand, i.e. one halfword per
  ; source word.
  %hlow0 = shufflevector <2 x i16> %hboth0, <2 x i16> %hboth1,
                         <2 x i32> <i32 1, i32 3>
  %hlow1 = shufflevector <2 x i16> %hboth2, <2 x i16> %hboth3,
                         <2 x i32> <i32 1, i32 3>
  %hlow2 = shufflevector <2 x i16> %hboth4, <2 x i16> %hboth5,
                         <2 x i32> <i32 1, i32 3>
  %hlow3 = shufflevector <2 x i16> %hboth6, <2 x i16> %hboth7,
                         <2 x i32> <i32 1, i32 3>
  ; Identity masks: plain concatenation, 2+2 -> 4 and then 4+4 -> 8 lanes.
  %join0 = shufflevector <2 x i16> %hlow0, <2 x i16> %hlow1,
                         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %join1 = shufflevector <2 x i16> %hlow2, <2 x i16> %hlow3,
                         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ret = shufflevector <4 x i16> %join0, <4 x i16> %join1,
                       <8 x i32> <i32 0, i32 1, i32 2, i32 3,
                                  i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %ret
}

; A different way of writing a <4 x i32> -> <8 x i16> pack.
;
; Here each extracted word is re-inserted into lane 0 of an otherwise undef
; <4 x i32>, so that after the bitcast to <8 x i16> the word occupies
; halfword lanes 0 and 1.  Lane 1 of each such vector is gathered with
; full-width shuffles whose unused result lanes are undef.  As in f1, the
; expected output is a single VPKF.
define <8 x i16> @f2(<4 x i32> %val0, <4 x i32> %val1) {
; CHECK-LABEL: f2:
; CHECK: vpkf %v24, %v24, %v26
; CHECK: br %r14
  ; Split both inputs into their scalar words.
  %elem0 = extractelement <4 x i32> %val0, i32 0
  %elem1 = extractelement <4 x i32> %val0, i32 1
  %elem2 = extractelement <4 x i32> %val0, i32 2
  %elem3 = extractelement <4 x i32> %val0, i32 3
  %elem4 = extractelement <4 x i32> %val1, i32 0
  %elem5 = extractelement <4 x i32> %val1, i32 1
  %elem6 = extractelement <4 x i32> %val1, i32 2
  %elem7 = extractelement <4 x i32> %val1, i32 3
  ; Place each word in lane 0 of its own vector; lanes 1-3 stay undef.
  %wvec0 = insertelement <4 x i32> undef, i32 %elem0, i32 0
  %wvec1 = insertelement <4 x i32> undef, i32 %elem1, i32 0
  %wvec2 = insertelement <4 x i32> undef, i32 %elem2, i32 0
  %wvec3 = insertelement <4 x i32> undef, i32 %elem3, i32 0
  %wvec4 = insertelement <4 x i32> undef, i32 %elem4, i32 0
  %wvec5 = insertelement <4 x i32> undef, i32 %elem5, i32 0
  %wvec6 = insertelement <4 x i32> undef, i32 %elem6, i32 0
  %wvec7 = insertelement <4 x i32> undef, i32 %elem7, i32 0
  ; View each vector as halfwords; the inserted word is now lanes 0 and 1.
  %hvec0 = bitcast <4 x i32> %wvec0 to <8 x i16>
  %hvec1 = bitcast <4 x i32> %wvec1 to <8 x i16>
  %hvec2 = bitcast <4 x i32> %wvec2 to <8 x i16>
  %hvec3 = bitcast <4 x i32> %wvec3 to <8 x i16>
  %hvec4 = bitcast <4 x i32> %wvec4 to <8 x i16>
  %hvec5 = bitcast <4 x i32> %wvec5 to <8 x i16>
  %hvec6 = bitcast <4 x i32> %wvec6 to <8 x i16>
  %hvec7 = bitcast <4 x i32> %wvec7 to <8 x i16>
  ; Mask indices 1 and 9 are lane 1 of the first and second operand; the
  ; two selected halfwords land in result lanes 0 and 1.
  %hlow0 = shufflevector <8 x i16> %hvec0, <8 x i16> %hvec1,
                         <8 x i32> <i32 1, i32 9, i32 undef, i32 undef,
                                    i32 undef, i32 undef, i32 undef, i32 undef>
  %hlow1 = shufflevector <8 x i16> %hvec2, <8 x i16> %hvec3,
                         <8 x i32> <i32 1, i32 9, i32 undef, i32 undef,
                                    i32 undef, i32 undef, i32 undef, i32 undef>
  %hlow2 = shufflevector <8 x i16> %hvec4, <8 x i16> %hvec5,
                         <8 x i32> <i32 1, i32 9, i32 undef, i32 undef,
                                    i32 undef, i32 undef, i32 undef, i32 undef>
  %hlow3 = shufflevector <8 x i16> %hvec6, <8 x i16> %hvec7,
                         <8 x i32> <i32 1, i32 9, i32 undef, i32 undef,
                                    i32 undef, i32 undef, i32 undef, i32 undef>
  ; Collect the two defined lanes of each operand into result lanes 0-3.
  %join0 = shufflevector <8 x i16> %hlow0, <8 x i16> %hlow1,
                         <8 x i32> <i32 0, i32 1, i32 8, i32 9,
                                    i32 undef, i32 undef, i32 undef, i32 undef>
  %join1 = shufflevector <8 x i16> %hlow2, <8 x i16> %hlow3,
                         <8 x i32> <i32 0, i32 1, i32 8, i32 9,
                                    i32 undef, i32 undef, i32 undef, i32 undef>
  ; Concatenate the four defined lanes of %join0 with those of %join1.
  %ret = shufflevector <8 x i16> %join0, <8 x i16> %join1,
                       <8 x i32> <i32 0, i32 1, i32 2, i32 3,
                                  i32 8, i32 9, i32 10, i32 11>
  ret <8 x i16> %ret
}

; A direct pack operation.
;
; Both inputs are viewed as <8 x i16> and a single shuffle takes the
; odd-numbered halfword lanes of their concatenation (1, 3, ..., 15), which
; is one halfword per source word.  Expected output: a single VPKF.
define <8 x i16> @f3(<4 x i32> %val0, <4 x i32> %val1) {
; CHECK-LABEL: f3:
; CHECK: vpkf %v24, %v24, %v26
; CHECK: br %r14
  %bitcast0 = bitcast <4 x i32> %val0 to <8 x i16>
  %bitcast1 = bitcast <4 x i32> %val1 to <8 x i16>
  ; Indices 0-7 address %bitcast0 and 8-15 address %bitcast1.
  %ret = shufflevector <8 x i16> %bitcast0, <8 x i16> %bitcast1,
                       <8 x i32> <i32 1, i32 3, i32 5, i32 7,
                                  i32 9, i32 11, i32 13, i32 15>
  ret <8 x i16> %ret
}

; One way of writing a <4 x i32> -> <16 x i8> pack.  It doesn't matter
; whether the first pack is VPKF or VPKH since the even bytes of the
; result are discarded.
;
; The narrowing is done in two levels: words -> halfwords for each pair of
; inputs (the same shuffle as in f3), then halfwords -> bytes across the two
; intermediate results.  The checks therefore accept either mnemonic for the
; two first-level packs (in any order) and require VPKH for the final one.
define <16 x i8> @f4(<4 x i32> %val0, <4 x i32> %val1,
                     <4 x i32> %val2, <4 x i32> %val3) {
; CHECK-LABEL: f4:
; CHECK-DAG: vpk{{[hf]}} [[REG1:%v[0-9]+]], %v24, %v26
; CHECK-DAG: vpk{{[hf]}} [[REG2:%v[0-9]+]], %v28, %v30
; CHECK: vpkh %v24, [[REG1]], [[REG2]]
; CHECK: br %r14
  %bitcast0 = bitcast <4 x i32> %val0 to <8 x i16>
  %bitcast1 = bitcast <4 x i32> %val1 to <8 x i16>
  %bitcast2 = bitcast <4 x i32> %val2 to <8 x i16>
  %bitcast3 = bitcast <4 x i32> %val3 to <8 x i16>
  ; First level: odd halfword lanes of (%val0, %val1) and (%val2, %val3).
  %join0 = shufflevector <8 x i16> %bitcast0, <8 x i16> %bitcast1,
                         <8 x i32> <i32 1, i32 3, i32 5, i32 7,
                                    i32 9, i32 11, i32 13, i32 15>
  %join1 = shufflevector <8 x i16> %bitcast2, <8 x i16> %bitcast3,
                         <8 x i32> <i32 1, i32 3, i32 5, i32 7,
                                    i32 9, i32 11, i32 13, i32 15>
  ; Second level: odd byte lanes of the two intermediate vectors.
  %bitcast4 = bitcast <8 x i16> %join0 to <16 x i8>
  %bitcast5 = bitcast <8 x i16> %join1 to <16 x i8>
  %ret = shufflevector <16 x i8> %bitcast4, <16 x i8> %bitcast5,
                       <16 x i32> <i32 1, i32 3, i32 5, i32 7,
                                   i32 9, i32 11, i32 13, i32 15,
                                   i32 17, i32 19, i32 21, i32 23,
                                   i32 25, i32 27, i32 29, i32 31>
  ret <16 x i8> %ret
}

; Check the same operation, but with elements being extracted from the result.
;
; The packed vector is built exactly as in f4, but only four of its byte
; lanes are used, each stored to consecutive bytes at %base.  The checks
; expect each store to be a VSTEB taken straight from the argument register
; that originally held the byte, in any order.
define void @f5(<4 x i32> %val0, <4 x i32> %val1,
                <4 x i32> %val2, <4 x i32> %val3,
                i8 *%base) {
; CHECK-LABEL: f5:
; CHECK-DAG: vsteb %v24, 0(%r2), 11
; CHECK-DAG: vsteb %v26, 1(%r2), 15
; CHECK-DAG: vsteb %v28, 2(%r2), 3
; CHECK-DAG: vsteb %v30, 3(%r2), 7
; CHECK: br %r14
  %bitcast0 = bitcast <4 x i32> %val0 to <8 x i16>
  %bitcast1 = bitcast <4 x i32> %val1 to <8 x i16>
  %bitcast2 = bitcast <4 x i32> %val2 to <8 x i16>
  %bitcast3 = bitcast <4 x i32> %val3 to <8 x i16>
  ; First level: odd halfword lanes of (%val0, %val1) and (%val2, %val3).
  %join0 = shufflevector <8 x i16> %bitcast0, <8 x i16> %bitcast1,
                         <8 x i32> <i32 1, i32 3, i32 5, i32 7,
                                    i32 9, i32 11, i32 13, i32 15>
  %join1 = shufflevector <8 x i16> %bitcast2, <8 x i16> %bitcast3,
                         <8 x i32> <i32 1, i32 3, i32 5, i32 7,
                                    i32 9, i32 11, i32 13, i32 15>
  ; Second level: odd byte lanes of the two intermediate vectors.  Lane i of
  ; %vec is byte 3 of word (i mod 4) of input number (i / 4).
  %bitcast4 = bitcast <8 x i16> %join0 to <16 x i8>
  %bitcast5 = bitcast <8 x i16> %join1 to <16 x i8>
  %vec = shufflevector <16 x i8> %bitcast4, <16 x i8> %bitcast5,
                       <16 x i32> <i32 1, i32 3, i32 5, i32 7,
                                   i32 9, i32 11, i32 13, i32 15,
                                   i32 17, i32 19, i32 21, i32 23,
                                   i32 25, i32 27, i32 29, i32 31>

  ; Destination addresses %base + 0 .. %base + 3.
  %ptr0 = getelementptr i8, i8 *%base, i64 0
  %ptr1 = getelementptr i8, i8 *%base, i64 1
  %ptr2 = getelementptr i8, i8 *%base, i64 2
  %ptr3 = getelementptr i8, i8 *%base, i64 3

  ; One lane per input: lane 2 -> %val0 byte 11, lane 7 -> %val1 byte 15,
  ; lane 8 -> %val2 byte 3, lane 13 -> %val3 byte 7.
  %byte0 = extractelement <16 x i8> %vec, i32 2
  %byte1 = extractelement <16 x i8> %vec, i32 7
  %byte2 = extractelement <16 x i8> %vec, i32 8
  %byte3 = extractelement <16 x i8> %vec, i32 13

  store i8 %byte0, i8 *%ptr0
  store i8 %byte1, i8 *%ptr1
  store i8 %byte2, i8 *%ptr2
  store i8 %byte3, i8 *%ptr3

  ret void
}

; A different way of writing a <4 x i32> -> <16 x i8> pack.
;
; The same two-level narrowing as in f4, but built from scalars: all sixteen
; words are extracted, each is narrowed to one halfword and then to one
; byte through small-vector bitcasts and shuffles, and the sixteen bytes are
; concatenated in source order.  The expected code is the same three packs
; that f4 checks for.
define <16 x i8> @f6(<4 x i32> %val0, <4 x i32> %val1,
                     <4 x i32> %val2, <4 x i32> %val3) {
; CHECK-LABEL: f6:
; CHECK-DAG: vpk{{[hf]}} [[REG1:%v[0-9]+]], %v24, %v26
; CHECK-DAG: vpk{{[hf]}} [[REG2:%v[0-9]+]], %v28, %v30
; CHECK: vpkh %v24, [[REG1]], [[REG2]]
; CHECK: br %r14
  ; Split all four inputs into their scalar words.
  %elem0 = extractelement <4 x i32> %val0, i32 0
  %elem1 = extractelement <4 x i32> %val0, i32 1
  %elem2 = extractelement <4 x i32> %val0, i32 2
  %elem3 = extractelement <4 x i32> %val0, i32 3
  %elem4 = extractelement <4 x i32> %val1, i32 0
  %elem5 = extractelement <4 x i32> %val1, i32 1
  %elem6 = extractelement <4 x i32> %val1, i32 2
  %elem7 = extractelement <4 x i32> %val1, i32 3
  %elem8 = extractelement <4 x i32> %val2, i32 0
  %elem9 = extractelement <4 x i32> %val2, i32 1
  %elem10 = extractelement <4 x i32> %val2, i32 2
  %elem11 = extractelement <4 x i32> %val2, i32 3
  %elem12 = extractelement <4 x i32> %val3, i32 0
  %elem13 = extractelement <4 x i32> %val3, i32 1
  %elem14 = extractelement <4 x i32> %val3, i32 2
  %elem15 = extractelement <4 x i32> %val3, i32 3
  ; Reinterpret each word as a pair of halfwords.
  %bitcast0 = bitcast i32 %elem0 to <2 x i16>
  %bitcast1 = bitcast i32 %elem1 to <2 x i16>
  %bitcast2 = bitcast i32 %elem2 to <2 x i16>
  %bitcast3 = bitcast i32 %elem3 to <2 x i16>
  %bitcast4 = bitcast i32 %elem4 to <2 x i16>
  %bitcast5 = bitcast i32 %elem5 to <2 x i16>
  %bitcast6 = bitcast i32 %elem6 to <2 x i16>
  %bitcast7 = bitcast i32 %elem7 to <2 x i16>
  %bitcast8 = bitcast i32 %elem8 to <2 x i16>
  %bitcast9 = bitcast i32 %elem9 to <2 x i16>
  %bitcast10 = bitcast i32 %elem10 to <2 x i16>
  %bitcast11 = bitcast i32 %elem11 to <2 x i16>
  %bitcast12 = bitcast i32 %elem12 to <2 x i16>
  %bitcast13 = bitcast i32 %elem13 to <2 x i16>
  %bitcast14 = bitcast i32 %elem14 to <2 x i16>
  %bitcast15 = bitcast i32 %elem15 to <2 x i16>
  ; First level: mask <1, 3> keeps element 1 of each operand, giving one
  ; halfword per source word, two words per result.
  %low0 = shufflevector <2 x i16> %bitcast0, <2 x i16> %bitcast1,
                        <2 x i32> <i32 1, i32 3>
  %low1 = shufflevector <2 x i16> %bitcast2, <2 x i16> %bitcast3,
                        <2 x i32> <i32 1, i32 3>
  %low2 = shufflevector <2 x i16> %bitcast4, <2 x i16> %bitcast5,
                        <2 x i32> <i32 1, i32 3>
  %low3 = shufflevector <2 x i16> %bitcast6, <2 x i16> %bitcast7,
                        <2 x i32> <i32 1, i32 3>
  %low4 = shufflevector <2 x i16> %bitcast8, <2 x i16> %bitcast9,
                        <2 x i32> <i32 1, i32 3>
  %low5 = shufflevector <2 x i16> %bitcast10, <2 x i16> %bitcast11,
                        <2 x i32> <i32 1, i32 3>
  %low6 = shufflevector <2 x i16> %bitcast12, <2 x i16> %bitcast13,
                        <2 x i32> <i32 1, i32 3>
  %low7 = shufflevector <2 x i16> %bitcast14, <2 x i16> %bitcast15,
                        <2 x i32> <i32 1, i32 3>
  ; Reinterpret each halfword pair as four bytes.
  %bytes0 = bitcast <2 x i16> %low0 to <4 x i8>
  %bytes1 = bitcast <2 x i16> %low1 to <4 x i8>
  %bytes2 = bitcast <2 x i16> %low2 to <4 x i8>
  %bytes3 = bitcast <2 x i16> %low3 to <4 x i8>
  %bytes4 = bitcast <2 x i16> %low4 to <4 x i8>
  %bytes5 = bitcast <2 x i16> %low5 to <4 x i8>
  %bytes6 = bitcast <2 x i16> %low6 to <4 x i8>
  %bytes7 = bitcast <2 x i16> %low7 to <4 x i8>
  ; Second level: odd byte lanes of each pair, giving one byte per source
  ; word, four words (one whole input vector) per result.
  %blow0 = shufflevector <4 x i8> %bytes0, <4 x i8> %bytes1,
                         <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %blow1 = shufflevector <4 x i8> %bytes2, <4 x i8> %bytes3,
                         <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %blow2 = shufflevector <4 x i8> %bytes4, <4 x i8> %bytes5,
                         <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %blow3 = shufflevector <4 x i8> %bytes6, <4 x i8> %bytes7,
                         <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  ; Identity masks: plain concatenation, 4+4 -> 8 and then 8+8 -> 16 lanes.
  %join0 = shufflevector <4 x i8> %blow0, <4 x i8> %blow1,
                         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
                                    i32 4, i32 5, i32 6, i32 7>
  %join1 = shufflevector <4 x i8> %blow2, <4 x i8> %blow3,
                         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
                                    i32 4, i32 5, i32 6, i32 7>
  %ret = shufflevector <8 x i8> %join0, <8 x i8> %join1,
                       <16 x i32> <i32 0, i32 1, i32 2, i32 3,
                                   i32 4, i32 5, i32 6, i32 7,
                                   i32 8, i32 9, i32 10, i32 11,
                                   i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %ret
}

; One way of writing a <2 x i64> -> <16 x i8> pack.
;
; Sixteen doublewords from eight inputs are narrowed to one byte each in
; three levels (doubleword -> word -> halfword -> byte), again starting from
; extracted scalars.  The checks expect seven pack instructions arranged as
; a tree: four first-level packs (any of vpkh/vpkf/vpkg accepted), two
; second-level packs (vpkh or vpkf accepted) and a final VPKH into %v24.
define <16 x i8> @f7(<2 x i64> %val0, <2 x i64> %val1,
                     <2 x i64> %val2, <2 x i64> %val3,
                     <2 x i64> %val4, <2 x i64> %val5,
                     <2 x i64> %val6, <2 x i64> %val7) {
; CHECK-LABEL: f7:
; CHECK-DAG: vpk{{[hfg]}} [[REG1:%v[0-9]+]], %v24, %v26
; CHECK-DAG: vpk{{[hfg]}} [[REG2:%v[0-9]+]], %v28, %v30
; CHECK-DAG: vpk{{[hfg]}} [[REG3:%v[0-9]+]], %v25, %v27
; CHECK-DAG: vpk{{[hfg]}} [[REG4:%v[0-9]+]], %v29, %v31
; CHECK-DAG: vpk{{[hf]}} [[REG5:%v[0-9]+]], [[REG1]], [[REG2]]
; CHECK-DAG: vpk{{[hf]}} [[REG6:%v[0-9]+]], [[REG3]], [[REG4]]
; CHECK: vpkh %v24, [[REG5]], [[REG6]]
; CHECK: br %r14
  ; Split all eight inputs into their scalar doublewords.
  %elem0 = extractelement <2 x i64> %val0, i32 0
  %elem1 = extractelement <2 x i64> %val0, i32 1
  %elem2 = extractelement <2 x i64> %val1, i32 0
  %elem3 = extractelement <2 x i64> %val1, i32 1
  %elem4 = extractelement <2 x i64> %val2, i32 0
  %elem5 = extractelement <2 x i64> %val2, i32 1
  %elem6 = extractelement <2 x i64> %val3, i32 0
  %elem7 = extractelement <2 x i64> %val3, i32 1
  %elem8 = extractelement <2 x i64> %val4, i32 0
  %elem9 = extractelement <2 x i64> %val4, i32 1
  %elem10 = extractelement <2 x i64> %val5, i32 0
  %elem11 = extractelement <2 x i64> %val5, i32 1
  %elem12 = extractelement <2 x i64> %val6, i32 0
  %elem13 = extractelement <2 x i64> %val6, i32 1
  %elem14 = extractelement <2 x i64> %val7, i32 0
  %elem15 = extractelement <2 x i64> %val7, i32 1
  ; Reinterpret each doubleword as a pair of words.
  %bitcast0 = bitcast i64 %elem0 to <2 x i32>
  %bitcast1 = bitcast i64 %elem1 to <2 x i32>
  %bitcast2 = bitcast i64 %elem2 to <2 x i32>
  %bitcast3 = bitcast i64 %elem3 to <2 x i32>
  %bitcast4 = bitcast i64 %elem4 to <2 x i32>
  %bitcast5 = bitcast i64 %elem5 to <2 x i32>
  %bitcast6 = bitcast i64 %elem6 to <2 x i32>
  %bitcast7 = bitcast i64 %elem7 to <2 x i32>
  %bitcast8 = bitcast i64 %elem8 to <2 x i32>
  %bitcast9 = bitcast i64 %elem9 to <2 x i32>
  %bitcast10 = bitcast i64 %elem10 to <2 x i32>
  %bitcast11 = bitcast i64 %elem11 to <2 x i32>
  %bitcast12 = bitcast i64 %elem12 to <2 x i32>
  %bitcast13 = bitcast i64 %elem13 to <2 x i32>
  %bitcast14 = bitcast i64 %elem14 to <2 x i32>
  %bitcast15 = bitcast i64 %elem15 to <2 x i32>
  ; First level: mask <1, 3> keeps element 1 of each operand, giving one
  ; word per doubleword; each result covers one input vector.
  %low0 = shufflevector <2 x i32> %bitcast0, <2 x i32> %bitcast1,
                        <2 x i32> <i32 1, i32 3>
  %low1 = shufflevector <2 x i32> %bitcast2, <2 x i32> %bitcast3,
                        <2 x i32> <i32 1, i32 3>
  %low2 = shufflevector <2 x i32> %bitcast4, <2 x i32> %bitcast5,
                        <2 x i32> <i32 1, i32 3>
  %low3 = shufflevector <2 x i32> %bitcast6, <2 x i32> %bitcast7,
                        <2 x i32> <i32 1, i32 3>
  %low4 = shufflevector <2 x i32> %bitcast8, <2 x i32> %bitcast9,
                        <2 x i32> <i32 1, i32 3>
  %low5 = shufflevector <2 x i32> %bitcast10, <2 x i32> %bitcast11,
                        <2 x i32> <i32 1, i32 3>
  %low6 = shufflevector <2 x i32> %bitcast12, <2 x i32> %bitcast13,
                        <2 x i32> <i32 1, i32 3>
  %low7 = shufflevector <2 x i32> %bitcast14, <2 x i32> %bitcast15,
                        <2 x i32> <i32 1, i32 3>
  ; Reinterpret each word pair as four halfwords.
  %half0 = bitcast <2 x i32> %low0 to <4 x i16>
  %half1 = bitcast <2 x i32> %low1 to <4 x i16>
  %half2 = bitcast <2 x i32> %low2 to <4 x i16>
  %half3 = bitcast <2 x i32> %low3 to <4 x i16>
  %half4 = bitcast <2 x i32> %low4 to <4 x i16>
  %half5 = bitcast <2 x i32> %low5 to <4 x i16>
  %half6 = bitcast <2 x i32> %low6 to <4 x i16>
  %half7 = bitcast <2 x i32> %low7 to <4 x i16>
  ; Second level: odd halfword lanes of each pair; each result covers two
  ; input vectors.
  %hlow0 = shufflevector <4 x i16> %half0, <4 x i16> %half1,
                         <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hlow1 = shufflevector <4 x i16> %half2, <4 x i16> %half3,
                         <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hlow2 = shufflevector <4 x i16> %half4, <4 x i16> %half5,
                         <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hlow3 = shufflevector <4 x i16> %half6, <4 x i16> %half7,
                         <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  ; Reinterpret each group of four halfwords as eight bytes.
  %bytes0 = bitcast <4 x i16> %hlow0 to <8 x i8>
  %bytes1 = bitcast <4 x i16> %hlow1 to <8 x i8>
  %bytes2 = bitcast <4 x i16> %hlow2 to <8 x i8>
  %bytes3 = bitcast <4 x i16> %hlow3 to <8 x i8>
  ; Third level: odd byte lanes of each pair; each result covers four input
  ; vectors.
  %join0 = shufflevector <8 x i8> %bytes0, <8 x i8> %bytes1,
                         <8 x i32> <i32 1, i32 3, i32 5, i32 7,
                                    i32 9, i32 11, i32 13, i32 15>
  %join1 = shufflevector <8 x i8> %bytes2, <8 x i8> %bytes3,
                         <8 x i32> <i32 1, i32 3, i32 5, i32 7,
                                    i32 9, i32 11, i32 13, i32 15>
  ; Identity mask: concatenate the two 8-byte halves.
  %ret = shufflevector <8 x i8> %join0, <8 x i8> %join1,
                       <16 x i32> <i32 0, i32 1, i32 2, i32 3,
                                   i32 4, i32 5, i32 6, i32 7,
                                   i32 8, i32 9, i32 10, i32 11,
                                   i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %ret
}

; Test a <2 x i64> -> <4 x f32> pack in which only individual elements are
; needed.
;
; Four scalar i64 arguments are assembled into two <2 x i64> vectors, viewed
; as <4 x float>, and packed by taking the odd float lanes.  Since the
; result is only ever read one element at a time, the checks require that no
; permute, pack or merge-high instruction is emitted before the scalar
; floating-point arithmetic.
define float @f8(i64 %scalar0, i64 %scalar1, i64 %scalar2, i64 %scalar3) {
; CHECK-LABEL: f8:
; CHECK-NOT: vperm
; CHECK-NOT: vpk
; CHECK-NOT: vmrh
; CHECK: aebr {{%f[0-7]}},
; CHECK: aebr {{%f[0-7]}},
; CHECK: meebr %f0,
; CHECK: br %r14
  ; Put each scalar in lane 0 of its own vector; lane 1 stays undef.
  %vec0 = insertelement <2 x i64> undef, i64 %scalar0, i32 0
  %vec1 = insertelement <2 x i64> undef, i64 %scalar1, i32 0
  %vec2 = insertelement <2 x i64> undef, i64 %scalar2, i32 0
  %vec3 = insertelement <2 x i64> undef, i64 %scalar3, i32 0
  ; Mask <0, 2> takes lane 0 of each operand:
  ; %join0 = (scalar0, scalar1), %join1 = (scalar2, scalar3).
  %join0 = shufflevector <2 x i64> %vec0, <2 x i64> %vec1,
                         <2 x i32> <i32 0, i32 2>
  %join1 = shufflevector <2 x i64> %vec2, <2 x i64> %vec3,
                         <2 x i32> <i32 0, i32 2>
  %bitcast0 = bitcast <2 x i64> %join0 to <4 x float>
  %bitcast1 = bitcast <2 x i64> %join1 to <4 x float>
  ; The pack itself: odd float lanes, one per original scalar.
  %pack = shufflevector <4 x float> %bitcast0, <4 x float> %bitcast1,
                        <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  ; Consume every lane individually so all four scalars stay live.
  %elt0 = extractelement <4 x float> %pack, i32 0
  %elt1 = extractelement <4 x float> %pack, i32 1
  %elt2 = extractelement <4 x float> %pack, i32 2
  %elt3 = extractelement <4 x float> %pack, i32 3
  %add0 = fadd float %elt0, %elt2
  %add1 = fadd float %elt1, %elt3
  %ret = fmul float %add0, %add1
  ret float %ret
}

; Test a <2 x f64> -> <4 x i32> pack in which only individual elements are
; needed.
;
; The mirror image of f8: four scalar double arguments are assembled into
; two <2 x double> vectors, viewed as <4 x i32>, and packed by taking the
; odd word lanes.  The result is only read one element at a time, so the
; checks require that no permute, pack or merge-high instruction is emitted
; before the scalar integer arithmetic.
define i32 @f9(double %scalar0, double %scalar1, double %scalar2,
               double %scalar3) {
; CHECK-LABEL: f9:
; CHECK-NOT: vperm
; CHECK-NOT: vpk
; CHECK-NOT: vmrh
; CHECK: ar {{%r[0-5]}},
; CHECK: ar {{%r[0-5]}},
; CHECK: or %r2,
; CHECK: br %r14
  ; Put each scalar in lane 0 of its own vector; lane 1 stays undef.
  %vec0 = insertelement <2 x double> undef, double %scalar0, i32 0
  %vec1 = insertelement <2 x double> undef, double %scalar1, i32 0
  %vec2 = insertelement <2 x double> undef, double %scalar2, i32 0
  %vec3 = insertelement <2 x double> undef, double %scalar3, i32 0
  ; Mask <0, 2> takes lane 0 of each operand:
  ; %join0 = (scalar0, scalar1), %join1 = (scalar2, scalar3).
  %join0 = shufflevector <2 x double> %vec0, <2 x double> %vec1,
                         <2 x i32> <i32 0, i32 2>
  %join1 = shufflevector <2 x double> %vec2, <2 x double> %vec3,
                         <2 x i32> <i32 0, i32 2>
  %bitcast0 = bitcast <2 x double> %join0 to <4 x i32>
  %bitcast1 = bitcast <2 x double> %join1 to <4 x i32>
  ; The pack itself: odd word lanes, one per original scalar.
  %pack = shufflevector <4 x i32> %bitcast0, <4 x i32> %bitcast1,
                        <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  ; Consume every lane individually so all four scalars stay live.
  %elt0 = extractelement <4 x i32> %pack, i32 0
  %elt1 = extractelement <4 x i32> %pack, i32 1
  %elt2 = extractelement <4 x i32> %pack, i32 2
  %elt3 = extractelement <4 x i32> %pack, i32 3
  %add0 = add i32 %elt0, %elt2
  %add1 = add i32 %elt1, %elt3
  %ret = or i32 %add0, %add1
  ret i32 %ret
}
