1; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
2
; vext of two <8 x i8>: operands round-trip through stack slots, yet the
; shuffle must still select a single byte-wise EXT with byte offset 1.
define void @test_vext_s8() nounwind ssp {
  ; CHECK-LABEL: test_vext_s8:
  ; CHECK: {{ext.8.*#1}}
  %xS8x8 = alloca <8 x i8>, align 8
  %__a = alloca <8 x i8>, align 8
  %__b = alloca <8 x i8>, align 8
  %tmp = load <8 x i8>, <8 x i8>* %xS8x8, align 8
  store <8 x i8> %tmp, <8 x i8>* %__a, align 8
  %tmp1 = load <8 x i8>, <8 x i8>* %xS8x8, align 8
  store <8 x i8> %tmp1, <8 x i8>* %__b, align 8
  %tmp2 = load <8 x i8>, <8 x i8>* %__a, align 8
  %tmp3 = load <8 x i8>, <8 x i8>* %__b, align 8
  ; Mask 1..8 spans both operands sequentially -> EXT, offset 1 byte.
  %vext = shufflevector <8 x i8> %tmp2, <8 x i8> %tmp3, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  store <8 x i8> %vext, <8 x i8>* %xS8x8, align 8
  ret void
}
19
; Same as test_vext_s8 but with lane offset 2 -> EXT byte offset 2.
define void @test_vext_u8() nounwind ssp {
  ; CHECK-LABEL: test_vext_u8:
  ; CHECK: {{ext.8.*#2}}
  %xU8x8 = alloca <8 x i8>, align 8
  %__a = alloca <8 x i8>, align 8
  %__b = alloca <8 x i8>, align 8
  %tmp = load <8 x i8>, <8 x i8>* %xU8x8, align 8
  store <8 x i8> %tmp, <8 x i8>* %__a, align 8
  %tmp1 = load <8 x i8>, <8 x i8>* %xU8x8, align 8
  store <8 x i8> %tmp1, <8 x i8>* %__b, align 8
  %tmp2 = load <8 x i8>, <8 x i8>* %__a, align 8
  %tmp3 = load <8 x i8>, <8 x i8>* %__b, align 8
  ; Sequential mask 2..9 -> EXT, offset 2 bytes.
  %vext = shufflevector <8 x i8> %tmp2, <8 x i8> %tmp3, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
  store <8 x i8> %vext, <8 x i8>* %xU8x8, align 8
  ret void
}
36
; Same shape with lane offset 3 -> EXT byte offset 3.
define void @test_vext_p8() nounwind ssp {
  ; CHECK-LABEL: test_vext_p8:
  ; CHECK: {{ext.8.*#3}}
  %xP8x8 = alloca <8 x i8>, align 8
  %__a = alloca <8 x i8>, align 8
  %__b = alloca <8 x i8>, align 8
  %tmp = load <8 x i8>, <8 x i8>* %xP8x8, align 8
  store <8 x i8> %tmp, <8 x i8>* %__a, align 8
  %tmp1 = load <8 x i8>, <8 x i8>* %xP8x8, align 8
  store <8 x i8> %tmp1, <8 x i8>* %__b, align 8
  %tmp2 = load <8 x i8>, <8 x i8>* %__a, align 8
  %tmp3 = load <8 x i8>, <8 x i8>* %__b, align 8
  ; Sequential mask 3..10 -> EXT, offset 3 bytes.
  %vext = shufflevector <8 x i8> %tmp2, <8 x i8> %tmp3, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
  store <8 x i8> %vext, <8 x i8>* %xP8x8, align 8
  ret void
}
53
; <4 x i16> vext, lane offset 1 -> EXT byte offset 2 (1 lane * 2 bytes).
; The no-op bitcast round-trip through <8 x i8> mirrors the front-end's
; vext intrinsic lowering and must not block EXT selection.
define void @test_vext_s16() nounwind ssp {
  ; CHECK-LABEL: test_vext_s16:
  ; CHECK: {{ext.8.*#2}}
  %xS16x4 = alloca <4 x i16>, align 8
  %__a = alloca <4 x i16>, align 8
  %__b = alloca <4 x i16>, align 8
  %tmp = load <4 x i16>, <4 x i16>* %xS16x4, align 8
  store <4 x i16> %tmp, <4 x i16>* %__a, align 8
  %tmp1 = load <4 x i16>, <4 x i16>* %xS16x4, align 8
  store <4 x i16> %tmp1, <4 x i16>* %__b, align 8
  %tmp2 = load <4 x i16>, <4 x i16>* %__a, align 8
  %tmp3 = bitcast <4 x i16> %tmp2 to <8 x i8>
  %tmp4 = load <4 x i16>, <4 x i16>* %__b, align 8
  %tmp5 = bitcast <4 x i16> %tmp4 to <8 x i8>
  %tmp6 = bitcast <8 x i8> %tmp3 to <4 x i16>
  %tmp7 = bitcast <8 x i8> %tmp5 to <4 x i16>
  %vext = shufflevector <4 x i16> %tmp6, <4 x i16> %tmp7, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
  store <4 x i16> %vext, <4 x i16>* %xS16x4, align 8
  ret void
}
74
; <4 x i16> vext, lane offset 2 -> EXT byte offset 4 (2 lanes * 2 bytes).
define void @test_vext_u16() nounwind ssp {
  ; CHECK-LABEL: test_vext_u16:
  ; CHECK: {{ext.8.*#4}}
  %xU16x4 = alloca <4 x i16>, align 8
  %__a = alloca <4 x i16>, align 8
  %__b = alloca <4 x i16>, align 8
  %tmp = load <4 x i16>, <4 x i16>* %xU16x4, align 8
  store <4 x i16> %tmp, <4 x i16>* %__a, align 8
  %tmp1 = load <4 x i16>, <4 x i16>* %xU16x4, align 8
  store <4 x i16> %tmp1, <4 x i16>* %__b, align 8
  %tmp2 = load <4 x i16>, <4 x i16>* %__a, align 8
  %tmp3 = bitcast <4 x i16> %tmp2 to <8 x i8>
  %tmp4 = load <4 x i16>, <4 x i16>* %__b, align 8
  %tmp5 = bitcast <4 x i16> %tmp4 to <8 x i8>
  %tmp6 = bitcast <8 x i8> %tmp3 to <4 x i16>
  %tmp7 = bitcast <8 x i8> %tmp5 to <4 x i16>
  %vext = shufflevector <4 x i16> %tmp6, <4 x i16> %tmp7, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  store <4 x i16> %vext, <4 x i16>* %xU16x4, align 8
  ret void
}
95
; <4 x i16> vext, lane offset 3 -> EXT byte offset 6 (3 lanes * 2 bytes).
define void @test_vext_p16() nounwind ssp {
  ; CHECK-LABEL: test_vext_p16:
  ; CHECK: {{ext.8.*#6}}
  %xP16x4 = alloca <4 x i16>, align 8
  %__a = alloca <4 x i16>, align 8
  %__b = alloca <4 x i16>, align 8
  %tmp = load <4 x i16>, <4 x i16>* %xP16x4, align 8
  store <4 x i16> %tmp, <4 x i16>* %__a, align 8
  %tmp1 = load <4 x i16>, <4 x i16>* %xP16x4, align 8
  store <4 x i16> %tmp1, <4 x i16>* %__b, align 8
  %tmp2 = load <4 x i16>, <4 x i16>* %__a, align 8
  %tmp3 = bitcast <4 x i16> %tmp2 to <8 x i8>
  %tmp4 = load <4 x i16>, <4 x i16>* %__b, align 8
  %tmp5 = bitcast <4 x i16> %tmp4 to <8 x i8>
  %tmp6 = bitcast <8 x i8> %tmp3 to <4 x i16>
  %tmp7 = bitcast <8 x i8> %tmp5 to <4 x i16>
  %vext = shufflevector <4 x i16> %tmp6, <4 x i16> %tmp7, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
  store <4 x i16> %vext, <4 x i16>* %xP16x4, align 8
  ret void
}
116
; <2 x i32> vext, lane offset 1 -> EXT byte offset 4 (1 lane * 4 bytes).
define void @test_vext_s32() nounwind ssp {
  ; CHECK-LABEL: test_vext_s32:
  ; CHECK: {{ext.8.*#4}}
  %xS32x2 = alloca <2 x i32>, align 8
  %__a = alloca <2 x i32>, align 8
  %__b = alloca <2 x i32>, align 8
  %tmp = load <2 x i32>, <2 x i32>* %xS32x2, align 8
  store <2 x i32> %tmp, <2 x i32>* %__a, align 8
  %tmp1 = load <2 x i32>, <2 x i32>* %xS32x2, align 8
  store <2 x i32> %tmp1, <2 x i32>* %__b, align 8
  %tmp2 = load <2 x i32>, <2 x i32>* %__a, align 8
  %tmp3 = bitcast <2 x i32> %tmp2 to <8 x i8>
  %tmp4 = load <2 x i32>, <2 x i32>* %__b, align 8
  %tmp5 = bitcast <2 x i32> %tmp4 to <8 x i8>
  %tmp6 = bitcast <8 x i8> %tmp3 to <2 x i32>
  %tmp7 = bitcast <8 x i8> %tmp5 to <2 x i32>
  %vext = shufflevector <2 x i32> %tmp6, <2 x i32> %tmp7, <2 x i32> <i32 1, i32 2>
  store <2 x i32> %vext, <2 x i32>* %xS32x2, align 8
  ret void
}
137
; <2 x i32> vext, unsigned flavor; same mask as s32 -> EXT byte offset 4.
define void @test_vext_u32() nounwind ssp {
  ; CHECK-LABEL: test_vext_u32:
  ; CHECK: {{ext.8.*#4}}
  %xU32x2 = alloca <2 x i32>, align 8
  %__a = alloca <2 x i32>, align 8
  %__b = alloca <2 x i32>, align 8
  %tmp = load <2 x i32>, <2 x i32>* %xU32x2, align 8
  store <2 x i32> %tmp, <2 x i32>* %__a, align 8
  %tmp1 = load <2 x i32>, <2 x i32>* %xU32x2, align 8
  store <2 x i32> %tmp1, <2 x i32>* %__b, align 8
  %tmp2 = load <2 x i32>, <2 x i32>* %__a, align 8
  %tmp3 = bitcast <2 x i32> %tmp2 to <8 x i8>
  %tmp4 = load <2 x i32>, <2 x i32>* %__b, align 8
  %tmp5 = bitcast <2 x i32> %tmp4 to <8 x i8>
  %tmp6 = bitcast <8 x i8> %tmp3 to <2 x i32>
  %tmp7 = bitcast <8 x i8> %tmp5 to <2 x i32>
  %vext = shufflevector <2 x i32> %tmp6, <2 x i32> %tmp7, <2 x i32> <i32 1, i32 2>
  store <2 x i32> %vext, <2 x i32>* %xU32x2, align 8
  ret void
}
158
; <2 x float> vext: EXT is bit-pattern agnostic, so FP lanes use the same
; byte-wise EXT with offset 4 (1 lane * 4 bytes).
define void @test_vext_f32() nounwind ssp {
  ; CHECK-LABEL: test_vext_f32:
  ; CHECK: {{ext.8.*#4}}
  %xF32x2 = alloca <2 x float>, align 8
  %__a = alloca <2 x float>, align 8
  %__b = alloca <2 x float>, align 8
  %tmp = load <2 x float>, <2 x float>* %xF32x2, align 8
  store <2 x float> %tmp, <2 x float>* %__a, align 8
  %tmp1 = load <2 x float>, <2 x float>* %xF32x2, align 8
  store <2 x float> %tmp1, <2 x float>* %__b, align 8
  %tmp2 = load <2 x float>, <2 x float>* %__a, align 8
  %tmp3 = bitcast <2 x float> %tmp2 to <8 x i8>
  %tmp4 = load <2 x float>, <2 x float>* %__b, align 8
  %tmp5 = bitcast <2 x float> %tmp4 to <8 x i8>
  %tmp6 = bitcast <8 x i8> %tmp3 to <2 x float>
  %tmp7 = bitcast <8 x i8> %tmp5 to <2 x float>
  %vext = shufflevector <2 x float> %tmp6, <2 x float> %tmp7, <2 x i32> <i32 1, i32 2>
  store <2 x float> %vext, <2 x float>* %xF32x2, align 8
  ret void
}
179
; <1 x i64> vext degenerates to selecting the whole second operand, so no
; EXT is expected; CHECK_FIXME (not a live FileCheck prefix) records the
; would-be pattern without enforcing it.
define void @test_vext_s64() nounwind ssp {
  ; CHECK-LABEL: test_vext_s64:
  ; CHECK_FIXME: {{ext.8.*#1}}
  ; this just turns into a load of the second element
  %xS64x1 = alloca <1 x i64>, align 8
  %__a = alloca <1 x i64>, align 8
  %__b = alloca <1 x i64>, align 8
  %tmp = load <1 x i64>, <1 x i64>* %xS64x1, align 8
  store <1 x i64> %tmp, <1 x i64>* %__a, align 8
  %tmp1 = load <1 x i64>, <1 x i64>* %xS64x1, align 8
  store <1 x i64> %tmp1, <1 x i64>* %__b, align 8
  %tmp2 = load <1 x i64>, <1 x i64>* %__a, align 8
  %tmp3 = bitcast <1 x i64> %tmp2 to <8 x i8>
  %tmp4 = load <1 x i64>, <1 x i64>* %__b, align 8
  %tmp5 = bitcast <1 x i64> %tmp4 to <8 x i8>
  %tmp6 = bitcast <8 x i8> %tmp3 to <1 x i64>
  %tmp7 = bitcast <8 x i8> %tmp5 to <1 x i64>
  ; Mask <1> picks element 0 of the second operand, i.e. %tmp7 itself.
  %vext = shufflevector <1 x i64> %tmp6, <1 x i64> %tmp7, <1 x i32> <i32 1>
  store <1 x i64> %vext, <1 x i64>* %xS64x1, align 8
  ret void
}
201
; Unsigned twin of test_vext_s64: the <1 x i64> shuffle is just the second
; operand, so only the label is checked (CHECK_FIXME is deliberately dead).
define void @test_vext_u64() nounwind ssp {
  ; CHECK-LABEL: test_vext_u64:
  ; CHECK_FIXME: {{ext.8.*#1}}
  ; this is turned into a simple load of the 2nd element
  %xU64x1 = alloca <1 x i64>, align 8
  %__a = alloca <1 x i64>, align 8
  %__b = alloca <1 x i64>, align 8
  %tmp = load <1 x i64>, <1 x i64>* %xU64x1, align 8
  store <1 x i64> %tmp, <1 x i64>* %__a, align 8
  %tmp1 = load <1 x i64>, <1 x i64>* %xU64x1, align 8
  store <1 x i64> %tmp1, <1 x i64>* %__b, align 8
  %tmp2 = load <1 x i64>, <1 x i64>* %__a, align 8
  %tmp3 = bitcast <1 x i64> %tmp2 to <8 x i8>
  %tmp4 = load <1 x i64>, <1 x i64>* %__b, align 8
  %tmp5 = bitcast <1 x i64> %tmp4 to <8 x i8>
  %tmp6 = bitcast <8 x i8> %tmp3 to <1 x i64>
  %tmp7 = bitcast <8 x i8> %tmp5 to <1 x i64>
  %vext = shufflevector <1 x i64> %tmp6, <1 x i64> %tmp7, <1 x i32> <i32 1>
  store <1 x i64> %vext, <1 x i64>* %xU64x1, align 8
  ret void
}
223
; 128-bit vextq of two <16 x i8>: sequential mask 4..19 -> 16-byte EXT,
; byte offset 4.
define void @test_vextq_s8() nounwind ssp {
  ; CHECK-LABEL: test_vextq_s8:
  ; CHECK: {{ext.16.*#4}}
  %xS8x16 = alloca <16 x i8>, align 16
  %__a = alloca <16 x i8>, align 16
  %__b = alloca <16 x i8>, align 16
  %tmp = load <16 x i8>, <16 x i8>* %xS8x16, align 16
  store <16 x i8> %tmp, <16 x i8>* %__a, align 16
  %tmp1 = load <16 x i8>, <16 x i8>* %xS8x16, align 16
  store <16 x i8> %tmp1, <16 x i8>* %__b, align 16
  %tmp2 = load <16 x i8>, <16 x i8>* %__a, align 16
  %tmp3 = load <16 x i8>, <16 x i8>* %__b, align 16
  %vext = shufflevector <16 x i8> %tmp2, <16 x i8> %tmp3, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
  store <16 x i8> %vext, <16 x i8>* %xS8x16, align 16
  ret void
}
240
; <16 x i8> vextq, mask 5..20 -> 16-byte EXT, byte offset 5.
define void @test_vextq_u8() nounwind ssp {
  ; CHECK-LABEL: test_vextq_u8:
  ; CHECK: {{ext.16.*#5}}
  %xU8x16 = alloca <16 x i8>, align 16
  %__a = alloca <16 x i8>, align 16
  %__b = alloca <16 x i8>, align 16
  %tmp = load <16 x i8>, <16 x i8>* %xU8x16, align 16
  store <16 x i8> %tmp, <16 x i8>* %__a, align 16
  %tmp1 = load <16 x i8>, <16 x i8>* %xU8x16, align 16
  store <16 x i8> %tmp1, <16 x i8>* %__b, align 16
  %tmp2 = load <16 x i8>, <16 x i8>* %__a, align 16
  %tmp3 = load <16 x i8>, <16 x i8>* %__b, align 16
  %vext = shufflevector <16 x i8> %tmp2, <16 x i8> %tmp3, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
  store <16 x i8> %vext, <16 x i8>* %xU8x16, align 16
  ret void
}
257
; <16 x i8> vextq, mask 6..21 -> 16-byte EXT, byte offset 6.
define void @test_vextq_p8() nounwind ssp {
  ; CHECK-LABEL: test_vextq_p8:
  ; CHECK: {{ext.16.*#6}}
  %xP8x16 = alloca <16 x i8>, align 16
  %__a = alloca <16 x i8>, align 16
  %__b = alloca <16 x i8>, align 16
  %tmp = load <16 x i8>, <16 x i8>* %xP8x16, align 16
  store <16 x i8> %tmp, <16 x i8>* %__a, align 16
  %tmp1 = load <16 x i8>, <16 x i8>* %xP8x16, align 16
  store <16 x i8> %tmp1, <16 x i8>* %__b, align 16
  %tmp2 = load <16 x i8>, <16 x i8>* %__a, align 16
  %tmp3 = load <16 x i8>, <16 x i8>* %__b, align 16
  %vext = shufflevector <16 x i8> %tmp2, <16 x i8> %tmp3, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21>
  store <16 x i8> %vext, <16 x i8>* %xP8x16, align 16
  ret void
}
274
; <8 x i16> vextq, lane offset 7 -> EXT byte offset 14 (7 lanes * 2 bytes).
; Bitcast round-trip through <16 x i8> mirrors front-end vextq lowering.
define void @test_vextq_s16() nounwind ssp {
  ; CHECK-LABEL: test_vextq_s16:
  ; CHECK: {{ext.16.*#14}}
  %xS16x8 = alloca <8 x i16>, align 16
  %__a = alloca <8 x i16>, align 16
  %__b = alloca <8 x i16>, align 16
  %tmp = load <8 x i16>, <8 x i16>* %xS16x8, align 16
  store <8 x i16> %tmp, <8 x i16>* %__a, align 16
  %tmp1 = load <8 x i16>, <8 x i16>* %xS16x8, align 16
  store <8 x i16> %tmp1, <8 x i16>* %__b, align 16
  %tmp2 = load <8 x i16>, <8 x i16>* %__a, align 16
  %tmp3 = bitcast <8 x i16> %tmp2 to <16 x i8>
  %tmp4 = load <8 x i16>, <8 x i16>* %__b, align 16
  %tmp5 = bitcast <8 x i16> %tmp4 to <16 x i8>
  %tmp6 = bitcast <16 x i8> %tmp3 to <8 x i16>
  %tmp7 = bitcast <16 x i8> %tmp5 to <8 x i16>
  %vext = shufflevector <8 x i16> %tmp6, <8 x i16> %tmp7, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
  store <8 x i16> %vext, <8 x i16>* %xS16x8, align 16
  ret void
}
295
; <8 x i16> vextq, lane offset 4 -> EXT byte offset 8 (4 lanes * 2 bytes).
define void @test_vextq_u16() nounwind ssp {
  ; CHECK-LABEL: test_vextq_u16:
  ; CHECK: {{ext.16.*#8}}
  %xU16x8 = alloca <8 x i16>, align 16
  %__a = alloca <8 x i16>, align 16
  %__b = alloca <8 x i16>, align 16
  %tmp = load <8 x i16>, <8 x i16>* %xU16x8, align 16
  store <8 x i16> %tmp, <8 x i16>* %__a, align 16
  %tmp1 = load <8 x i16>, <8 x i16>* %xU16x8, align 16
  store <8 x i16> %tmp1, <8 x i16>* %__b, align 16
  %tmp2 = load <8 x i16>, <8 x i16>* %__a, align 16
  %tmp3 = bitcast <8 x i16> %tmp2 to <16 x i8>
  %tmp4 = load <8 x i16>, <8 x i16>* %__b, align 16
  %tmp5 = bitcast <8 x i16> %tmp4 to <16 x i8>
  %tmp6 = bitcast <16 x i8> %tmp3 to <8 x i16>
  %tmp7 = bitcast <16 x i8> %tmp5 to <8 x i16>
  %vext = shufflevector <8 x i16> %tmp6, <8 x i16> %tmp7, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  store <8 x i16> %vext, <8 x i16>* %xU16x8, align 16
  ret void
}
316
; <8 x i16> vextq, lane offset 5 -> EXT byte offset 10 (5 lanes * 2 bytes).
define void @test_vextq_p16() nounwind ssp {
  ; CHECK-LABEL: test_vextq_p16:
  ; CHECK: {{ext.16.*#10}}
  %xP16x8 = alloca <8 x i16>, align 16
  %__a = alloca <8 x i16>, align 16
  %__b = alloca <8 x i16>, align 16
  %tmp = load <8 x i16>, <8 x i16>* %xP16x8, align 16
  store <8 x i16> %tmp, <8 x i16>* %__a, align 16
  %tmp1 = load <8 x i16>, <8 x i16>* %xP16x8, align 16
  store <8 x i16> %tmp1, <8 x i16>* %__b, align 16
  %tmp2 = load <8 x i16>, <8 x i16>* %__a, align 16
  %tmp3 = bitcast <8 x i16> %tmp2 to <16 x i8>
  %tmp4 = load <8 x i16>, <8 x i16>* %__b, align 16
  %tmp5 = bitcast <8 x i16> %tmp4 to <16 x i8>
  %tmp6 = bitcast <16 x i8> %tmp3 to <8 x i16>
  %tmp7 = bitcast <16 x i8> %tmp5 to <8 x i16>
  %vext = shufflevector <8 x i16> %tmp6, <8 x i16> %tmp7, <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
  store <8 x i16> %vext, <8 x i16>* %xP16x8, align 16
  ret void
}
337
; <4 x i32> vextq, lane offset 1 -> EXT byte offset 4 (1 lane * 4 bytes).
define void @test_vextq_s32() nounwind ssp {
  ; CHECK-LABEL: test_vextq_s32:
  ; CHECK: {{ext.16.*#4}}
  %xS32x4 = alloca <4 x i32>, align 16
  %__a = alloca <4 x i32>, align 16
  %__b = alloca <4 x i32>, align 16
  %tmp = load <4 x i32>, <4 x i32>* %xS32x4, align 16
  store <4 x i32> %tmp, <4 x i32>* %__a, align 16
  %tmp1 = load <4 x i32>, <4 x i32>* %xS32x4, align 16
  store <4 x i32> %tmp1, <4 x i32>* %__b, align 16
  %tmp2 = load <4 x i32>, <4 x i32>* %__a, align 16
  %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
  %tmp4 = load <4 x i32>, <4 x i32>* %__b, align 16
  %tmp5 = bitcast <4 x i32> %tmp4 to <16 x i8>
  %tmp6 = bitcast <16 x i8> %tmp3 to <4 x i32>
  %tmp7 = bitcast <16 x i8> %tmp5 to <4 x i32>
  %vext = shufflevector <4 x i32> %tmp6, <4 x i32> %tmp7, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
  store <4 x i32> %vext, <4 x i32>* %xS32x4, align 16
  ret void
}
358
; <4 x i32> vextq, lane offset 2 -> EXT byte offset 8 (2 lanes * 4 bytes).
define void @test_vextq_u32() nounwind ssp {
  ; CHECK-LABEL: test_vextq_u32:
  ; CHECK: {{ext.16.*#8}}
  %xU32x4 = alloca <4 x i32>, align 16
  %__a = alloca <4 x i32>, align 16
  %__b = alloca <4 x i32>, align 16
  %tmp = load <4 x i32>, <4 x i32>* %xU32x4, align 16
  store <4 x i32> %tmp, <4 x i32>* %__a, align 16
  %tmp1 = load <4 x i32>, <4 x i32>* %xU32x4, align 16
  store <4 x i32> %tmp1, <4 x i32>* %__b, align 16
  %tmp2 = load <4 x i32>, <4 x i32>* %__a, align 16
  %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
  %tmp4 = load <4 x i32>, <4 x i32>* %__b, align 16
  %tmp5 = bitcast <4 x i32> %tmp4 to <16 x i8>
  %tmp6 = bitcast <16 x i8> %tmp3 to <4 x i32>
  %tmp7 = bitcast <16 x i8> %tmp5 to <4 x i32>
  %vext = shufflevector <4 x i32> %tmp6, <4 x i32> %tmp7, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  store <4 x i32> %vext, <4 x i32>* %xU32x4, align 16
  ret void
}
379
; <4 x float> vextq, lane offset 3 -> EXT byte offset 12 (3 lanes * 4 bytes);
; EXT operates on raw bytes, so FP element type is irrelevant.
define void @test_vextq_f32() nounwind ssp {
  ; CHECK-LABEL: test_vextq_f32:
  ; CHECK: {{ext.16.*#12}}
  %xF32x4 = alloca <4 x float>, align 16
  %__a = alloca <4 x float>, align 16
  %__b = alloca <4 x float>, align 16
  %tmp = load <4 x float>, <4 x float>* %xF32x4, align 16
  store <4 x float> %tmp, <4 x float>* %__a, align 16
  %tmp1 = load <4 x float>, <4 x float>* %xF32x4, align 16
  store <4 x float> %tmp1, <4 x float>* %__b, align 16
  %tmp2 = load <4 x float>, <4 x float>* %__a, align 16
  %tmp3 = bitcast <4 x float> %tmp2 to <16 x i8>
  %tmp4 = load <4 x float>, <4 x float>* %__b, align 16
  %tmp5 = bitcast <4 x float> %tmp4 to <16 x i8>
  %tmp6 = bitcast <16 x i8> %tmp3 to <4 x float>
  %tmp7 = bitcast <16 x i8> %tmp5 to <4 x float>
  %vext = shufflevector <4 x float> %tmp6, <4 x float> %tmp7, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
  store <4 x float> %vext, <4 x float>* %xF32x4, align 16
  ret void
}
400
; <2 x i64> vextq, lane offset 1 -> EXT byte offset 8 (1 lane * 8 bytes).
define void @test_vextq_s64() nounwind ssp {
  ; CHECK-LABEL: test_vextq_s64:
  ; CHECK: {{ext.16.*#8}}
  %xS64x2 = alloca <2 x i64>, align 16
  %__a = alloca <2 x i64>, align 16
  %__b = alloca <2 x i64>, align 16
  %tmp = load <2 x i64>, <2 x i64>* %xS64x2, align 16
  store <2 x i64> %tmp, <2 x i64>* %__a, align 16
  %tmp1 = load <2 x i64>, <2 x i64>* %xS64x2, align 16
  store <2 x i64> %tmp1, <2 x i64>* %__b, align 16
  %tmp2 = load <2 x i64>, <2 x i64>* %__a, align 16
  %tmp3 = bitcast <2 x i64> %tmp2 to <16 x i8>
  %tmp4 = load <2 x i64>, <2 x i64>* %__b, align 16
  %tmp5 = bitcast <2 x i64> %tmp4 to <16 x i8>
  %tmp6 = bitcast <16 x i8> %tmp3 to <2 x i64>
  %tmp7 = bitcast <16 x i8> %tmp5 to <2 x i64>
  %vext = shufflevector <2 x i64> %tmp6, <2 x i64> %tmp7, <2 x i32> <i32 1, i32 2>
  store <2 x i64> %vext, <2 x i64>* %xS64x2, align 16
  ret void
}
421
; Unsigned twin of test_vextq_s64; same mask -> EXT byte offset 8.
define void @test_vextq_u64() nounwind ssp {
  ; CHECK-LABEL: test_vextq_u64:
  ; CHECK: {{ext.16.*#8}}
  %xU64x2 = alloca <2 x i64>, align 16
  %__a = alloca <2 x i64>, align 16
  %__b = alloca <2 x i64>, align 16
  %tmp = load <2 x i64>, <2 x i64>* %xU64x2, align 16
  store <2 x i64> %tmp, <2 x i64>* %__a, align 16
  %tmp1 = load <2 x i64>, <2 x i64>* %xU64x2, align 16
  store <2 x i64> %tmp1, <2 x i64>* %__b, align 16
  %tmp2 = load <2 x i64>, <2 x i64>* %__a, align 16
  %tmp3 = bitcast <2 x i64> %tmp2 to <16 x i8>
  %tmp4 = load <2 x i64>, <2 x i64>* %__b, align 16
  %tmp5 = bitcast <2 x i64> %tmp4 to <16 x i8>
  %tmp6 = bitcast <16 x i8> %tmp3 to <2 x i64>
  %tmp7 = bitcast <16 x i8> %tmp5 to <2 x i64>
  %vext = shufflevector <2 x i64> %tmp6, <2 x i64> %tmp7, <2 x i32> <i32 1, i32 2>
  store <2 x i64> %vext, <2 x i64>* %xU64x2, align 16
  ret void
}
442
443; shuffles with an undef second operand can use an EXT also so long as the
444; indices wrap and stay sequential.
445; rdar://12051674
; One-operand rotate: mask 8..15,0..7 with an undef second operand is a
; wrap-around sequential pattern, selectable as EXT of the vector with
; itself at byte offset 8.
define <16 x i8> @vext1(<16 x i8> %_a) nounwind {
; CHECK-LABEL: vext1:
; CHECK: ext.16b  v0, v0, v0, #8
  %vext = shufflevector <16 x i8> %_a, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i8> %vext
}
452
453; <rdar://problem/12212062>
; Two independent <2 x i64> lane swaps (mask <1,0> with undef second
; operand) each lower to a self-EXT at byte offset 8, then the results add.
define <2 x i64> @vext2(<2 x i64> %p0, <2 x i64> %p1) nounwind readnone ssp {
entry:
; CHECK-LABEL: vext2:
; CHECK: ext.16b v1, v1, v1, #8
; CHECK: ext.16b v0, v0, v0, #8
; CHECK: add.2d  v0, v0, v1
  %t0 = shufflevector <2 x i64> %p1, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
  %t1 = shufflevector <2 x i64> %p0, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
  %t2 = add <2 x i64> %t1, %t0
  ret <2 x i64> %t2
}
465