; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s

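; Tests that shufflevector operations with "unzip" masks, alone and in
; combination with other operations, are matched to the NEON VUZP instruction.
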
define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vuzp.8 d17, d16
; CHECK-NEXT:    vadd.i8 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

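; The *_Qres variants return both unzipped halves concatenated in a single
; Q-sized result, so a single shuffle mask (the even lanes followed by the odd
; lanes) must be matched to one VUZP.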
define <16 x i8> @vuzpi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_Qres:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vldr [[LDR1:d[0-9]+]], [r1]
; CHECK-NEXT:    vldr [[LDR0:d[0-9]+]], [r0]
; CHECK-NEXT:    vuzp.8 [[LDR0]], [[LDR1]]
; CHECK-NEXT:    vmov r0, r1, [[LDR0]]
; CHECK-NEXT:    vmov r2, r3, [[LDR1]]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <16 x i8> %tmp3
}

define <4 x i16> @vuzpi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpi16:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vuzp.16 d17, d16
; CHECK-NEXT:    vadd.i16 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

define <8 x i16> @vuzpi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpi16_Qres:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vldr [[LDR1:d[0-9]+]], [r1]
; CHECK-NEXT:    vldr [[LDR0:d[0-9]+]], [r0]
; CHECK-NEXT:    vuzp.16 [[LDR0]], [[LDR1]]
; CHECK-NEXT:    vmov r0, r1, [[LDR0]]
; CHECK-NEXT:    vmov r2, r3, [[LDR1]]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  ret <8 x i16> %tmp3
}

; VUZP.32 is equivalent to VTRN.32 for 64-bit vectors.
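; (Unzipping <a0, a1> and <b0, b1> gives <a0, b0> and <a1, b1>, which is
; exactly the transpose, so either instruction could be selected.)
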
define <16 x i8> @vuzpQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpQi8:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.8 q9, q8
; CHECK-NEXT:    vadd.i8 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %tmp5 = add <16 x i8> %tmp3, %tmp4
  ret <16 x i8> %tmp5
}

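; A <32 x i8> result does not fit in registers, so the *_QQres variants return
; it indirectly: r0 holds the result pointer and the operands arrive in r1 and
; r2.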
define <32 x i8> @vuzpQi8_QQres(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpQi8_QQres:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.8 q9, q8
; CHECK-NEXT:    vst1.8 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  ret <32 x i8> %tmp3
}

define <8 x i16> @vuzpQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vadd.i16 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <16 x i16> @vuzpQi16_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16_QQres:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vst1.16 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <16 x i16> %tmp3
}

define <4 x i32> @vuzpQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vuzpQi32:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vadd.i32 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <8 x i32> @vuzpQi32_QQres(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vuzpQi32_QQres:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vst1.32 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  ret <8 x i32> %tmp3
}

define <4 x float> @vuzpQf(<4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vuzpQf:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vadd.f32 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp5 = fadd <4 x float> %tmp3, %tmp4
  ret <4 x float> %tmp5
}

define <8 x float> @vuzpQf_QQres(<4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vuzpQf_QQres:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vst1.32 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  ret <8 x float> %tmp3
}

; Undef shuffle indices should not prevent matching to VUZP:
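; (an undef index is compatible with whatever lane the unzip pattern expects
; in that position)
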
define <8 x i8> @vuzpi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_undef:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vuzp.8 d17, d16
; CHECK-NEXT:    vadd.i8 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <16 x i8> @vuzpi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_undef_Qres:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vldr [[LDR1:d[0-9]+]], [r1]
; CHECK-NEXT:    vldr [[LDR0:d[0-9]+]], [r0]
; CHECK-NEXT:    vuzp.8 [[LDR0]], [[LDR1]]
; CHECK-NEXT:    vmov r0, r1, [[LDR0]]
; CHECK-NEXT:    vmov r2, r3, [[LDR1]]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
  ret <16 x i8> %tmp3
}

define <8 x i16> @vuzpQi16_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16_undef:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vadd.i16 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 undef, i32 undef, i32 11, i32 13, i32 15>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <16 x i16> @vuzpQi16_undef_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16_undef_QQres:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vst1.16 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 undef, i32 undef, i32 11, i32 13, i32 15>
  ret <16 x i16> %tmp3
}

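; The whole lower half of the mask is undef; the defined odd-lane upper half
; alone is enough to match a VUZP.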
define <8 x i16> @vuzp_lower_shufflemask_undef(<4 x i16>* %A, <4 x i16>* %B) {
entry:
  ; CHECK-LABEL: vuzp_lower_shufflemask_undef
  ; CHECK: vuzp
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
  ret <8 x i16> %0
}

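; The lower half of this mask is <0, 0> rather than the even lanes <0, 2>;
; lowering should still use VUZP and must not fall back to VTRN.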
define <4 x i32> @vuzp_lower_shufflemask_zeroed(<2 x i32>* %A, <2 x i32>* %B) {
entry:
  ; CHECK-LABEL: vuzp_lower_shufflemask_zeroed
  ; CHECK-NOT: vtrn
  ; CHECK: vuzp
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 0, i32 0, i32 1, i32 3>
  ret <4 x i32> %0
}

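; The mask <1, 3, 0, 2> asks for the two unzipped halves in reverse order
; (odd lanes first); this should still select VUZP rather than VTRN.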
define void @vuzp_rev_shufflemask_vtrn(<2 x i32>* %A, <2 x i32>* %B, <4 x i32>* %C) {
entry:
  ; CHECK-LABEL: vuzp_rev_shufflemask_vtrn
  ; CHECK-NOT: vtrn
  ; CHECK: vuzp
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
  store <4 x i32> %0, <4 x i32>* %C
  ret void
}

define <8 x i8> @vuzp_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8 x i32> %cmp1) {
; In order to create the select we need to truncate the vcgt result from a
; vector of i32 to a vector of i8. This results in a build_vector with
; mismatched types. We will generate two vmovn.i32 instructions to truncate
; from i32 to i16 and one vuzp to perform the final truncation to i8.
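; (The vuzp keeps the even, i.e. low, byte of each i16 element, which on this
; little-endian target is exactly the truncated i8 value.)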
; CHECK-LABEL: vuzp_trunc
; CHECK: vmovn.i32
; CHECK: vmovn.i32
; CHECK: vuzp
; CHECK: vbsl
  %c = icmp ult <8 x i32> %cmp0, %cmp1
  %res = select <8 x i1> %c, <8 x i8> %in0, <8 x i8> %in1
  ret <8 x i8> %res
}

; Shuffle the result from the compare with a <4 x i8>.
; We need to extend the loaded <4 x i8> to <4 x i16>. Otherwise we wouldn't be
; able to perform the vuzp and get the vbsl mask.
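; (Once both halves are <4 x i16>, a single vuzp can narrow them together into
; the final <8 x i8> select mask.)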
define <8 x i8> @vuzp_trunc_and_shuffle(<8 x i8> %tr0, <8 x i8> %tr1,
                         <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8>* %cmp2_ptr) {
; CHECK-LABEL: vuzp_trunc_and_shuffle
; CHECK: vmovl
; CHECK: vuzp
; CHECK: vbsl
  %cmp2_load = load <4 x i8>, <4 x i8>* %cmp2_ptr, align 4
  %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
  %c0 = icmp ult <4 x i32> %cmp0, %cmp1
  %c = shufflevector <4 x i1> %c0, <4 x i1> %cmp2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
  ret <8 x i8> %rv
}

; Use an undef value for the <4 x i8> that is being shuffled with the compare
; result. This produces a build_vector with some undef operands.
define <8 x i8> @vuzp_trunc_and_shuffle_undef_right(<8 x i8> %tr0, <8 x i8> %tr1,
                         <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8>* %cmp2_ptr) {
; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_right
; CHECK: vuzp
; CHECK: vbsl
  %cmp2_load = load <4 x i8>, <4 x i8>* %cmp2_ptr, align 4
  %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
  %c0 = icmp ult <4 x i32> %cmp0, %cmp1
  %c = shufflevector <4 x i1> %c0, <4 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
  ret <8 x i8> %rv
}

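; Same as above, but with the undef operand on the left of the shuffle.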
define <8 x i8> @vuzp_trunc_and_shuffle_undef_left(<8 x i8> %tr0, <8 x i8> %tr1,
                         <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8>* %cmp2_ptr) {
; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_left
; CHECK: vuzp
; CHECK: vbsl
  %cmp2_load = load <4 x i8>, <4 x i8>* %cmp2_ptr, align 4
  %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
  %c0 = icmp ult <4 x i32> %cmp0, %cmp1
  %c = shufflevector <4 x i1> undef, <4 x i1> %c0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
  ret <8 x i8> %rv
}

; These wide vector types are not legal for NEON, so the backend has to widen
; them with undef elements until it reaches a vector size it can represent.
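; (e.g. the <5 x i32> operands would be widened to <8 x i32> and the <10 x i8>
; values to <16 x i8> during type legalization)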
define <10 x i8> @vuzp_wide_type(<10 x i8> %tr0, <10 x i8> %tr1,
                            <5 x i32> %cmp0, <5 x i32> %cmp1, <5 x i8>* %cmp2_ptr) {
; CHECK-LABEL: vuzp_wide_type
; CHECK: vbsl
  %cmp2_load = load <5 x i8>, <5 x i8>* %cmp2_ptr, align 4
  %cmp2 = trunc <5 x i8> %cmp2_load to <5 x i1>
  %c0 = icmp ult <5 x i32> %cmp0, %cmp1
  %c = shufflevector <5 x i1> %c0, <5 x i1> %cmp2, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
  %rv = select <10 x i1> %c, <10 x i8> %tr0, <10 x i8> %tr1
  ret <10 x i8> %rv
}