; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
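; VUZP (vector unzip) de-interleaves two vectors. As a reference sketch of the
; instruction's semantics (from the ARM architecture description; illustrative
; only, not something this test asserts):
;   vuzp.8 d0, d1 with d0 = [a0 a1 a2 a3 a4 a5 a6 a7]
;                      d1 = [b0 b1 b2 b3 b4 b5 b6 b7]
;   yields  d0 = [a0 a2 a4 a6 b0 b2 b4 b6]  (even lanes)
;           d1 = [a1 a3 a5 a7 b1 b3 b5 b7]  (odd lanes)
; which is exactly the pair of even/odd shufflevector masks used below.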

define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vuzp.8 d17, d16
; CHECK-NEXT:    vmul.i8 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %tmp5 = mul <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <16 x i8> @vuzpi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_Qres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vuzp.8 d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <16 x i8> %tmp3
}

define <4 x i16> @vuzpi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vuzp.16 d17, d16
; CHECK-NEXT:    vmul.i16 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp5 = mul <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

define <8 x i16> @vuzpi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpi16_Qres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vuzp.16 d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  ret <8 x i16> %tmp3
}

; VUZP.32 is equivalent to VTRN.32 for 64-bit vectors.
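; With only two 32-bit lanes per d register, the unzip and transpose
; permutations coincide: a 32-bit unzip of two d registers would produce
;   d0 = [d0[0] d1[0]],  d1 = [d0[1] d1[1]]
; which is exactly what vtrn.32 d0, d1 does, so the backend emits VTRN.32
; for such masks and no 64-bit VUZP.32 case appears in this file.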

define <16 x i8> @vuzpQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpQi8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.8 q9, q8
; CHECK-NEXT:    vadd.i8 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %tmp5 = add <16 x i8> %tmp3, %tmp4
  ret <16 x i8> %tmp5
}

define <32 x i8> @vuzpQi8_QQres(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpQi8_QQres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.8 q9, q8
; CHECK-NEXT:    vst1.8 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  ret <32 x i8> %tmp3
}

define <8 x i16> @vuzpQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vadd.i16 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <16 x i16> @vuzpQi16_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16_QQres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vst1.16 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <16 x i16> %tmp3
}

define <4 x i32> @vuzpQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vuzpQi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vadd.i32 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <8 x i32> @vuzpQi32_QQres(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vuzpQi32_QQres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vst1.32 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  ret <8 x i32> %tmp3
}

define <4 x float> @vuzpQf(<4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vuzpQf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vadd.f32 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp5 = fadd <4 x float> %tmp3, %tmp4
  ret <4 x float> %tmp5
}

define <8 x float> @vuzpQf_QQres(<4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vuzpQf_QQres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vst1.32 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  ret <8 x float> %tmp3
}

; Undef shuffle indices should not prevent matching to VUZP:
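; For example, the first mask below, <0, 2, undef, undef, 8, 10, 12, 14>, still
; matches the even-lane pattern in its defined entries, so it should still be
; lowered to VUZP rather than to a generic shuffle expansion.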

define <8 x i8> @vuzpi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_undef:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vuzp.8 d17, d16
; CHECK-NEXT:    vmul.i8 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
  %tmp5 = mul <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <16 x i8> @vuzpi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_undef_Qres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vuzp.8 d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
  ret <16 x i8> %tmp3
}

define <8 x i16> @vuzpQi16_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16_undef:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vadd.i16 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 undef, i32 undef, i32 11, i32 13, i32 15>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <16 x i16> @vuzpQi16_undef_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16_undef_QQres:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vst1.16 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 undef, i32 undef, i32 11, i32 13, i32 15>
  ret <16 x i16> %tmp3
}

define <8 x i16> @vuzp_lower_shufflemask_undef(<4 x i16>* %A, <4 x i16>* %B) {
; CHECK-LABEL: vuzp_lower_shufflemask_undef:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d18, [r0]
; CHECK-NEXT:    vuzp.16 d18, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
entry:
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
  ret <8 x i16> %0
}

define <4 x i32> @vuzp_lower_shufflemask_zeroed(<2 x i32>* %A, <2 x i32>* %B) {
; CHECK-LABEL: vuzp_lower_shufflemask_zeroed:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr d18, [r0]
; CHECK-NEXT:    vorr d19, d18, d18
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vtrn.32 d19, d17
; CHECK-NEXT:    vdup.32 d16, d18[0]
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
entry:
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 0, i32 0, i32 1, i32 3>
  ret <4 x i32> %0
}

define void @vuzp_rev_shufflemask_vtrn(<2 x i32>* %A, <2 x i32>* %B, <4 x i32>* %C) {
; CHECK-LABEL: vuzp_rev_shufflemask_vtrn:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vtrn.32 d17, d16
; CHECK-NEXT:    vst1.64 {d16, d17}, [r2]
; CHECK-NEXT:    mov pc, lr
entry:
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
  store <4 x i32> %0, <4 x i32>* %C
  ret void
}

define <8 x i8> @cmpsel_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8 x i32> %cmp1) {
; To create the select we need to truncate the vcgt result from a vector of i32 to a
; vector of i8. This results in a build_vector with mismatched types, so we generate
; two vmovn.i32 instructions to truncate from i32 to i16 and one vmovn.i16 to perform
; the final truncation to i8.
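; Sketched as a chain of lane widths (an illustration of the CHECK lines below,
; not an extra assertion):
;   <8 x i32> --(2 x vmovn.i32)--> <8 x i16> --(vmovn.i16)--> <8 x i8>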
; CHECK-LABEL: cmpsel_trunc:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    add r12, sp, #16
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    mov r12, sp
; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
; CHECK-NEXT:    add r12, sp, #48
; CHECK-NEXT:    vld1.64 {d20, d21}, [r12]
; CHECK-NEXT:    add r12, sp, #32
; CHECK-NEXT:    vcgt.u32 q8, q10, q8
; CHECK-NEXT:    vld1.64 {d20, d21}, [r12]
; CHECK-NEXT:    vcgt.u32 q9, q10, q9
; CHECK-NEXT:    vmov d20, r2, r3
; CHECK-NEXT:    vmovn.i32 d17, q8
; CHECK-NEXT:    vmovn.i32 d16, q9
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    vmovn.i16 d16, q8
; CHECK-NEXT:    vbsl d16, d18, d20
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %c = icmp ult <8 x i32> %cmp0, %cmp1
  %res = select <8 x i1> %c, <8 x i8> %in0, <8 x i8> %in1
  ret <8 x i8> %res
}

; Shuffle the compare result with a <4 x i8>. The loaded <4 x i8> must be
; extended to <4 x i16>; otherwise we could not perform the vuzp to obtain
; the vbsl mask.
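; A sketch of the lowering visible in the CHECK lines: vmovl.u8 widens the
; loaded <4 x i8> to 16-bit lanes, vuzp.8 then packs the low bytes of the
; narrowed compare result and of the widened load into one d register, and
; vshl.i8/vshr.s8 by 7 sign-extend bit 0 of each byte into the 0x00/0xff
; mask that vbsl consumes.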
define <8 x i8> @vuzp_trunc_and_shuffle(<8 x i8> %tr0, <8 x i8> %tr1,
; CHECK-LABEL: vuzp_trunc_and_shuffle:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r11, lr}
; CHECK-NEXT:    push {r11, lr}
; CHECK-NEXT:    add r12, sp, #8
; CHECK-NEXT:    add lr, sp, #24
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    ldr r12, [sp, #40]
; CHECK-NEXT:    vld1.64 {d18, d19}, [lr]
; CHECK-NEXT:    vcgt.u32 q8, q9, q8
; CHECK-NEXT:    vld1.32 {d18[0]}, [r12:32]
; CHECK-NEXT:    vmovl.u8 q9, d18
; CHECK-NEXT:    vmovn.i32 d16, q8
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vuzp.8 d16, d18
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    vshl.i8 d16, d16, #7
; CHECK-NEXT:    vshr.s8 d16, d16, #7
; CHECK-NEXT:    vbsl d16, d18, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    pop {r11, lr}
; CHECK-NEXT:    mov pc, lr
                         <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
  %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
  %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
  %c0 = icmp ult <4 x i32> %cmp0, %cmp1
  %c = shufflevector <4 x i1> %c0, <4 x i1> %cmp2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
  ret <8 x i8> %rv
}

; Use an undef value for the <4 x i8> that is being shuffled with the compare
; result. This produces a build_vector in which some of the operands are undef.
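; In the CHECK lines below, the second vuzp.8 operand (d17) is effectively a
; don't-care value standing in for the undef half of the shuffle, so no second
; load or extension is needed before the mask is sign-extended.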
define <8 x i8> @vuzp_trunc_and_shuffle_undef_right(<8 x i8> %tr0, <8 x i8> %tr1,
; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_right:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r12, sp
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    add r12, sp, #16
; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
; CHECK-NEXT:    vcgt.u32 q8, q9, q8
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    vmovn.i32 d16, q8
; CHECK-NEXT:    vuzp.8 d16, d17
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vshl.i8 d16, d16, #7
; CHECK-NEXT:    vshr.s8 d16, d16, #7
; CHECK-NEXT:    vbsl d16, d18, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
                         <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
  %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
  %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
  %c0 = icmp ult <4 x i32> %cmp0, %cmp1
  %c = shufflevector <4 x i1> %c0, <4 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
  ret <8 x i8> %rv
}

define <8 x i8> @vuzp_trunc_and_shuffle_undef_left(<8 x i8> %tr0, <8 x i8> %tr1,
; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_left:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r12, sp
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    add r12, sp, #16
; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
; CHECK-NEXT:    vcgt.u32 q8, q9, q8
; CHECK-NEXT:    vldr d18, .LCPI22_0
; CHECK-NEXT:    vmovn.i32 d16, q8
; CHECK-NEXT:    vtbl.8 d16, {d16}, d18
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    vshl.i8 d16, d16, #7
; CHECK-NEXT:    vshr.s8 d16, d16, #7
; CHECK-NEXT:    vbsl d16, d18, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
; CHECK-NEXT:    .p2align 3
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI22_0:
; CHECK-NEXT:    .byte 255 @ 0xff
; CHECK-NEXT:    .byte 255 @ 0xff
; CHECK-NEXT:    .byte 255 @ 0xff
; CHECK-NEXT:    .byte 255 @ 0xff
; CHECK-NEXT:    .byte 0 @ 0x0
; CHECK-NEXT:    .byte 2 @ 0x2
; CHECK-NEXT:    .byte 4 @ 0x4
; CHECK-NEXT:    .byte 6 @ 0x6
                         <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
  %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
  %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
  %c0 = icmp ult <4 x i32> %cmp0, %cmp1
  %c = shufflevector <4 x i1> undef, <4 x i1> %c0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
  ret <8 x i8> %rv
}

; The types here are wide and not powers of two, so the vectors are padded with
; undef elements until they reach a vector size the backend can represent.
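; For example, judging from the pair of vcgt.u32 q-register compares in the
; CHECK lines below, the <5 x i32> operands are widened to <8 x i32> and the
; <10 x i8> result to <16 x i8>, with the extra lanes undef.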
define <10 x i8> @vuzp_wide_type(<10 x i8> %tr0, <10 x i8> %tr1,
; CHECK-LABEL: vuzp_wide_type:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    add r12, sp, #32
; CHECK-NEXT:    add lr, sp, #48
; CHECK-NEXT:    vld1.32 {d17[0]}, [r12:32]
; CHECK-NEXT:    add r12, sp, #24
; CHECK-NEXT:    vld1.32 {d16[0]}, [r12:32]
; CHECK-NEXT:    add r12, sp, #56
; CHECK-NEXT:    vld1.32 {d19[0]}, [r12:32]
; CHECK-NEXT:    vld1.32 {d18[0]}, [lr:32]
; CHECK-NEXT:    add lr, sp, #40
; CHECK-NEXT:    vld1.32 {d20[0]}, [lr:32]
; CHECK-NEXT:    ldr r12, [sp, #68]
; CHECK-NEXT:    ldr r4, [r12]
; CHECK-NEXT:    vmov.32 d23[0], r4
; CHECK-NEXT:    add r4, sp, #64
; CHECK-NEXT:    vld1.32 {d24[0]}, [r4:32]
; CHECK-NEXT:    add r4, sp, #36
; CHECK-NEXT:    vcgt.u32 q10, q12, q10
; CHECK-NEXT:    vld1.32 {d17[1]}, [r4:32]
; CHECK-NEXT:    add r4, sp, #28
; CHECK-NEXT:    vld1.32 {d16[1]}, [r4:32]
; CHECK-NEXT:    add r4, sp, #60
; CHECK-NEXT:    vld1.32 {d19[1]}, [r4:32]
; CHECK-NEXT:    add r4, sp, #52
; CHECK-NEXT:    vld1.32 {d18[1]}, [r4:32]
; CHECK-NEXT:    add r4, r12, #4
; CHECK-NEXT:    vcgt.u32 q8, q9, q8
; CHECK-NEXT:    vmovn.i32 d19, q10
; CHECK-NEXT:    vmov.u8 lr, d23[3]
; CHECK-NEXT:    vmovn.i32 d18, q8
; CHECK-NEXT:    vmovn.i16 d22, q9
; CHECK-NEXT:    vldr d18, .LCPI23_0
; CHECK-NEXT:    vmov.8 d17[0], lr
; CHECK-NEXT:    vtbl.8 d16, {d22, d23}, d18
; CHECK-NEXT:    vmov d19, r2, r3
; CHECK-NEXT:    vld1.8 {d17[1]}, [r4]
; CHECK-NEXT:    add r4, sp, #8
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    vshl.i8 q8, q8, #7
; CHECK-NEXT:    vld1.64 {d20, d21}, [r4]
; CHECK-NEXT:    vshr.s8 q8, q8, #7
; CHECK-NEXT:    vbsl q8, q9, q10
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    pop {r4, lr}
; CHECK-NEXT:    mov pc, lr
; CHECK-NEXT:    .p2align 3
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI23_0:
; CHECK-NEXT:    .byte 0 @ 0x0
; CHECK-NEXT:    .byte 1 @ 0x1
; CHECK-NEXT:    .byte 2 @ 0x2
; CHECK-NEXT:    .byte 3 @ 0x3
; CHECK-NEXT:    .byte 4 @ 0x4
; CHECK-NEXT:    .byte 8 @ 0x8
; CHECK-NEXT:    .byte 9 @ 0x9
; CHECK-NEXT:    .byte 10 @ 0xa
                            <5 x i32> %cmp0, <5 x i32> %cmp1, <5 x i8> *%cmp2_ptr) {
  %cmp2_load = load <5 x i8>, <5 x i8> * %cmp2_ptr, align 4
  %cmp2 = trunc <5 x i8> %cmp2_load to <5 x i1>
  %c0 = icmp ult <5 x i32> %cmp0, %cmp1
  %c = shufflevector <5 x i1> %c0, <5 x i1> %cmp2, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
  %rv = select <10 x i1> %c, <10 x i8> %tr0, <10 x i8> %tr1
  ret <10 x i8> %rv
}

%struct.uint8x8x2_t = type { [2 x <8 x i8>] }
define %struct.uint8x8x2_t @vuzp_extract_subvector(<16 x i8> %t) #0 {
; CHECK-LABEL: vuzp_extract_subvector:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d16, r2, r3
; CHECK-NEXT:    vmov d17, r0, r1
; CHECK-NEXT:    vuzp.8 d17, d16
; CHECK-NEXT:    vmov r0, r1, d17
; CHECK-NEXT:    vmov r2, r3, d16
; CHECK-NEXT:    mov pc, lr
  %vuzp.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %vuzp1.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0
  %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
  ret %struct.uint8x8x2_t %.fca.0.1.insert
}