1; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
2
; Transpose of two <8 x i8> vectors loaded from %A and %B.
; %tmp3 interleaves the even lanes of both inputs (0,8,2,10,...) and %tmp4 the
; odd lanes (1,9,3,11,...); the element-wise sum of the two is returned.
; Expected output: one vtrn.8 covering both shuffles, followed by vadd.i8.
3define <8 x i8> @vtrni8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
4; CHECK-LABEL: vtrni8:
5; CHECK:       @ BB#0:
6; CHECK-NEXT:    vldr d16, [r1]
7; CHECK-NEXT:    vldr d17, [r0]
8; CHECK-NEXT:    vtrn.8 d17, d16
9; CHECK-NEXT:    vadd.i8 d16, d17, d16
10; CHECK-NEXT:    vmov r0, r1, d16
11; CHECK-NEXT:    mov pc, lr
12	%tmp1 = load <8 x i8>, <8 x i8>* %A
13	%tmp2 = load <8 x i8>, <8 x i8>* %B
14	%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
15	%tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
16        %tmp5 = add <8 x i8> %tmp3, %tmp4
17	ret <8 x i8> %tmp5
18}
19
; A single shufflevector builds a <16 x i8> result from two <8 x i8> inputs:
; the even-lane interleave (0,8,2,10,...) followed by the odd-lane interleave
; (1,9,3,11,...).
; Expected output: one vtrn.8 whose two D registers are moved out through
; r0-r3. Register numbers are captured as LDR0/LDR1 instead of being hard-coded.
20define <16 x i8> @vtrni8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
21; CHECK-LABEL: vtrni8_Qres:
22; CHECK:       @ BB#0:
23; CHECK-NEXT:    vldr [[LDR1:d[0-9]+]], [r1]
24; CHECK-NEXT:    vldr [[LDR0:d[0-9]+]], [r0]
25; CHECK-NEXT:    vtrn.8 [[LDR0]], [[LDR1]]
26; CHECK-NEXT:    vmov r0, r1, [[LDR0]]
27; CHECK-NEXT:    vmov r2, r3, [[LDR1]]
28; CHECK-NEXT:    mov pc, lr
29	%tmp1 = load <8 x i8>, <8 x i8>* %A
30	%tmp2 = load <8 x i8>, <8 x i8>* %B
31	%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14, i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
32	ret <16 x i8> %tmp3
33}
34
; Transpose of two <4 x i16> vectors loaded from %A and %B.
; %tmp3 interleaves the even lanes (0,4,2,6) and %tmp4 the odd lanes (1,5,3,7);
; the element-wise sum of the two is returned.
; Expected output: one vtrn.16 covering both shuffles, followed by vadd.i16.
35define <4 x i16> @vtrni16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
36; CHECK-LABEL: vtrni16:
37; CHECK:       @ BB#0:
38; CHECK-NEXT:    vldr d16, [r1]
39; CHECK-NEXT:    vldr d17, [r0]
40; CHECK-NEXT:    vtrn.16 d17, d16
41; CHECK-NEXT:    vadd.i16 d16, d17, d16
42; CHECK-NEXT:    vmov r0, r1, d16
43; CHECK-NEXT:    mov pc, lr
44	%tmp1 = load <4 x i16>, <4 x i16>* %A
45	%tmp2 = load <4 x i16>, <4 x i16>* %B
46	%tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
47	%tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
48        %tmp5 = add <4 x i16> %tmp3, %tmp4
49	ret <4 x i16> %tmp5
50}
51
; A single shufflevector builds a <8 x i16> result from two <4 x i16> inputs:
; the even-lane interleave (0,4,2,6) followed by the odd-lane interleave
; (1,5,3,7).
; Expected output: one vtrn.16 whose two D registers are moved out through
; r0-r3. Register numbers are captured as LDR0/LDR1 instead of being hard-coded.
52define <8 x i16> @vtrni16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
53; CHECK-LABEL: vtrni16_Qres:
54; CHECK:       @ BB#0:
55; CHECK-NEXT:    vldr [[LDR1:d[0-9]+]], [r1]
56; CHECK-NEXT:    vldr [[LDR0:d[0-9]+]], [r0]
57; CHECK-NEXT:    vtrn.16 [[LDR0]], [[LDR1]]
58; CHECK-NEXT:    vmov r0, r1, [[LDR0]]
59; CHECK-NEXT:    vmov r2, r3, [[LDR1]]
60; CHECK-NEXT:    mov pc, lr
61	%tmp1 = load <4 x i16>, <4 x i16>* %A
62	%tmp2 = load <4 x i16>, <4 x i16>* %B
63	%tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 4, i32 2, i32 6, i32 1, i32 5, i32 3, i32 7>
64	ret <8 x i16> %tmp3
65}
66
; Transpose of two <2 x i32> vectors loaded from %A and %B.
; %tmp3 takes lane 0 of each input (mask 0,2) and %tmp4 lane 1 of each input
; (mask 1,3); the element-wise sum of the two is returned.
; Expected output: one vtrn.32 covering both shuffles, followed by vadd.i32.
67define <2 x i32> @vtrni32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
68; CHECK-LABEL: vtrni32:
69; CHECK:       @ BB#0:
70; CHECK-NEXT:    vldr d16, [r1]
71; CHECK-NEXT:    vldr d17, [r0]
72; CHECK-NEXT:    vtrn.32 d17, d16
73; CHECK-NEXT:    vadd.i32 d16, d17, d16
74; CHECK-NEXT:    vmov r0, r1, d16
75; CHECK-NEXT:    mov pc, lr
76	%tmp1 = load <2 x i32>, <2 x i32>* %A
77	%tmp2 = load <2 x i32>, <2 x i32>* %B
78	%tmp3 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> <i32 0, i32 2>
79	%tmp4 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 3>
80        %tmp5 = add <2 x i32> %tmp3, %tmp4
81	ret <2 x i32> %tmp5
82}
83
; A single shufflevector builds a <4 x i32> result from two <2 x i32> inputs
; with mask 0,2,1,3: lane 0 of each input, then lane 1 of each input.
; Expected output: one vtrn.32 whose two D registers are moved out through
; r0-r3. Register numbers are captured as LDR0/LDR1 instead of being hard-coded.
84define <4 x i32> @vtrni32_Qres(<2 x i32>* %A, <2 x i32>* %B) nounwind {
85; CHECK-LABEL: vtrni32_Qres:
86; CHECK:       @ BB#0:
87; CHECK-NEXT:    vldr [[LDR1:d[0-9]+]], [r1]
88; CHECK-NEXT:    vldr [[LDR0:d[0-9]+]], [r0]
89; CHECK-NEXT:    vtrn.32 [[LDR0]], [[LDR1]]
90; CHECK-NEXT:    vmov r0, r1, [[LDR0]]
91; CHECK-NEXT:    vmov r2, r3, [[LDR1]]
92; CHECK-NEXT:    mov pc, lr
93	%tmp1 = load <2 x i32>, <2 x i32>* %A
94	%tmp2 = load <2 x i32>, <2 x i32>* %B
95	%tmp3 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
96	ret <4 x i32> %tmp3
97}
98
; Floating-point variant: transpose of two <2 x float> vectors loaded from %A
; and %B. %tmp3 takes lane 0 of each input (mask 0,2), %tmp4 lane 1 of each
; input (mask 1,3); the two are combined with fadd and returned.
; Expected output: one vtrn.32 covering both shuffles, followed by vadd.f32.
99define <2 x float> @vtrnf(<2 x float>* %A, <2 x float>* %B) nounwind {
100; CHECK-LABEL: vtrnf:
101; CHECK:       @ BB#0:
102; CHECK-NEXT:    vldr d16, [r1]
103; CHECK-NEXT:    vldr d17, [r0]
104; CHECK-NEXT:    vtrn.32 d17, d16
105; CHECK-NEXT:    vadd.f32 d16, d17, d16
106; CHECK-NEXT:    vmov r0, r1, d16
107; CHECK-NEXT:    mov pc, lr
108	%tmp1 = load <2 x float>, <2 x float>* %A
109	%tmp2 = load <2 x float>, <2 x float>* %B
110	%tmp3 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <2 x i32> <i32 0, i32 2>
111	%tmp4 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <2 x i32> <i32 1, i32 3>
112        %tmp5 = fadd <2 x float> %tmp3, %tmp4
113	ret <2 x float> %tmp5
114}
115
; A single shufflevector builds a <4 x float> result from two <2 x float>
; inputs with mask 0,2,1,3: lane 0 of each input, then lane 1 of each input.
; Expected output: one vtrn.32 whose two D registers are moved out through
; r0-r3. Register numbers are captured as LDR0/LDR1 instead of being hard-coded.
116define <4 x float> @vtrnf_Qres(<2 x float>* %A, <2 x float>* %B) nounwind {
117; CHECK-LABEL: vtrnf_Qres:
118; CHECK:       @ BB#0:
119; CHECK-NEXT:    vldr [[LDR1:d[0-9]+]], [r1]
120; CHECK-NEXT:    vldr [[LDR0:d[0-9]+]], [r0]
121; CHECK-NEXT:    vtrn.32 [[LDR0]], [[LDR1]]
122; CHECK-NEXT:    vmov r0, r1, [[LDR0]]
123; CHECK-NEXT:    vmov r2, r3, [[LDR1]]
124; CHECK-NEXT:    mov pc, lr
125	%tmp1 = load <2 x float>, <2 x float>* %A
126	%tmp2 = load <2 x float>, <2 x float>* %B
127	%tmp3 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
128	ret <4 x float> %tmp3
129}
130
; 128-bit variant: transpose of two <16 x i8> vectors loaded from %A and %B.
; %tmp3 interleaves the even lanes of both inputs (0,16,2,18,...) and %tmp4 the
; odd lanes (1,17,3,19,...); the element-wise sum of the two is returned.
; Expected output: one vtrn.8 on Q registers, followed by vadd.i8.
131define <16 x i8> @vtrnQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
132; CHECK-LABEL: vtrnQi8:
133; CHECK:       @ BB#0:
134; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
135; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
136; CHECK-NEXT:    vtrn.8 q9, q8
137; CHECK-NEXT:    vadd.i8 q8, q9, q8
138; CHECK-NEXT:    vmov r0, r1, d16
139; CHECK-NEXT:    vmov r2, r3, d17
140; CHECK-NEXT:    mov pc, lr
141	%tmp1 = load <16 x i8>, <16 x i8>* %A
142	%tmp2 = load <16 x i8>, <16 x i8>* %B
143	%tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
144	%tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
145        %tmp5 = add <16 x i8> %tmp3, %tmp4
146	ret <16 x i8> %tmp5
147}
148
; A single shufflevector builds a <32 x i8> result from two <16 x i8> inputs:
; the even-lane interleave followed by the odd-lane interleave.
; Expected output: the inputs are loaded through r1 and r2, one vtrn.8 is
; applied to Q registers, and both halves are stored through r0 with a
; post-incrementing first store. NOTE(review): r0 is presumably the pointer
; for the indirectly returned wide vector - confirm against the calling
; convention.
149define <32 x i8> @vtrnQi8_QQres(<16 x i8>* %A, <16 x i8>* %B) nounwind {
150; CHECK-LABEL: vtrnQi8_QQres:
151; CHECK:       @ BB#0:
152; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
153; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
154; CHECK-NEXT:    vtrn.8 q9, q8
155; CHECK-NEXT:    vst1.8 {d18, d19}, [r0:128]!
156; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
157; CHECK-NEXT:    mov pc, lr
158	%tmp1 = load <16 x i8>, <16 x i8>* %A
159	%tmp2 = load <16 x i8>, <16 x i8>* %B
160	%tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <32 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30, i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
161	ret <32 x i8> %tmp3
162}
163
; 128-bit variant: transpose of two <8 x i16> vectors loaded from %A and %B.
; %tmp3 interleaves the even lanes of both inputs (0,8,2,10,...) and %tmp4 the
; odd lanes (1,9,3,11,...); the element-wise sum of the two is returned.
; Expected output: one vtrn.16 on Q registers, followed by vadd.i16.
164define <8 x i16> @vtrnQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
165; CHECK-LABEL: vtrnQi16:
166; CHECK:       @ BB#0:
167; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
168; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
169; CHECK-NEXT:    vtrn.16 q9, q8
170; CHECK-NEXT:    vadd.i16 q8, q9, q8
171; CHECK-NEXT:    vmov r0, r1, d16
172; CHECK-NEXT:    vmov r2, r3, d17
173; CHECK-NEXT:    mov pc, lr
174	%tmp1 = load <8 x i16>, <8 x i16>* %A
175	%tmp2 = load <8 x i16>, <8 x i16>* %B
176	%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
177	%tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
178        %tmp5 = add <8 x i16> %tmp3, %tmp4
179	ret <8 x i16> %tmp5
180}
181
; A single shufflevector builds a <16 x i16> result from two <8 x i16> inputs:
; the even-lane interleave followed by the odd-lane interleave.
; Expected output: the inputs are loaded through r1 and r2, one vtrn.16 is
; applied to Q registers, and both halves are stored through r0 with a
; post-incrementing first store. NOTE(review): r0 is presumably the pointer
; for the indirectly returned wide vector - confirm against the calling
; convention.
182define <16 x i16> @vtrnQi16_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
183; CHECK-LABEL: vtrnQi16_QQres:
184; CHECK:       @ BB#0:
185; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
186; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
187; CHECK-NEXT:    vtrn.16 q9, q8
188; CHECK-NEXT:    vst1.16 {d18, d19}, [r0:128]!
189; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
190; CHECK-NEXT:    mov pc, lr
191	%tmp1 = load <8 x i16>, <8 x i16>* %A
192	%tmp2 = load <8 x i16>, <8 x i16>* %B
193	%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14, i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
194	ret <16 x i16> %tmp3
195}
196
; 128-bit variant: transpose of two <4 x i32> vectors loaded from %A and %B.
; %tmp3 interleaves the even lanes (0,4,2,6) and %tmp4 the odd lanes (1,5,3,7);
; the element-wise sum of the two is returned.
; Expected output: one vtrn.32 on Q registers, followed by vadd.i32.
197define <4 x i32> @vtrnQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
198; CHECK-LABEL: vtrnQi32:
199; CHECK:       @ BB#0:
200; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
201; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
202; CHECK-NEXT:    vtrn.32 q9, q8
203; CHECK-NEXT:    vadd.i32 q8, q9, q8
204; CHECK-NEXT:    vmov r0, r1, d16
205; CHECK-NEXT:    vmov r2, r3, d17
206; CHECK-NEXT:    mov pc, lr
207	%tmp1 = load <4 x i32>, <4 x i32>* %A
208	%tmp2 = load <4 x i32>, <4 x i32>* %B
209	%tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
210	%tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
211        %tmp5 = add <4 x i32> %tmp3, %tmp4
212	ret <4 x i32> %tmp5
213}
214
; A single shufflevector builds a <8 x i32> result from two <4 x i32> inputs:
; the even-lane interleave (0,4,2,6) followed by the odd-lane interleave
; (1,5,3,7).
; Expected output: the inputs are loaded through r1 and r2, one vtrn.32 is
; applied to Q registers, and both halves are stored through r0 with a
; post-incrementing first store. NOTE(review): r0 is presumably the pointer
; for the indirectly returned wide vector - confirm against the calling
; convention.
215define <8 x i32> @vtrnQi32_QQres(<4 x i32>* %A, <4 x i32>* %B) nounwind {
216; CHECK-LABEL: vtrnQi32_QQres:
217; CHECK:       @ BB#0:
218; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
219; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
220; CHECK-NEXT:    vtrn.32 q9, q8
221; CHECK-NEXT:    vst1.32 {d18, d19}, [r0:128]!
222; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
223; CHECK-NEXT:    mov pc, lr
224	%tmp1 = load <4 x i32>, <4 x i32>* %A
225	%tmp2 = load <4 x i32>, <4 x i32>* %B
226	%tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <8 x i32> <i32 0, i32 4, i32 2, i32 6, i32 1, i32 5, i32 3, i32 7>
227	ret <8 x i32> %tmp3
228}
229
; 128-bit floating-point variant: transpose of two <4 x float> vectors loaded
; from %A and %B. %tmp3 interleaves the even lanes (0,4,2,6) and %tmp4 the odd
; lanes (1,5,3,7); the two are combined with fadd and returned.
; Expected output: one vtrn.32 on Q registers, followed by vadd.f32.
230define <4 x float> @vtrnQf(<4 x float>* %A, <4 x float>* %B) nounwind {
231; CHECK-LABEL: vtrnQf:
232; CHECK:       @ BB#0:
233; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
234; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
235; CHECK-NEXT:    vtrn.32 q9, q8
236; CHECK-NEXT:    vadd.f32 q8, q9, q8
237; CHECK-NEXT:    vmov r0, r1, d16
238; CHECK-NEXT:    vmov r2, r3, d17
239; CHECK-NEXT:    mov pc, lr
240	%tmp1 = load <4 x float>, <4 x float>* %A
241	%tmp2 = load <4 x float>, <4 x float>* %B
242	%tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
243	%tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
244        %tmp5 = fadd <4 x float> %tmp3, %tmp4
245	ret <4 x float> %tmp5
246}
247
; A single shufflevector builds a <8 x float> result from two <4 x float>
; inputs: the even-lane interleave (0,4,2,6) followed by the odd-lane
; interleave (1,5,3,7).
; Expected output: the inputs are loaded through r1 and r2, one vtrn.32 is
; applied to Q registers, and both halves are stored through r0 with a
; post-incrementing first store. NOTE(review): r0 is presumably the pointer
; for the indirectly returned wide vector - confirm against the calling
; convention.
248define <8 x float> @vtrnQf_QQres(<4 x float>* %A, <4 x float>* %B) nounwind {
249; CHECK-LABEL: vtrnQf_QQres:
250; CHECK:       @ BB#0:
251; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
252; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
253; CHECK-NEXT:    vtrn.32 q9, q8
254; CHECK-NEXT:    vst1.32 {d18, d19}, [r0:128]!
255; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
256; CHECK-NEXT:    mov pc, lr
257	%tmp1 = load <4 x float>, <4 x float>* %A
258	%tmp2 = load <4 x float>, <4 x float>* %B
259	%tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <8 x i32> <i32 0, i32 4, i32 2, i32 6, i32 1, i32 5, i32 3, i32 7>
260	ret <8 x float> %tmp3
261}
262
263
; Transpose of two <8 x i8> vectors where some mask elements are undef.
; The defined elements of %tmp3 still follow the even-lane interleave pattern
; (0,_,2,10,_,12,6,14) and those of %tmp4 the odd-lane pattern
; (1,9,3,11,5,_,_,15).
; Expected output: the undef lanes do not prevent matching; one vtrn.8 followed
; by vadd.i8 is still expected.
264define <8 x i8> @vtrni8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
265; CHECK-LABEL: vtrni8_undef:
266; CHECK:       @ BB#0:
267; CHECK-NEXT:    vldr d16, [r1]
268; CHECK-NEXT:    vldr d17, [r0]
269; CHECK-NEXT:    vtrn.8 d17, d16
270; CHECK-NEXT:    vadd.i8 d16, d17, d16
271; CHECK-NEXT:    vmov r0, r1, d16
272; CHECK-NEXT:    mov pc, lr
273	%tmp1 = load <8 x i8>, <8 x i8>* %A
274	%tmp2 = load <8 x i8>, <8 x i8>* %B
275	%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 undef, i32 2, i32 10, i32 undef, i32 12, i32 6, i32 14>
276	%tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 undef, i32 undef, i32 15>
277        %tmp5 = add <8 x i8> %tmp3, %tmp4
278	ret <8 x i8> %tmp5
279}
280
; A single 16-element shufflevector over two <8 x i8> inputs whose defined
; elements follow the even-lane interleave in the first half and the odd-lane
; interleave in the second half; several elements are undef.
; Expected output: one vtrn.8 whose two D registers are moved out through
; r0-r3. Register numbers are captured as LDR0/LDR1 instead of being hard-coded.
281define <16 x i8> @vtrni8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
282; CHECK-LABEL: vtrni8_undef_Qres:
283; CHECK:       @ BB#0:
284; CHECK-NEXT:    vldr [[LDR1:d[0-9]+]], [r1]
285; CHECK-NEXT:    vldr [[LDR0:d[0-9]+]], [r0]
286; CHECK-NEXT:    vtrn.8 [[LDR0]], [[LDR1]]
287; CHECK-NEXT:    vmov r0, r1, [[LDR0]]
288; CHECK-NEXT:    vmov r2, r3, [[LDR1]]
289; CHECK-NEXT:    mov pc, lr
290	%tmp1 = load <8 x i8>, <8 x i8>* %A
291	%tmp2 = load <8 x i8>, <8 x i8>* %B
292	%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 undef, i32 2, i32 10, i32 undef, i32 12, i32 6, i32 14, i32 1, i32 9, i32 3, i32 11, i32 5, i32 undef, i32 undef, i32 15>
293	ret <16 x i8> %tmp3
294}
295
; 128-bit transpose of two <8 x i16> vectors where some mask elements are
; undef. The defined elements of %tmp3 follow the even-lane interleave pattern
; (0,8,_,_,4,12,6,14) and those of %tmp4 the odd-lane pattern
; (1,_,3,11,5,13,_,_).
; Expected output: one vtrn.16 on Q registers followed by vadd.i16, despite
; the undef lanes.
296define <8 x i16> @vtrnQi16_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
297; CHECK-LABEL: vtrnQi16_undef:
298; CHECK:       @ BB#0:
299; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
300; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
301; CHECK-NEXT:    vtrn.16 q9, q8
302; CHECK-NEXT:    vadd.i16 q8, q9, q8
303; CHECK-NEXT:    vmov r0, r1, d16
304; CHECK-NEXT:    vmov r2, r3, d17
305; CHECK-NEXT:    mov pc, lr
306	%tmp1 = load <8 x i16>, <8 x i16>* %A
307	%tmp2 = load <8 x i16>, <8 x i16>* %B
308	%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 6, i32 14>
309	%tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 undef, i32 3, i32 11, i32 5, i32 13, i32 undef, i32 undef>
310        %tmp5 = add <8 x i16> %tmp3, %tmp4
311	ret <8 x i16> %tmp5
312}
313
; A single 16-element shufflevector over two <8 x i16> inputs whose defined
; elements follow the even-lane interleave in the first half and the odd-lane
; interleave in the second half; several elements are undef.
; Expected output: the inputs are loaded through r1 and r2, one vtrn.16 is
; applied to Q registers, and both halves are stored through r0 with a
; post-incrementing first store. NOTE(review): r0 is presumably the pointer
; for the indirectly returned wide vector - confirm against the calling
; convention.
314define <16 x i16> @vtrnQi16_undef_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
315; CHECK-LABEL: vtrnQi16_undef_QQres:
316; CHECK:       @ BB#0:
317; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
318; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
319; CHECK-NEXT:    vtrn.16 q9, q8
320; CHECK-NEXT:    vst1.16 {d18, d19}, [r0:128]!
321; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
322; CHECK-NEXT:    mov pc, lr
323	%tmp1 = load <8 x i16>, <8 x i16>* %A
324	%tmp2 = load <8 x i16>, <8 x i16>* %B
325	%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 6, i32 14, i32 1, i32 undef, i32 3, i32 11, i32 5, i32 13, i32 undef, i32 undef>
326	ret <16 x i16> %tmp3
327}
328
; Only the upper four elements of the <8 x i16> result are defined (mask
; 1,5,3,7, the odd-lane interleave of the two <4 x i16> inputs); the lower four
; are undef. The shuffle is still expected to lower to a 16-bit vtrn.
;
; The label pattern ends in ':' and the instruction pattern includes the ".16"
; size suffix because the function name itself contains "vtrn": a bare "vtrn"
; pattern would be satisfied by the symbol-name lines of the output even if no
; transpose instruction were emitted.
329define <8 x i16> @vtrn_lower_shufflemask_undef(<4 x i16>* %A, <4 x i16>* %B) {
330entry:
331  ; CHECK-LABEL: vtrn_lower_shufflemask_undef:
332  ; CHECK: vtrn.16
333	%tmp1 = load <4 x i16>, <4 x i16>* %A
334	%tmp2 = load <4 x i16>, <4 x i16>* %B
335  %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 5, i32 3, i32 7>
336  ret <8 x i16> %0
337}
338
339; Here we get a build_vector node, where all the incoming extract_element
340; values do modify the type. However, we get different input types, as some of
341; them get truncated from i32 to i8 (from comparing cmp0 with cmp1) and some of
342; them get truncated from i16 to i8 (from comparing cmp2 with cmp3).
343define <8 x i8> @vtrn_mismatched_builvector0(<8 x i8> %tr0, <8 x i8> %tr1,
344                                             <4 x i32> %cmp0, <4 x i32> %cmp1,
345                                             <4 x i16> %cmp2, <4 x i16> %cmp3) {
346  ; CHECK-LABEL: vtrn_mismatched_builvector0
347  ; CHECK: vmovn.i32
348  ; CHECK: vtrn
349  ; CHECK: vbsl
  ; The patterns above are unanchored, so only the relative order of the
  ; narrowing move, the transpose and the bitwise select is verified.
  ; %c0 is a <4 x i1> produced by a 32-bit compare, %c1 a <4 x i1> produced by
  ; a 16-bit compare.
350  %c0 = icmp ult <4 x i32> %cmp0, %cmp1
351  %c1 = icmp ult <4 x i16> %cmp2, %cmp3
  ; Mask 0,4,1,5,2,6,3,7 alternates elements of %c0 and %c1 to form the
  ; <8 x i1> condition used by the select below.
352  %c = shufflevector <4 x i1> %c0, <4 x i1> %c1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
353  %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
354  ret <8 x i8> %rv
355}
356
357; Here we get a build_vector node, where half the incoming extract_element
358; values do not modify the type (the values from cmp2), but half of them do
359; (from the icmp operation).
360define <8 x i8> @vtrn_mismatched_builvector1(<8 x i8> %tr0, <8 x i8> %tr1,
361                           <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
362  ; CHECK-LABEL: vtrn_mismatched_builvector1
363  ; We need to extend the 4 x i8 to 4 x i16 in order to perform the vtrn
364  ; CHECK: vmovl
365  ; CHECK: vtrn.8
366  ; CHECK: vbsl
  ; The patterns above are unanchored, so only the relative order of the
  ; widening move, the 8-bit transpose and the bitwise select is verified.
  ; %cmp2 is a <4 x i1> obtained by truncating a loaded <4 x i8>; %c0 is a
  ; <4 x i1> produced by a 32-bit compare.
367  %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
368  %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
369  %c0 = icmp ult <4 x i32> %cmp0, %cmp1
  ; Mask 0,4,1,5,2,6,3,7 alternates elements of %c0 and %cmp2 to form the
  ; <8 x i1> condition used by the select below.
370  %c = shufflevector <4 x i1> %c0, <4 x i1> %cmp2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
371  %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
372  ret <8 x i8> %rv
373}
374
375; Negative test that should not generate a vtrn
376define void @lower_twice_no_vtrn(<4 x i16>* %A, <4 x i16>* %B, <8 x i16>* %C) {
377entry:
378  ; CHECK-LABEL: lower_twice_no_vtrn
379  ; CHECK: @ BB#0:
380  ; CHECK-NOT: vtrn
381  ; CHECK: mov pc, lr
  ; The negative pattern is bracketed by the basic-block marker and the return
  ; instruction; the function name itself contains "vtrn", so presumably the
  ; bracket keeps symbol-name lines out of the searched region.
382  %tmp1 = load <4 x i16>, <4 x i16>* %A
383  %tmp2 = load <4 x i16>, <4 x i16>* %B
  ; Both halves of the mask use the odd-lane pattern (_,5,3,7 and 1,5,3,7),
  ; i.e. the same transpose result twice rather than the even/odd pair. The
  ; <8 x i16> result is stored to %C.
384  %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 5, i32 3, i32 7, i32 1, i32 5, i32 3, i32 7>
385  store <8 x i16> %0, <8 x i16>* %C
386  ret void
387}
388
389; Negative test that should not generate a vtrn
390define void @upper_twice_no_vtrn(<4 x i16>* %A, <4 x i16>* %B, <8 x i16>* %C) {
391entry:
392  ; CHECK-LABEL: upper_twice_no_vtrn
393  ; CHECK: @ BB#0:
394  ; CHECK-NOT: vtrn
395  ; CHECK: mov pc, lr
  ; The negative pattern is bracketed by the basic-block marker and the return
  ; instruction; the function name itself contains "vtrn", so presumably the
  ; bracket keeps symbol-name lines out of the searched region.
396  %tmp1 = load <4 x i16>, <4 x i16>* %A
397  %tmp2 = load <4 x i16>, <4 x i16>* %B
  ; Both halves of the mask use the even-lane pattern (0,_,2,6 and 0,4,2,6),
  ; i.e. the same transpose result twice rather than the even/odd pair. The
  ; <8 x i16> result is stored to %C.
398  %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 undef, i32 2, i32 6, i32 0, i32 4, i32 2, i32 6>
399  store <8 x i16> %0, <8 x i16>* %C
400  ret void
401}
402