1; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
2
3define <8 x i8> @addhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
4;CHECK-LABEL: addhn8b:
5;CHECK: addhn.8b
6        %tmp1 = load <8 x i16>, <8 x i16>* %A
7        %tmp2 = load <8 x i16>, <8 x i16>* %B
8        %tmp3 = call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
9        ret <8 x i8> %tmp3
10}
11
12define <4 x i16> @addhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
13;CHECK-LABEL: addhn4h:
14;CHECK: addhn.4h
15        %tmp1 = load <4 x i32>, <4 x i32>* %A
16        %tmp2 = load <4 x i32>, <4 x i32>* %B
17        %tmp3 = call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
18        ret <4 x i16> %tmp3
19}
20
21define <2 x i32> @addhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
22;CHECK-LABEL: addhn2s:
23;CHECK: addhn.2s
24        %tmp1 = load <2 x i64>, <2 x i64>* %A
25        %tmp2 = load <2 x i64>, <2 x i64>* %B
26        %tmp3 = call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
27        ret <2 x i32> %tmp3
28}
29
30define <16 x i8> @addhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
31;CHECK-LABEL: addhn2_16b:
32;CHECK: addhn.8b
33;CHECK-NEXT: addhn2.16b
34  %vaddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
35  %vaddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
36  %res = shufflevector <8 x i8> %vaddhn2.i, <8 x i8> %vaddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
37  ret <16 x i8> %res
38}
39
40define <8 x i16> @addhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
41;CHECK-LABEL: addhn2_8h:
42;CHECK: addhn.4h
43;CHECK-NEXT: addhn2.8h
44  %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
45  %vaddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
46  %res = shufflevector <4 x i16> %vaddhn2.i, <4 x i16> %vaddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
47  ret <8 x i16> %res
48}
49
50define <4 x i32> @addhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
51;CHECK-LABEL: addhn2_4s:
52;CHECK: addhn.2s
53;CHECK-NEXT: addhn2.4s
54  %vaddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
55  %vaddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
56  %res = shufflevector <2 x i32> %vaddhn2.i, <2 x i32> %vaddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
57  ret <4 x i32> %res
58}
59
60declare <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
61declare <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
62declare <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
63
64
65define <8 x i8> @raddhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
66;CHECK-LABEL: raddhn8b:
67;CHECK: raddhn.8b
68        %tmp1 = load <8 x i16>, <8 x i16>* %A
69        %tmp2 = load <8 x i16>, <8 x i16>* %B
70        %tmp3 = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
71        ret <8 x i8> %tmp3
72}
73
74define <4 x i16> @raddhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
75;CHECK-LABEL: raddhn4h:
76;CHECK: raddhn.4h
77        %tmp1 = load <4 x i32>, <4 x i32>* %A
78        %tmp2 = load <4 x i32>, <4 x i32>* %B
79        %tmp3 = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
80        ret <4 x i16> %tmp3
81}
82
83define <2 x i32> @raddhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
84;CHECK-LABEL: raddhn2s:
85;CHECK: raddhn.2s
86        %tmp1 = load <2 x i64>, <2 x i64>* %A
87        %tmp2 = load <2 x i64>, <2 x i64>* %B
88        %tmp3 = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
89        ret <2 x i32> %tmp3
90}
91
92define <16 x i8> @raddhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
93;CHECK-LABEL: raddhn2_16b:
94;CHECK: raddhn.8b
95;CHECK-NEXT: raddhn2.16b
96  %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
97  %vraddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
98  %res = shufflevector <8 x i8> %vraddhn2.i, <8 x i8> %vraddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
99  ret <16 x i8> %res
100}
101
102define <8 x i16> @raddhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
103;CHECK-LABEL: raddhn2_8h:
104;CHECK: raddhn.4h
105;CHECK-NEXT: raddhn2.8h
106  %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
107  %vraddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
108  %res = shufflevector <4 x i16> %vraddhn2.i, <4 x i16> %vraddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
109  ret <8 x i16> %res
110}
111
112define <4 x i32> @raddhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
113;CHECK-LABEL: raddhn2_4s:
114;CHECK: raddhn.2s
115;CHECK-NEXT: raddhn2.4s
116  %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
117  %vraddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
118  %res = shufflevector <2 x i32> %vraddhn2.i, <2 x i32> %vraddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
119  ret <4 x i32> %res
120}
121
122declare <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
123declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
124declare <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
125
126define <8 x i16> @saddl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
127;CHECK-LABEL: saddl8h:
128;CHECK: saddl.8h
129        %tmp1 = load <8 x i8>, <8 x i8>* %A
130        %tmp2 = load <8 x i8>, <8 x i8>* %B
131  %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
132  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
133  %tmp5 = add <8 x i16> %tmp3, %tmp4
134        ret <8 x i16> %tmp5
135}
136
137define <4 x i32> @saddl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
138;CHECK-LABEL: saddl4s:
139;CHECK: saddl.4s
140        %tmp1 = load <4 x i16>, <4 x i16>* %A
141        %tmp2 = load <4 x i16>, <4 x i16>* %B
142  %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
143  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
144  %tmp5 = add <4 x i32> %tmp3, %tmp4
145        ret <4 x i32> %tmp5
146}
147
148define <2 x i64> @saddl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
149;CHECK-LABEL: saddl2d:
150;CHECK: saddl.2d
151        %tmp1 = load <2 x i32>, <2 x i32>* %A
152        %tmp2 = load <2 x i32>, <2 x i32>* %B
153  %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
154  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
155  %tmp5 = add <2 x i64> %tmp3, %tmp4
156        ret <2 x i64> %tmp5
157}
158
159define <8 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind  {
160; CHECK-LABEL: saddl2_8h:
161; CHECK-NEXT: saddl2.8h v0, v0, v1
162; CHECK-NEXT: ret
163  %tmp = bitcast <16 x i8> %a to <2 x i64>
164  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
165  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
166  %vmovl.i.i.i = sext <8 x i8> %tmp1 to <8 x i16>
167  %tmp2 = bitcast <16 x i8> %b to <2 x i64>
168  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
169  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <8 x i8>
170  %vmovl.i.i5.i = sext <8 x i8> %tmp3 to <8 x i16>
171  %add.i = add <8 x i16> %vmovl.i.i.i, %vmovl.i.i5.i
172  ret <8 x i16> %add.i
173}
174
175define <4 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind  {
176; CHECK-LABEL: saddl2_4s:
177; CHECK-NEXT: saddl2.4s v0, v0, v1
178; CHECK-NEXT: ret
179  %tmp = bitcast <8 x i16> %a to <2 x i64>
180  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
181  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
182  %vmovl.i.i.i = sext <4 x i16> %tmp1 to <4 x i32>
183  %tmp2 = bitcast <8 x i16> %b to <2 x i64>
184  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
185  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <4 x i16>
186  %vmovl.i.i5.i = sext <4 x i16> %tmp3 to <4 x i32>
187  %add.i = add <4 x i32> %vmovl.i.i.i, %vmovl.i.i5.i
188  ret <4 x i32> %add.i
189}
190
191define <2 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind  {
192; CHECK-LABEL: saddl2_2d:
193; CHECK-NEXT: saddl2.2d v0, v0, v1
194; CHECK-NEXT: ret
195  %tmp = bitcast <4 x i32> %a to <2 x i64>
196  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
197  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
198  %vmovl.i.i.i = sext <2 x i32> %tmp1 to <2 x i64>
199  %tmp2 = bitcast <4 x i32> %b to <2 x i64>
200  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
201  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <2 x i32>
202  %vmovl.i.i5.i = sext <2 x i32> %tmp3 to <2 x i64>
203  %add.i = add <2 x i64> %vmovl.i.i.i, %vmovl.i.i5.i
204  ret <2 x i64> %add.i
205}
206
207define <8 x i16> @uaddl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
208;CHECK-LABEL: uaddl8h:
209;CHECK: uaddl.8h
210  %tmp1 = load <8 x i8>, <8 x i8>* %A
211  %tmp2 = load <8 x i8>, <8 x i8>* %B
212  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
213  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
214  %tmp5 = add <8 x i16> %tmp3, %tmp4
215  ret <8 x i16> %tmp5
216}
217
218define <4 x i32> @uaddl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
219;CHECK-LABEL: uaddl4s:
220;CHECK: uaddl.4s
221  %tmp1 = load <4 x i16>, <4 x i16>* %A
222  %tmp2 = load <4 x i16>, <4 x i16>* %B
223  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
224  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
225  %tmp5 = add <4 x i32> %tmp3, %tmp4
226  ret <4 x i32> %tmp5
227}
228
229define <2 x i64> @uaddl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
230;CHECK-LABEL: uaddl2d:
231;CHECK: uaddl.2d
232  %tmp1 = load <2 x i32>, <2 x i32>* %A
233  %tmp2 = load <2 x i32>, <2 x i32>* %B
234  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
235  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
236  %tmp5 = add <2 x i64> %tmp3, %tmp4
237  ret <2 x i64> %tmp5
238}
239
240
241define <8 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind  {
242; CHECK-LABEL: uaddl2_8h:
243; CHECK-NEXT: uaddl2.8h v0, v0, v1
244; CHECK-NEXT: ret
245  %tmp = bitcast <16 x i8> %a to <2 x i64>
246  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
247  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
248  %vmovl.i.i.i = zext <8 x i8> %tmp1 to <8 x i16>
249  %tmp2 = bitcast <16 x i8> %b to <2 x i64>
250  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
251  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <8 x i8>
252  %vmovl.i.i5.i = zext <8 x i8> %tmp3 to <8 x i16>
253  %add.i = add <8 x i16> %vmovl.i.i.i, %vmovl.i.i5.i
254  ret <8 x i16> %add.i
255}
256
257define <4 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind  {
258; CHECK-LABEL: uaddl2_4s:
259; CHECK-NEXT: uaddl2.4s v0, v0, v1
260; CHECK-NEXT: ret
261  %tmp = bitcast <8 x i16> %a to <2 x i64>
262  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
263  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
264  %vmovl.i.i.i = zext <4 x i16> %tmp1 to <4 x i32>
265  %tmp2 = bitcast <8 x i16> %b to <2 x i64>
266  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
267  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <4 x i16>
268  %vmovl.i.i5.i = zext <4 x i16> %tmp3 to <4 x i32>
269  %add.i = add <4 x i32> %vmovl.i.i.i, %vmovl.i.i5.i
270  ret <4 x i32> %add.i
271}
272
273define <2 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind  {
274; CHECK-LABEL: uaddl2_2d:
275; CHECK-NEXT: uaddl2.2d v0, v0, v1
276; CHECK-NEXT: ret
277  %tmp = bitcast <4 x i32> %a to <2 x i64>
278  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
279  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
280  %vmovl.i.i.i = zext <2 x i32> %tmp1 to <2 x i64>
281  %tmp2 = bitcast <4 x i32> %b to <2 x i64>
282  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
283  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <2 x i32>
284  %vmovl.i.i5.i = zext <2 x i32> %tmp3 to <2 x i64>
285  %add.i = add <2 x i64> %vmovl.i.i.i, %vmovl.i.i5.i
286  ret <2 x i64> %add.i
287}
288
289define <8 x i16> @uaddw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
290;CHECK-LABEL: uaddw8h:
291;CHECK: uaddw.8h
292        %tmp1 = load <8 x i16>, <8 x i16>* %A
293        %tmp2 = load <8 x i8>, <8 x i8>* %B
294  %tmp3 = zext <8 x i8> %tmp2 to <8 x i16>
295  %tmp4 = add <8 x i16> %tmp1, %tmp3
296        ret <8 x i16> %tmp4
297}
298
299define <4 x i32> @uaddw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
300;CHECK-LABEL: uaddw4s:
301;CHECK: uaddw.4s
302        %tmp1 = load <4 x i32>, <4 x i32>* %A
303        %tmp2 = load <4 x i16>, <4 x i16>* %B
304  %tmp3 = zext <4 x i16> %tmp2 to <4 x i32>
305  %tmp4 = add <4 x i32> %tmp1, %tmp3
306        ret <4 x i32> %tmp4
307}
308
309define <2 x i64> @uaddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
310;CHECK-LABEL: uaddw2d:
311;CHECK: uaddw.2d
312        %tmp1 = load <2 x i64>, <2 x i64>* %A
313        %tmp2 = load <2 x i32>, <2 x i32>* %B
314  %tmp3 = zext <2 x i32> %tmp2 to <2 x i64>
315  %tmp4 = add <2 x i64> %tmp1, %tmp3
316        ret <2 x i64> %tmp4
317}
318
319define <8 x i16> @uaddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
320;CHECK-LABEL: uaddw2_8h:
321;CHECK: uaddw.8h
322        %tmp1 = load <8 x i16>, <8 x i16>* %A
323
324        %tmp2 = load <16 x i8>, <16 x i8>* %B
325        %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
326        %ext2 = zext <8 x i8> %high2 to <8 x i16>
327
328        %res = add <8 x i16> %tmp1, %ext2
329        ret <8 x i16> %res
330}
331
332define <4 x i32> @uaddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
333;CHECK-LABEL: uaddw2_4s:
334;CHECK: uaddw.4s
335        %tmp1 = load <4 x i32>, <4 x i32>* %A
336
337        %tmp2 = load <8 x i16>, <8 x i16>* %B
338        %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
339        %ext2 = zext <4 x i16> %high2 to <4 x i32>
340
341        %res = add <4 x i32> %tmp1, %ext2
342        ret <4 x i32> %res
343}
344
345define <2 x i64> @uaddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
346;CHECK-LABEL: uaddw2_2d:
347;CHECK: uaddw.2d
348        %tmp1 = load <2 x i64>, <2 x i64>* %A
349
350        %tmp2 = load <4 x i32>, <4 x i32>* %B
351        %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
352        %ext2 = zext <2 x i32> %high2 to <2 x i64>
353
354        %res = add <2 x i64> %tmp1, %ext2
355        ret <2 x i64> %res
356}
357
358define <8 x i16> @saddw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
359;CHECK-LABEL: saddw8h:
360;CHECK: saddw.8h
361        %tmp1 = load <8 x i16>, <8 x i16>* %A
362        %tmp2 = load <8 x i8>, <8 x i8>* %B
363        %tmp3 = sext <8 x i8> %tmp2 to <8 x i16>
364        %tmp4 = add <8 x i16> %tmp1, %tmp3
365        ret <8 x i16> %tmp4
366}
367
368define <4 x i32> @saddw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
369;CHECK-LABEL: saddw4s:
370;CHECK: saddw.4s
371        %tmp1 = load <4 x i32>, <4 x i32>* %A
372        %tmp2 = load <4 x i16>, <4 x i16>* %B
373        %tmp3 = sext <4 x i16> %tmp2 to <4 x i32>
374        %tmp4 = add <4 x i32> %tmp1, %tmp3
375        ret <4 x i32> %tmp4
376}
377
378define <2 x i64> @saddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
379;CHECK-LABEL: saddw2d:
380;CHECK: saddw.2d
381        %tmp1 = load <2 x i64>, <2 x i64>* %A
382        %tmp2 = load <2 x i32>, <2 x i32>* %B
383        %tmp3 = sext <2 x i32> %tmp2 to <2 x i64>
384        %tmp4 = add <2 x i64> %tmp1, %tmp3
385        ret <2 x i64> %tmp4
386}
387
388define <8 x i16> @saddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
389;CHECK-LABEL: saddw2_8h:
390;CHECK: saddw.8h
391        %tmp1 = load <8 x i16>, <8 x i16>* %A
392
393        %tmp2 = load <16 x i8>, <16 x i8>* %B
394        %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
395        %ext2 = sext <8 x i8> %high2 to <8 x i16>
396
397        %res = add <8 x i16> %tmp1, %ext2
398        ret <8 x i16> %res
399}
400
401define <4 x i32> @saddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
402;CHECK-LABEL: saddw2_4s:
403;CHECK: saddw.4s
404        %tmp1 = load <4 x i32>, <4 x i32>* %A
405
406        %tmp2 = load <8 x i16>, <8 x i16>* %B
407        %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
408        %ext2 = sext <4 x i16> %high2 to <4 x i32>
409
410        %res = add <4 x i32> %tmp1, %ext2
411        ret <4 x i32> %res
412}
413
414define <2 x i64> @saddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
415;CHECK-LABEL: saddw2_2d:
416;CHECK: saddw.2d
417        %tmp1 = load <2 x i64>, <2 x i64>* %A
418
419        %tmp2 = load <4 x i32>, <4 x i32>* %B
420        %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
421        %ext2 = sext <2 x i32> %high2 to <2 x i64>
422
423        %res = add <2 x i64> %tmp1, %ext2
424        ret <2 x i64> %res
425}
426
427define <4 x i16> @saddlp4h(<8 x i8>* %A) nounwind {
428;CHECK-LABEL: saddlp4h:
429;CHECK: saddlp.4h
430        %tmp1 = load <8 x i8>, <8 x i8>* %A
431        %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
432        ret <4 x i16> %tmp3
433}
434
435define <2 x i32> @saddlp2s(<4 x i16>* %A) nounwind {
436;CHECK-LABEL: saddlp2s:
437;CHECK: saddlp.2s
438        %tmp1 = load <4 x i16>, <4 x i16>* %A
439        %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
440        ret <2 x i32> %tmp3
441}
442
443define <1 x i64> @saddlp1d(<2 x i32>* %A) nounwind {
444;CHECK-LABEL: saddlp1d:
445;CHECK: saddlp.1d
446        %tmp1 = load <2 x i32>, <2 x i32>* %A
447        %tmp3 = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %tmp1)
448        ret <1 x i64> %tmp3
449}
450
451define <8 x i16> @saddlp8h(<16 x i8>* %A) nounwind {
452;CHECK-LABEL: saddlp8h:
453;CHECK: saddlp.8h
454        %tmp1 = load <16 x i8>, <16 x i8>* %A
455        %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
456        ret <8 x i16> %tmp3
457}
458
459define <4 x i32> @saddlp4s(<8 x i16>* %A) nounwind {
460;CHECK-LABEL: saddlp4s:
461;CHECK: saddlp.4s
462        %tmp1 = load <8 x i16>, <8 x i16>* %A
463        %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
464        ret <4 x i32> %tmp3
465}
466
467define <2 x i64> @saddlp2d(<4 x i32>* %A) nounwind {
468;CHECK-LABEL: saddlp2d:
469;CHECK: saddlp.2d
470        %tmp1 = load <4 x i32>, <4 x i32>* %A
471        %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
472        ret <2 x i64> %tmp3
473}
474
475declare <4 x i16>  @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
476declare <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
477declare <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32>) nounwind readnone
478
479declare <8 x i16>  @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
480declare <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
481declare <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
482
483define <4 x i16> @uaddlp4h(<8 x i8>* %A) nounwind {
484;CHECK-LABEL: uaddlp4h:
485;CHECK: uaddlp.4h
486        %tmp1 = load <8 x i8>, <8 x i8>* %A
487        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
488        ret <4 x i16> %tmp3
489}
490
491define <2 x i32> @uaddlp2s(<4 x i16>* %A) nounwind {
492;CHECK-LABEL: uaddlp2s:
493;CHECK: uaddlp.2s
494        %tmp1 = load <4 x i16>, <4 x i16>* %A
495        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
496        ret <2 x i32> %tmp3
497}
498
499define <1 x i64> @uaddlp1d(<2 x i32>* %A) nounwind {
500;CHECK-LABEL: uaddlp1d:
501;CHECK: uaddlp.1d
502        %tmp1 = load <2 x i32>, <2 x i32>* %A
503        %tmp3 = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %tmp1)
504        ret <1 x i64> %tmp3
505}
506
507define <8 x i16> @uaddlp8h(<16 x i8>* %A) nounwind {
508;CHECK-LABEL: uaddlp8h:
509;CHECK: uaddlp.8h
510        %tmp1 = load <16 x i8>, <16 x i8>* %A
511        %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
512        ret <8 x i16> %tmp3
513}
514
515define <4 x i32> @uaddlp4s(<8 x i16>* %A) nounwind {
516;CHECK-LABEL: uaddlp4s:
517;CHECK: uaddlp.4s
518        %tmp1 = load <8 x i16>, <8 x i16>* %A
519        %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
520        ret <4 x i32> %tmp3
521}
522
523define <2 x i64> @uaddlp2d(<4 x i32>* %A) nounwind {
524;CHECK-LABEL: uaddlp2d:
525;CHECK: uaddlp.2d
526        %tmp1 = load <4 x i32>, <4 x i32>* %A
527        %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
528        ret <2 x i64> %tmp3
529}
530
531declare <4 x i16>  @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
532declare <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
533declare <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32>) nounwind readnone
534
535declare <8 x i16>  @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
536declare <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
537declare <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
538
539define <4 x i16> @sadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind {
540;CHECK-LABEL: sadalp4h:
541;CHECK: sadalp.4h
542        %tmp1 = load <8 x i8>, <8 x i8>* %A
543        %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
544        %tmp4 = load <4 x i16>, <4 x i16>* %B
545        %tmp5 = add <4 x i16> %tmp3, %tmp4
546        ret <4 x i16> %tmp5
547}
548
549define <2 x i32> @sadalp2s(<4 x i16>* %A, <2 x i32>* %B) nounwind {
550;CHECK-LABEL: sadalp2s:
551;CHECK: sadalp.2s
552        %tmp1 = load <4 x i16>, <4 x i16>* %A
553        %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
554        %tmp4 = load <2 x i32>, <2 x i32>* %B
555        %tmp5 = add <2 x i32> %tmp3, %tmp4
556        ret <2 x i32> %tmp5
557}
558
559define <8 x i16> @sadalp8h(<16 x i8>* %A, <8 x i16>* %B) nounwind {
560;CHECK-LABEL: sadalp8h:
561;CHECK: sadalp.8h
562        %tmp1 = load <16 x i8>, <16 x i8>* %A
563        %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
564        %tmp4 = load <8 x i16>, <8 x i16>* %B
565        %tmp5 = add <8 x i16> %tmp3, %tmp4
566        ret <8 x i16> %tmp5
567}
568
569define <4 x i32> @sadalp4s(<8 x i16>* %A, <4 x i32>* %B) nounwind {
570;CHECK-LABEL: sadalp4s:
571;CHECK: sadalp.4s
572        %tmp1 = load <8 x i16>, <8 x i16>* %A
573        %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
574        %tmp4 = load <4 x i32>, <4 x i32>* %B
575        %tmp5 = add <4 x i32> %tmp3, %tmp4
576        ret <4 x i32> %tmp5
577}
578
579define <2 x i64> @sadalp2d(<4 x i32>* %A, <2 x i64>* %B) nounwind {
580;CHECK-LABEL: sadalp2d:
581;CHECK: sadalp.2d
582        %tmp1 = load <4 x i32>, <4 x i32>* %A
583        %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
584        %tmp4 = load <2 x i64>, <2 x i64>* %B
585        %tmp5 = add <2 x i64> %tmp3, %tmp4
586        ret <2 x i64> %tmp5
587}
588
589define <4 x i16> @uadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind {
590;CHECK-LABEL: uadalp4h:
591;CHECK: uadalp.4h
592        %tmp1 = load <8 x i8>, <8 x i8>* %A
593        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
594        %tmp4 = load <4 x i16>, <4 x i16>* %B
595        %tmp5 = add <4 x i16> %tmp3, %tmp4
596        ret <4 x i16> %tmp5
597}
598
599define <2 x i32> @uadalp2s(<4 x i16>* %A, <2 x i32>* %B) nounwind {
600;CHECK-LABEL: uadalp2s:
601;CHECK: uadalp.2s
602        %tmp1 = load <4 x i16>, <4 x i16>* %A
603        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
604        %tmp4 = load <2 x i32>, <2 x i32>* %B
605        %tmp5 = add <2 x i32> %tmp3, %tmp4
606        ret <2 x i32> %tmp5
607}
608
609define <8 x i16> @uadalp8h(<16 x i8>* %A, <8 x i16>* %B) nounwind {
610;CHECK-LABEL: uadalp8h:
611;CHECK: uadalp.8h
612        %tmp1 = load <16 x i8>, <16 x i8>* %A
613        %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
614        %tmp4 = load <8 x i16>, <8 x i16>* %B
615        %tmp5 = add <8 x i16> %tmp3, %tmp4
616        ret <8 x i16> %tmp5
617}
618
619define <4 x i32> @uadalp4s(<8 x i16>* %A, <4 x i32>* %B) nounwind {
620;CHECK-LABEL: uadalp4s:
621;CHECK: uadalp.4s
622        %tmp1 = load <8 x i16>, <8 x i16>* %A
623        %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
624        %tmp4 = load <4 x i32>, <4 x i32>* %B
625        %tmp5 = add <4 x i32> %tmp3, %tmp4
626        ret <4 x i32> %tmp5
627}
628
629define <2 x i64> @uadalp2d(<4 x i32>* %A, <2 x i64>* %B) nounwind {
630;CHECK-LABEL: uadalp2d:
631;CHECK: uadalp.2d
632        %tmp1 = load <4 x i32>, <4 x i32>* %A
633        %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
634        %tmp4 = load <2 x i64>, <2 x i64>* %B
635        %tmp5 = add <2 x i64> %tmp3, %tmp4
636        ret <2 x i64> %tmp5
637}
638
639define <8 x i8> @addp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
640;CHECK-LABEL: addp_8b:
641;CHECK: addp.8b
642        %tmp1 = load <8 x i8>, <8 x i8>* %A
643        %tmp2 = load <8 x i8>, <8 x i8>* %B
644        %tmp3 = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
645        ret <8 x i8> %tmp3
646}
647
648define <16 x i8> @addp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
649;CHECK-LABEL: addp_16b:
650;CHECK: addp.16b
651        %tmp1 = load <16 x i8>, <16 x i8>* %A
652        %tmp2 = load <16 x i8>, <16 x i8>* %B
653        %tmp3 = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
654        ret <16 x i8> %tmp3
655}
656
657define <4 x i16> @addp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
658;CHECK-LABEL: addp_4h:
659;CHECK: addp.4h
660        %tmp1 = load <4 x i16>, <4 x i16>* %A
661        %tmp2 = load <4 x i16>, <4 x i16>* %B
662        %tmp3 = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
663        ret <4 x i16> %tmp3
664}
665
666define <8 x i16> @addp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
667;CHECK-LABEL: addp_8h:
668;CHECK: addp.8h
669        %tmp1 = load <8 x i16>, <8 x i16>* %A
670        %tmp2 = load <8 x i16>, <8 x i16>* %B
671        %tmp3 = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
672        ret <8 x i16> %tmp3
673}
674
675define <2 x i32> @addp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
676;CHECK-LABEL: addp_2s:
677;CHECK: addp.2s
678        %tmp1 = load <2 x i32>, <2 x i32>* %A
679        %tmp2 = load <2 x i32>, <2 x i32>* %B
680        %tmp3 = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
681        ret <2 x i32> %tmp3
682}
683
684define <4 x i32> @addp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
685;CHECK-LABEL: addp_4s:
686;CHECK: addp.4s
687        %tmp1 = load <4 x i32>, <4 x i32>* %A
688        %tmp2 = load <4 x i32>, <4 x i32>* %B
689        %tmp3 = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
690        ret <4 x i32> %tmp3
691}
692
693define <2 x i64> @addp_2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
694;CHECK-LABEL: addp_2d:
695;CHECK: addp.2d
696        %tmp1 = load <2 x i64>, <2 x i64>* %A
697        %tmp2 = load <2 x i64>, <2 x i64>* %B
698        %tmp3 = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
699        ret <2 x i64> %tmp3
700}
701
702declare <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
703declare <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
704declare <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
705declare <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
706declare <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
707declare <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
708declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
709
710define <2 x float> @faddp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
711;CHECK-LABEL: faddp_2s:
712;CHECK: faddp.2s
713        %tmp1 = load <2 x float>, <2 x float>* %A
714        %tmp2 = load <2 x float>, <2 x float>* %B
715        %tmp3 = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
716        ret <2 x float> %tmp3
717}
718
719define <4 x float> @faddp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
720;CHECK-LABEL: faddp_4s:
721;CHECK: faddp.4s
722        %tmp1 = load <4 x float>, <4 x float>* %A
723        %tmp2 = load <4 x float>, <4 x float>* %B
724        %tmp3 = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
725        ret <4 x float> %tmp3
726}
727
728define <2 x double> @faddp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
729;CHECK-LABEL: faddp_2d:
730;CHECK: faddp.2d
731        %tmp1 = load <2 x double>, <2 x double>* %A
732        %tmp2 = load <2 x double>, <2 x double>* %B
733        %tmp3 = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
734        ret <2 x double> %tmp3
735}
736
737declare <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float>, <2 x float>) nounwind readnone
738declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>) nounwind readnone
739declare <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double>, <2 x double>) nounwind readnone
740
741define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
742; CHECK-LABEL: uaddl2_duprhs
743; CHECK-NOT: ext.16b
744; CHECK: uaddl2.2d
745  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
746  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
747
748  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
749
750  %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
751  %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
752
753  %res = add <2 x i64> %lhs.ext, %rhs.ext
754  ret <2 x i64> %res
755}
756
757define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
758; CHECK-LABEL: saddl2_duplhs
759; CHECK-NOT: ext.16b
760; CHECK: saddl2.2d
761  %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
762  %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
763
764  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
765
766  %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
767  %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
768
769  %res = add <2 x i64> %lhs.ext, %rhs.ext
770  ret <2 x i64> %res
771}
772
773define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
774; CHECK-LABEL: usubl2_duprhs
775; CHECK-NOT: ext.16b
776; CHECK: usubl2.2d
777  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
778  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
779
780  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
781
782  %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
783  %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
784
785  %res = sub <2 x i64> %lhs.ext, %rhs.ext
786  ret <2 x i64> %res
787}
788
789define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
790; CHECK-LABEL: ssubl2_duplhs
791; CHECK-NOT: ext.16b
792; CHECK: ssubl2.2d
793  %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
794  %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
795
796  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
797
798  %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
799  %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
800
801  %res = sub <2 x i64> %lhs.ext, %rhs.ext
802  ret <2 x i64> %res
803}
804
805define <8 x i8> @addhn8b_natural(<8 x i16>* %A, <8 x i16>* %B) nounwind {
806;CHECK-LABEL: addhn8b_natural:
807;CHECK: addhn.8b
808        %tmp1 = load <8 x i16>, <8 x i16>* %A
809        %tmp2 = load <8 x i16>, <8 x i16>* %B
810        %sum = add <8 x i16> %tmp1, %tmp2
811        %high_bits = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
812        %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
813        ret <8 x i8> %narrowed
814}
815
816define <4 x i16> @addhn4h_natural(<4 x i32>* %A, <4 x i32>* %B) nounwind {
817;CHECK-LABEL: addhn4h_natural:
818;CHECK: addhn.4h
819        %tmp1 = load <4 x i32>, <4 x i32>* %A
820        %tmp2 = load <4 x i32>, <4 x i32>* %B
821        %sum = add <4 x i32> %tmp1, %tmp2
822        %high_bits = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
823        %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
824        ret <4 x i16> %narrowed
825}
826
827define <2 x i32> @addhn2s_natural(<2 x i64>* %A, <2 x i64>* %B) nounwind {
828;CHECK-LABEL: addhn2s_natural:
829;CHECK: addhn.2s
830        %tmp1 = load <2 x i64>, <2 x i64>* %A
831        %tmp2 = load <2 x i64>, <2 x i64>* %B
832        %sum = add <2 x i64> %tmp1, %tmp2
833        %high_bits = lshr <2 x i64> %sum, <i64 32, i64 32>
834        %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
835        ret <2 x i32> %narrowed
836}
837
838define <16 x i8> @addhn2_16b_natural(<8 x i8> %low, <8 x i16>* %A, <8 x i16>* %B) nounwind {
839;CHECK-LABEL: addhn2_16b_natural:
840;CHECK: addhn2.16b
841        %tmp1 = load <8 x i16>, <8 x i16>* %A
842        %tmp2 = load <8 x i16>, <8 x i16>* %B
843        %sum = add <8 x i16> %tmp1, %tmp2
844        %high_bits = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
845        %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
846        %res = shufflevector <8 x i8> %low, <8 x i8> %narrowed, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
847        ret <16 x i8> %res
848}
849
850define <8 x i16> @addhn2_8h_natural(<4 x i16> %low, <4 x i32>* %A, <4 x i32>* %B) nounwind {
851;CHECK-LABEL: addhn2_8h_natural:
852;CHECK: addhn2.8h
853        %tmp1 = load <4 x i32>, <4 x i32>* %A
854        %tmp2 = load <4 x i32>, <4 x i32>* %B
855        %sum = add <4 x i32> %tmp1, %tmp2
856        %high_bits = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
857        %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
858        %res = shufflevector <4 x i16> %low, <4 x i16> %narrowed, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
859        ret <8 x i16> %res
860}
861
862define <4 x i32> @addhn2_4s_natural(<2 x i32> %low, <2 x i64>* %A, <2 x i64>* %B) nounwind {
863;CHECK-LABEL: addhn2_4s_natural:
864;CHECK: addhn2.4s
865        %tmp1 = load <2 x i64>, <2 x i64>* %A
866        %tmp2 = load <2 x i64>, <2 x i64>* %B
867        %sum = add <2 x i64> %tmp1, %tmp2
868        %high_bits = lshr <2 x i64> %sum, <i64 32, i64 32>
869        %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
870        %res = shufflevector <2 x i32> %low, <2 x i32> %narrowed, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
871        ret <4 x i32> %res
872}
873
874define <8 x i8> @subhn8b_natural(<8 x i16>* %A, <8 x i16>* %B) nounwind {
875;CHECK-LABEL: subhn8b_natural:
876;CHECK: subhn.8b
877        %tmp1 = load <8 x i16>, <8 x i16>* %A
878        %tmp2 = load <8 x i16>, <8 x i16>* %B
879        %diff = sub <8 x i16> %tmp1, %tmp2
880        %high_bits = lshr <8 x i16> %diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
881        %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
882        ret <8 x i8> %narrowed
883}
884
885define <4 x i16> @subhn4h_natural(<4 x i32>* %A, <4 x i32>* %B) nounwind {
886;CHECK-LABEL: subhn4h_natural:
887;CHECK: subhn.4h
888        %tmp1 = load <4 x i32>, <4 x i32>* %A
889        %tmp2 = load <4 x i32>, <4 x i32>* %B
890        %diff = sub <4 x i32> %tmp1, %tmp2
891        %high_bits = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
892        %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
893        ret <4 x i16> %narrowed
894}
895
896define <2 x i32> @subhn2s_natural(<2 x i64>* %A, <2 x i64>* %B) nounwind {
897;CHECK-LABEL: subhn2s_natural:
898;CHECK: subhn.2s
899        %tmp1 = load <2 x i64>, <2 x i64>* %A
900        %tmp2 = load <2 x i64>, <2 x i64>* %B
901        %diff = sub <2 x i64> %tmp1, %tmp2
902        %high_bits = lshr <2 x i64> %diff, <i64 32, i64 32>
903        %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
904        ret <2 x i32> %narrowed
905}
906
907define <16 x i8> @subhn2_16b_natural(<8 x i8> %low, <8 x i16>* %A, <8 x i16>* %B) nounwind {
908;CHECK-LABEL: subhn2_16b_natural:
909;CHECK: subhn2.16b
910        %tmp1 = load <8 x i16>, <8 x i16>* %A
911        %tmp2 = load <8 x i16>, <8 x i16>* %B
912        %diff = sub <8 x i16> %tmp1, %tmp2
913        %high_bits = lshr <8 x i16> %diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
914        %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
915        %res = shufflevector <8 x i8> %low, <8 x i8> %narrowed, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
916        ret <16 x i8> %res
917}
918
919define <8 x i16> @subhn2_8h_natural(<4 x i16> %low, <4 x i32>* %A, <4 x i32>* %B) nounwind {
920;CHECK-LABEL: subhn2_8h_natural:
921;CHECK: subhn2.8h
922        %tmp1 = load <4 x i32>, <4 x i32>* %A
923        %tmp2 = load <4 x i32>, <4 x i32>* %B
924        %diff = sub <4 x i32> %tmp1, %tmp2
925        %high_bits = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
926        %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
927        %res = shufflevector <4 x i16> %low, <4 x i16> %narrowed, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
928        ret <8 x i16> %res
929}
930
931define <4 x i32> @subhn2_4s_natural(<2 x i32> %low, <2 x i64>* %A, <2 x i64>* %B) nounwind {
932;CHECK-LABEL: subhn2_4s_natural:
933;CHECK: subhn2.4s
934        %tmp1 = load <2 x i64>, <2 x i64>* %A
935        %tmp2 = load <2 x i64>, <2 x i64>* %B
936        %diff = sub <2 x i64> %tmp1, %tmp2
937        %high_bits = lshr <2 x i64> %diff, <i64 32, i64 32>
938        %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
939        %res = shufflevector <2 x i32> %low, <2 x i32> %narrowed, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
940        ret <4 x i32> %res
941}
942