1; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
2
3
; SABDL: a signed absolute-difference intrinsic on 64-bit vectors whose
; result is zero-extended (the abs-diff result is always non-negative, so
; zext matches the widening instruction) must select a single sabdl.
; The sabdl2_* variants feed the intrinsic from the HIGH halves of 128-bit
; vectors via shufflevector and must select the second-half form, sabdl2.
define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sabdl8h:
;CHECK: sabdl.8h
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp2 = load <8 x i8>, <8 x i8>* %B
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
        ret <8 x i16> %tmp4
}

define <4 x i32> @sabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sabdl4s:
;CHECK: sabdl.4s
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp2 = load <4 x i16>, <4 x i16>* %B
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
        ret <4 x i32> %tmp4
}

define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sabdl2d:
;CHECK: sabdl.2d
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp2 = load <2 x i32>, <2 x i32>* %B
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
        ret <2 x i64> %tmp4
}

; High-half (elements 8..15) extraction + sabd + zext -> sabdl2.8h.
define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: sabdl2_8h:
;CHECK: sabdl2.8h
        %load1 = load <16 x i8>, <16 x i8>* %A
        %load2 = load <16 x i8>, <16 x i8>* %B
        %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
        %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
        ret <8 x i16> %tmp4
}

define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sabdl2_4s:
;CHECK: sabdl2.4s
        %load1 = load <8 x i16>, <8 x i16>* %A
        %load2 = load <8 x i16>, <8 x i16>* %B
        %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
        %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
        ret <4 x i32> %tmp4
}

define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sabdl2_2d:
;CHECK: sabdl2.2d
        %load1 = load <4 x i32>, <4 x i32>* %A
        %load2 = load <4 x i32>, <4 x i32>* %B
        %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
        %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
        ret <2 x i64> %tmp4
}
69
; UABDL: unsigned absolute-difference intrinsic + zext of the result must
; select the widening uabdl instruction (low-half operands).
define <8 x i16> @uabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uabdl8h:
;CHECK: uabdl.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

define <4 x i32> @uabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uabdl4s:
;CHECK: uabdl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uabdl2d:
;CHECK: uabdl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}
99
; High-half (elements 8..15) extraction + uabd + zext -> uabdl2.8h.
; (Cleanup: dropped a stray blank line between the shuffles and the call
; for consistency with the sibling uabdl2_* tests; no semantic change.)
define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uabdl2_8h:
;CHECK: uabdl2.8h
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}
112
; High-half extraction + uabd + zext -> uabdl2 (second-half widening form).
define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uabdl2_4s:
;CHECK: uabdl2.4s
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uabdl2_2d:
;CHECK: uabdl2.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}
136
; Vector floating-point absolute difference: the fabd intrinsic maps
; directly onto the fabd instruction for each vector arrangement.
define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fabd_2s:
;CHECK: fabd.2s
        %tmp1 = load <2 x float>, <2 x float>* %A
        %tmp2 = load <2 x float>, <2 x float>* %B
        %tmp3 = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
        ret <2 x float> %tmp3
}

define <4 x float> @fabd_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: fabd_4s:
;CHECK: fabd.4s
        %tmp1 = load <4 x float>, <4 x float>* %A
        %tmp2 = load <4 x float>, <4 x float>* %B
        %tmp3 = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
        ret <4 x float> %tmp3
}

define <2 x double> @fabd_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: fabd_2d:
;CHECK: fabd.2d
        %tmp1 = load <2 x double>, <2 x double>* %A
        %tmp2 = load <2 x double>, <2 x double>* %B
        %tmp3 = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
        ret <2 x double> %tmp3
}
163
164declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
165declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
166declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone
167
; Non-widening SABD: the sabd intrinsic alone (no zext) selects the plain
; same-width sabd instruction for each vector arrangement.
define <8 x i8> @sabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sabd_8b:
;CHECK: sabd.8b
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp2 = load <8 x i8>, <8 x i8>* %B
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        ret <8 x i8> %tmp3
}

define <16 x i8> @sabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: sabd_16b:
;CHECK: sabd.16b
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp2 = load <16 x i8>, <16 x i8>* %B
        %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
        ret <16 x i8> %tmp3
}

define <4 x i16> @sabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sabd_4h:
;CHECK: sabd.4h
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp2 = load <4 x i16>, <4 x i16>* %B
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        ret <4 x i16> %tmp3
}

define <8 x i16> @sabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sabd_8h:
;CHECK: sabd.8h
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp2 = load <8 x i16>, <8 x i16>* %B
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
        ret <8 x i16> %tmp3
}

define <2 x i32> @sabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sabd_2s:
;CHECK: sabd.2s
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp2 = load <2 x i32>, <2 x i32>* %B
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        ret <2 x i32> %tmp3
}

define <4 x i32> @sabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sabd_4s:
;CHECK: sabd.4s
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp2 = load <4 x i32>, <4 x i32>* %B
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
        ret <4 x i32> %tmp3
}
221
222declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
223declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
224declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
225declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
226declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
227declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
228
; Non-widening UABD: the uabd intrinsic alone (no zext) selects the plain
; same-width uabd instruction for each vector arrangement.
define <8 x i8> @uabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uabd_8b:
;CHECK: uabd.8b
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp2 = load <8 x i8>, <8 x i8>* %B
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        ret <8 x i8> %tmp3
}

define <16 x i8> @uabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uabd_16b:
;CHECK: uabd.16b
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp2 = load <16 x i8>, <16 x i8>* %B
        %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
        ret <16 x i8> %tmp3
}

define <4 x i16> @uabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uabd_4h:
;CHECK: uabd.4h
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp2 = load <4 x i16>, <4 x i16>* %B
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        ret <4 x i16> %tmp3
}

define <8 x i16> @uabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uabd_8h:
;CHECK: uabd.8h
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp2 = load <8 x i16>, <8 x i16>* %B
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
        ret <8 x i16> %tmp3
}

define <2 x i32> @uabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uabd_2s:
;CHECK: uabd.2s
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp2 = load <2 x i32>, <2 x i32>* %B
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        ret <2 x i32> %tmp3
}

define <4 x i32> @uabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uabd_4s:
;CHECK: uabd.4s
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp2 = load <4 x i32>, <4 x i32>* %B
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
        ret <4 x i32> %tmp3
}
282
283declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
284declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
285declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
286declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
287declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
288declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
289
; SQABS: the saturating-absolute-value intrinsic selects sqabs for each
; vector arrangement.
define <8 x i8> @sqabs_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqabs_8b:
;CHECK: sqabs.8b
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %tmp1)
        ret <8 x i8> %tmp3
}

define <16 x i8> @sqabs_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqabs_16b:
;CHECK: sqabs.16b
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %tmp1)
        ret <16 x i8> %tmp3
}

define <4 x i16> @sqabs_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqabs_4h:
;CHECK: sqabs.4h
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %tmp1)
        ret <4 x i16> %tmp3
}

define <8 x i16> @sqabs_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqabs_8h:
;CHECK: sqabs.8h
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %tmp1)
        ret <8 x i16> %tmp3
}

define <2 x i32> @sqabs_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqabs_2s:
;CHECK: sqabs.2s
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %tmp1)
        ret <2 x i32> %tmp3
}

define <4 x i32> @sqabs_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqabs_4s:
;CHECK: sqabs.4s
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %tmp1)
        ret <4 x i32> %tmp3
}
337
338declare <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
339declare <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
340declare <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
341declare <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
342declare <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
343declare <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone
344
; SQNEG: the saturating-negate intrinsic selects sqneg for each vector
; arrangement.
define <8 x i8> @sqneg_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqneg_8b:
;CHECK: sqneg.8b
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %tmp1)
        ret <8 x i8> %tmp3
}

define <16 x i8> @sqneg_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqneg_16b:
;CHECK: sqneg.16b
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %tmp1)
        ret <16 x i8> %tmp3
}

define <4 x i16> @sqneg_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqneg_4h:
;CHECK: sqneg.4h
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %tmp1)
        ret <4 x i16> %tmp3
}

define <8 x i16> @sqneg_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqneg_8h:
;CHECK: sqneg.8h
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %tmp1)
        ret <8 x i16> %tmp3
}

define <2 x i32> @sqneg_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqneg_2s:
;CHECK: sqneg.2s
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %tmp1)
        ret <2 x i32> %tmp3
}

define <4 x i32> @sqneg_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqneg_4s:
;CHECK: sqneg.4s
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %tmp1)
        ret <4 x i32> %tmp3
}
392
393declare <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
394declare <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
395declare <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
396declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
397declare <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
398declare <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone
399
; ABS: the integer absolute-value intrinsic selects abs for each vector
; arrangement, plus the scalar d-register forms (v1i64 and plain i64).
define <8 x i8> @abs_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: abs_8b:
;CHECK: abs.8b
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %tmp1)
        ret <8 x i8> %tmp3
}

define <16 x i8> @abs_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: abs_16b:
;CHECK: abs.16b
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp3 = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %tmp1)
        ret <16 x i8> %tmp3
}

define <4 x i16> @abs_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: abs_4h:
;CHECK: abs.4h
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %tmp1)
        ret <4 x i16> %tmp3
}

define <8 x i16> @abs_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: abs_8h:
;CHECK: abs.8h
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %tmp1)
        ret <8 x i16> %tmp3
}

define <2 x i32> @abs_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: abs_2s:
;CHECK: abs.2s
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %tmp1)
        ret <2 x i32> %tmp3
}

define <4 x i32> @abs_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: abs_4s:
;CHECK: abs.4s
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %tmp1)
        ret <4 x i32> %tmp3
}

; Scalar abs on a single-element vector uses the d-register form.
define <1 x i64> @abs_1d(<1 x i64> %A) nounwind {
; CHECK-LABEL: abs_1d:
; CHECK: abs d0, d0
  %abs = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %A)
  ret <1 x i64> %abs
}

; Scalar abs on a plain i64 also selects the d-register abs instruction.
define i64 @abs_1d_honestly(i64 %A) nounwind {
; CHECK-LABEL: abs_1d_honestly:
; CHECK: abs d0, d0
  %abs = call i64 @llvm.aarch64.neon.abs.i64(i64 %A)
  ret i64 %abs
}
461
462declare <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8>) nounwind readnone
463declare <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8>) nounwind readnone
464declare <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16>) nounwind readnone
465declare <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16>) nounwind readnone
466declare <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32>) nounwind readnone
467declare <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32>) nounwind readnone
468declare <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64>) nounwind readnone
469declare i64 @llvm.aarch64.neon.abs.i64(i64) nounwind readnone
470
; SABAL: sabd + zext + accumulate-add must fuse into the widening signed
; absolute-difference-and-accumulate instruction.
define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B,  <8 x i16>* %C) nounwind {
;CHECK-LABEL: sabal8h:
;CHECK: sabal.8h
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp2 = load <8 x i8>, <8 x i8>* %B
        %tmp3 = load <8 x i16>, <8 x i16>* %C
        %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
        %tmp5 = add <8 x i16> %tmp3, %tmp4.1
        ret <8 x i16> %tmp5
}

define <4 x i32> @sabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sabal4s:
;CHECK: sabal.4s
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp2 = load <4 x i16>, <4 x i16>* %B
        %tmp3 = load <4 x i32>, <4 x i32>* %C
        %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
        %tmp5 = add <4 x i32> %tmp3, %tmp4.1
        ret <4 x i32> %tmp5
}
494
; SABAL (2d): sabd + zext + accumulate-add must fuse into sabal.2d.
; (Cleanup: removed a duplicated, unused zext of %tmp4 — dead code left
; over from an earlier edit; the selected output is unaffected.)
define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sabal2d:
;CHECK: sabal.2d
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp2 = load <2 x i32>, <2 x i32>* %B
        %tmp3 = load <2 x i64>, <2 x i64>* %C
        %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
        %tmp5 = add <2 x i64> %tmp3, %tmp4.1
        ret <2 x i64> %tmp5
}
507
; SABAL2: same accumulate pattern but with high-half operands extracted by
; shufflevector — must select the second-half sabal2 forms.
define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: sabal2_8h:
;CHECK: sabal2.8h
        %load1 = load <16 x i8>, <16 x i8>* %A
        %load2 = load <16 x i8>, <16 x i8>* %B
        %tmp3 = load <8 x i16>, <8 x i16>* %C
        %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
        %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
        %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
        %tmp5 = add <8 x i16> %tmp3, %tmp4.1
        ret <8 x i16> %tmp5
}

define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sabal2_4s:
;CHECK: sabal2.4s
        %load1 = load <8 x i16>, <8 x i16>* %A
        %load2 = load <8 x i16>, <8 x i16>* %B
        %tmp3 = load <4 x i32>, <4 x i32>* %C
        %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
        %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
        %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
        %tmp5 = add <4 x i32> %tmp3, %tmp4.1
        ret <4 x i32> %tmp5
}

define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sabal2_2d:
;CHECK: sabal2.2d
        %load1 = load <4 x i32>, <4 x i32>* %A
        %load2 = load <4 x i32>, <4 x i32>* %B
        %tmp3 = load <2 x i64>, <2 x i64>* %C
        %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
        %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
        %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
        %tmp5 = add <2 x i64> %tmp3, %tmp4.1
        ret <2 x i64> %tmp5
}
549
; UABAL / UABAL2: unsigned counterparts of the accumulate tests above —
; uabd + zext + add fuses to uabal (low halves) or uabal2 (high halves).
define <8 x i16> @uabal8h(<8 x i8>* %A, <8 x i8>* %B,  <8 x i16>* %C) nounwind {
;CHECK-LABEL: uabal8h:
;CHECK: uabal.8h
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp2 = load <8 x i8>, <8 x i8>* %B
        %tmp3 = load <8 x i16>, <8 x i16>* %C
        %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
        %tmp5 = add <8 x i16> %tmp3, %tmp4.1
        ret <8 x i16> %tmp5
}

define <4 x i32> @uabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uabal4s:
;CHECK: uabal.4s
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp2 = load <4 x i16>, <4 x i16>* %B
        %tmp3 = load <4 x i32>, <4 x i32>* %C
        %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
        %tmp5 = add <4 x i32> %tmp3, %tmp4.1
        ret <4 x i32> %tmp5
}

define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: uabal2d:
;CHECK: uabal.2d
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp2 = load <2 x i32>, <2 x i32>* %B
        %tmp3 = load <2 x i64>, <2 x i64>* %C
        %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
        %tmp5 = add <2 x i64> %tmp3, %tmp4.1
        ret <2 x i64> %tmp5
}

define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uabal2_8h:
;CHECK: uabal2.8h
        %load1 = load <16 x i8>, <16 x i8>* %A
        %load2 = load <16 x i8>, <16 x i8>* %B
        %tmp3 = load <8 x i16>, <8 x i16>* %C
        %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
        %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
        %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
        %tmp5 = add <8 x i16> %tmp3, %tmp4.1
        ret <8 x i16> %tmp5
}

define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uabal2_4s:
;CHECK: uabal2.4s
        %load1 = load <8 x i16>, <8 x i16>* %A
        %load2 = load <8 x i16>, <8 x i16>* %B
        %tmp3 = load <4 x i32>, <4 x i32>* %C
        %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
        %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
        %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
        %tmp5 = add <4 x i32> %tmp3, %tmp4.1
        ret <4 x i32> %tmp5
}

define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: uabal2_2d:
;CHECK: uabal2.2d
        %load1 = load <4 x i32>, <4 x i32>* %A
        %load2 = load <4 x i32>, <4 x i32>* %B
        %tmp3 = load <2 x i64>, <2 x i64>* %C
        %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
        %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
        %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
        %tmp5 = add <2 x i64> %tmp3, %tmp4.1
        ret <2 x i64> %tmp5
}
627
; SABA: non-widening sabd + add of a same-width accumulator must fuse into
; the saba instruction for each arrangement.
define <8 x i8> @saba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK-LABEL: saba_8b:
;CHECK: saba.8b
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp2 = load <8 x i8>, <8 x i8>* %B
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        %tmp4 = load <8 x i8>, <8 x i8>* %C
        %tmp5 = add <8 x i8> %tmp3, %tmp4
        ret <8 x i8> %tmp5
}

define <16 x i8> @saba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
;CHECK-LABEL: saba_16b:
;CHECK: saba.16b
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp2 = load <16 x i8>, <16 x i8>* %B
        %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
        %tmp4 = load <16 x i8>, <16 x i8>* %C
        %tmp5 = add <16 x i8> %tmp3, %tmp4
        ret <16 x i8> %tmp5
}

define <4 x i16> @saba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK-LABEL: saba_4h:
;CHECK: saba.4h
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp2 = load <4 x i16>, <4 x i16>* %B
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        %tmp4 = load <4 x i16>, <4 x i16>* %C
        %tmp5 = add <4 x i16> %tmp3, %tmp4
        ret <4 x i16> %tmp5
}

define <8 x i16> @saba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: saba_8h:
;CHECK: saba.8h
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp2 = load <8 x i16>, <8 x i16>* %B
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
        %tmp4 = load <8 x i16>, <8 x i16>* %C
        %tmp5 = add <8 x i16> %tmp3, %tmp4
        ret <8 x i16> %tmp5
}

define <2 x i32> @saba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK-LABEL: saba_2s:
;CHECK: saba.2s
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp2 = load <2 x i32>, <2 x i32>* %B
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        %tmp4 = load <2 x i32>, <2 x i32>* %C
        %tmp5 = add <2 x i32> %tmp3, %tmp4
        ret <2 x i32> %tmp5
}

define <4 x i32> @saba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: saba_4s:
;CHECK: saba.4s
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp2 = load <4 x i32>, <4 x i32>* %B
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
        %tmp4 = load <4 x i32>, <4 x i32>* %C
        %tmp5 = add <4 x i32> %tmp3, %tmp4
        ret <4 x i32> %tmp5
}
693
694define <8 x i8> @uaba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
695;CHECK-LABEL: uaba_8b:
696;CHECK: uaba.8b
697        %tmp1 = load <8 x i8>, <8 x i8>* %A
698        %tmp2 = load <8 x i8>, <8 x i8>* %B
699        %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
700        %tmp4 = load <8 x i8>, <8 x i8>* %C
701        %tmp5 = add <8 x i8> %tmp3, %tmp4
702        ret <8 x i8> %tmp5
703}
704
705define <16 x i8> @uaba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
706;CHECK-LABEL: uaba_16b:
707;CHECK: uaba.16b
708        %tmp1 = load <16 x i8>, <16 x i8>* %A
709        %tmp2 = load <16 x i8>, <16 x i8>* %B
710        %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
711        %tmp4 = load <16 x i8>, <16 x i8>* %C
712        %tmp5 = add <16 x i8> %tmp3, %tmp4
713        ret <16 x i8> %tmp5
714}
715
716define <4 x i16> @uaba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
717;CHECK-LABEL: uaba_4h:
718;CHECK: uaba.4h
719        %tmp1 = load <4 x i16>, <4 x i16>* %A
720        %tmp2 = load <4 x i16>, <4 x i16>* %B
721        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
722        %tmp4 = load <4 x i16>, <4 x i16>* %C
723        %tmp5 = add <4 x i16> %tmp3, %tmp4
724        ret <4 x i16> %tmp5
725}
726
727define <8 x i16> @uaba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
728;CHECK-LABEL: uaba_8h:
729;CHECK: uaba.8h
730        %tmp1 = load <8 x i16>, <8 x i16>* %A
731        %tmp2 = load <8 x i16>, <8 x i16>* %B
732        %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
733        %tmp4 = load <8 x i16>, <8 x i16>* %C
734        %tmp5 = add <8 x i16> %tmp3, %tmp4
735        ret <8 x i16> %tmp5
736}
737
; Unsigned absolute difference and accumulate, 2 x i32:
; uabd intrinsic + add should select a single UABA.2S.
define <2 x i32> @uaba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK-LABEL: uaba_2s:
;CHECK: uaba.2s
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp2 = load <2 x i32>, <2 x i32>* %B
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        %tmp4 = load <2 x i32>, <2 x i32>* %C
        ; abd + accumulate — this add is what folds the uabd into uaba.
        %tmp5 = add <2 x i32> %tmp3, %tmp4
        ret <2 x i32> %tmp5
}
748
; Unsigned absolute difference and accumulate, 4 x i32 (128-bit form):
; uabd intrinsic + add should select a single UABA.4S.
define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uaba_4s:
;CHECK: uaba.4s
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp2 = load <4 x i32>, <4 x i32>* %B
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
        %tmp4 = load <4 x i32>, <4 x i32>* %C
        ; abd + accumulate — this add is what folds the uabd into uaba.
        %tmp5 = add <4 x i32> %tmp3, %tmp4
        ret <4 x i32> %tmp5
}
759
760; Scalar FABD
; The scalar fabd intrinsic on f32 should select FABD on S registers.
define float @fabds(float %a, float %b) nounwind {
; CHECK-LABEL: fabds:
; CHECK: fabd s0, s0, s1
  %vabd.i = tail call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) nounwind
  ret float %vabd.i
}
767
; The scalar fabd intrinsic on f64 should select FABD on D registers.
define double @fabdd(double %a, double %b) nounwind {
; CHECK-LABEL: fabdd:
; CHECK: fabd d0, d0, d1
  %vabd.i = tail call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) nounwind
  ret double %vabd.i
}
774
775declare double @llvm.aarch64.sisd.fabd.f64(double, double) nounwind readnone
776declare float @llvm.aarch64.sisd.fabd.f32(float, float) nounwind readnone
777
; uabd of the high half of a vector against a dup'd scalar, widened by zext,
; should select UABDL2.2D directly (reading the high lanes in place) rather
; than first extracting the high half with an EXT.16B.
define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uabdl_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: uabdl2.2d
  ; Splat %rhs into both lanes of a <2 x i32>.
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  ; Take the high half (lanes 2,3) of the 128-bit input.
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  ; The zext is what widens uabd into the long-form uabdl2.
  %res1 = zext <2 x i32> %res to <2 x i64>
  ret <2 x i64> %res1
}
791
; Signed counterpart of uabdl_from_extract_dup: sabd of the high half against
; a dup'd scalar, widened by zext, should select SABDL2.2D without an EXT.16B.
define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: sabdl_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: sabdl2.2d
  ; Splat %rhs into both lanes of a <2 x i32>.
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  ; Take the high half (lanes 2,3) of the 128-bit input.
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  ; The zext is what widens sabd into the long-form sabdl2.
  %res1 = zext <2 x i32> %res to <2 x i64>
  ret <2 x i64> %res1
}
805
; Integer abs idiom: select(a >= 0, a, -a) on <2 x i32> should become a
; single ABS.2S with no extra instructions before the return.
define <2 x i32> @abspattern1(<2 x i32> %a) nounwind {
; CHECK-LABEL: abspattern1:
; CHECK: abs.2s
; CHECK-NEXT: ret
        %tmp1neg = sub <2 x i32> zeroinitializer, %a
        %b = icmp sge <2 x i32> %a, zeroinitializer
        %abs = select <2 x i1> %b, <2 x i32> %a, <2 x i32> %tmp1neg
        ret <2 x i32> %abs
}
815
; Integer abs idiom using sgt: select(a > 0, a, -a) on <4 x i16> should
; become a single ABS.4H.
define <4 x i16> @abspattern2(<4 x i16> %a) nounwind {
; CHECK-LABEL: abspattern2:
; CHECK: abs.4h
; CHECK-NEXT: ret
        %tmp1neg = sub <4 x i16> zeroinitializer, %a
        %b = icmp sgt <4 x i16> %a, zeroinitializer
        %abs = select <4 x i1> %b, <4 x i16> %a, <4 x i16> %tmp1neg
        ret <4 x i16> %abs
}
825
; Integer abs idiom with inverted arms: select(a < 0, -a, a) on <8 x i8>
; should become a single ABS.8B.
define <8 x i8> @abspattern3(<8 x i8> %a) nounwind {
; CHECK-LABEL: abspattern3:
; CHECK: abs.8b
; CHECK-NEXT: ret
        %tmp1neg = sub <8 x i8> zeroinitializer, %a
        %b = icmp slt <8 x i8> %a, zeroinitializer
        %abs = select <8 x i1> %b, <8 x i8> %tmp1neg, <8 x i8> %a
        ret <8 x i8> %abs
}
835
; Integer abs idiom (sge form) on the 128-bit <4 x i32> type: should become
; a single ABS.4S.
define <4 x i32> @abspattern4(<4 x i32> %a) nounwind {
; CHECK-LABEL: abspattern4:
; CHECK: abs.4s
; CHECK-NEXT: ret
        %tmp1neg = sub <4 x i32> zeroinitializer, %a
        %b = icmp sge <4 x i32> %a, zeroinitializer
        %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
        ret <4 x i32> %abs
}
845
; Integer abs idiom (sgt form) on the 128-bit <8 x i16> type: should become
; a single ABS.8H.
define <8 x i16> @abspattern5(<8 x i16> %a) nounwind {
; CHECK-LABEL: abspattern5:
; CHECK: abs.8h
; CHECK-NEXT: ret
        %tmp1neg = sub <8 x i16> zeroinitializer, %a
        %b = icmp sgt <8 x i16> %a, zeroinitializer
        %abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg
        ret <8 x i16> %abs
}
855
; Integer abs idiom (slt form, inverted arms) on the 128-bit <16 x i8> type:
; should become a single ABS.16B.
define <16 x i8> @abspattern6(<16 x i8> %a) nounwind {
; CHECK-LABEL: abspattern6:
; CHECK: abs.16b
; CHECK-NEXT: ret
        %tmp1neg = sub <16 x i8> zeroinitializer, %a
        %b = icmp slt <16 x i8> %a, zeroinitializer
        %abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a
        ret <16 x i8> %abs
}
865
; Integer abs idiom (sle form, inverted arms) on <2 x i64>: select(a <= 0,
; -a, a) should become a single ABS.2D.
define <2 x i64> @abspattern7(<2 x i64> %a) nounwind {
; CHECK-LABEL: abspattern7:
; CHECK: abs.2d
; CHECK-NEXT: ret
        %tmp1neg = sub <2 x i64> zeroinitializer, %a
        %b = icmp sle <2 x i64> %a, zeroinitializer
        %abs = select <2 x i1> %b, <2 x i64> %tmp1neg, <2 x i64> %a
        ret <2 x i64> %abs
}
875