1; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck -check-prefixes=CHECK,DAG %s
2; RUN: llc < %s -global-isel -global-isel-abort=2 -pass-remarks-missed=gisel* -mtriple=arm64-eabi -aarch64-neon-syntax=apple 2>&1 | FileCheck %s --check-prefixes=FALLBACK,CHECK,GISEL
3
4; FALLBACK-NOT: remark:{{.*}} G_ZEXT
5; FALLBACK-NOT: remark:{{.*}} sabdl8h
define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sabdl8h:
;CHECK: sabdl.8h
  %a = load <8 x i8>, <8 x i8>* %A
  %b = load <8 x i8>, <8 x i8>* %B
  %abd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b)
  %wide = zext <8 x i8> %abd to <8 x i16>
  ret <8 x i16> %wide
}

; FALLBACK-NOT: remark:{{.*}} sabdl4s
; sabd + zext of the result should select the widening sabdl form.
define <4 x i32> @sabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sabdl4s:
;CHECK: sabdl.4s
  %a = load <4 x i16>, <4 x i16>* %A
  %b = load <4 x i16>, <4 x i16>* %B
  %abd = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b)
  %wide = zext <4 x i16> %abd to <4 x i32>
  ret <4 x i32> %wide
}

; FALLBACK-NOT: remark:{{.*}} sabdl2d
define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sabdl2d:
;CHECK: sabdl.2d
  %a = load <2 x i32>, <2 x i32>* %A
  %b = load <2 x i32>, <2 x i32>* %B
  %abd = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %a, <2 x i32> %b)
  %wide = zext <2 x i32> %abd to <2 x i64>
  ret <2 x i64> %wide
}
37
; High-half variants: sabd on the extracted top lanes should select sabdl2.
define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: sabdl2_8h:
;CHECK: sabdl.8h
  %a = load <16 x i8>, <16 x i8>* %A
  %b = load <16 x i8>, <16 x i8>* %B
  %ahi = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %bhi = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %abd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %ahi, <8 x i8> %bhi)
  %wide = zext <8 x i8> %abd to <8 x i16>
  ret <8 x i16> %wide
}

define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sabdl2_4s:
;CHECK: sabdl.4s
  %a = load <8 x i16>, <8 x i16>* %A
  %b = load <8 x i16>, <8 x i16>* %B
  %ahi = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %bhi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %abd = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %ahi, <4 x i16> %bhi)
  %wide = zext <4 x i16> %abd to <4 x i32>
  ret <4 x i32> %wide
}

define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sabdl2_2d:
;CHECK: sabdl.2d
  %a = load <4 x i32>, <4 x i32>* %A
  %b = load <4 x i32>, <4 x i32>* %B
  %ahi = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %bhi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %abd = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %ahi, <2 x i32> %bhi)
  %wide = zext <2 x i32> %abd to <2 x i64>
  ret <2 x i64> %wide
}
73
; NOTE: the patterns below previously had a stray trailing ')' (e.g. "uabdl8h)"),
; which did not match the remark text; fixed to mirror the sabdl checks above.
; FALLBACK-NOT: remark:{{.*}} uabdl8h
define <8 x i16> @uabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uabdl8h:
;CHECK: uabdl.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

; FALLBACK-NOT: remark:{{.*}} uabdl4s
define <4 x i32> @uabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uabdl4s:
;CHECK: uabdl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

; FALLBACK-NOT: remark:{{.*}} uabdl2d
define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uabdl2d:
;CHECK: uabdl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}
106
; High-half variants of the unsigned widening absolute difference.
define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uabdl2_8h:
;CHECK: uabdl.8h
  %a = load <16 x i8>, <16 x i8>* %A
  %b = load <16 x i8>, <16 x i8>* %B
  %ahi = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %bhi = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %abd = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %ahi, <8 x i8> %bhi)
  %wide = zext <8 x i8> %abd to <8 x i16>
  ret <8 x i16> %wide
}

define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uabdl2_4s:
;CHECK: uabdl.4s
  %a = load <8 x i16>, <8 x i16>* %A
  %b = load <8 x i16>, <8 x i16>* %B
  %ahi = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %bhi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %abd = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %ahi, <4 x i16> %bhi)
  %wide = zext <4 x i16> %abd to <4 x i32>
  ret <4 x i32> %wide
}

define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uabdl2_2d:
;CHECK: uabdl.2d
  %a = load <4 x i32>, <4 x i32>* %A
  %b = load <4 x i32>, <4 x i32>* %B
  %ahi = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %bhi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %abd = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %ahi, <2 x i32> %bhi)
  %wide = zext <2 x i32> %abd to <2 x i64>
  ret <2 x i64> %wide
}
143
144declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
145declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
146
; abs-difference feeding an add reduction should select uabd/sabd.
; Labels are anchored with a trailing ':' so e.g. "uabd16b_rdx" cannot
; accidentally match the "uabd16b_rdx_i32:" label as a substring.
define i16 @uabd16b_rdx(<16 x i8>* %a, <16 x i8>* %b) {
; CHECK-LABEL: uabd16b_rdx:
; CHECK: uabd.16b
  %aload = load <16 x i8>, <16 x i8>* %a, align 1
  %bload = load <16 x i8>, <16 x i8>* %b, align 1
  %aext = zext <16 x i8> %aload to <16 x i16>
  %bext = zext <16 x i8> %bload to <16 x i16>
  %abdiff = sub nsw <16 x i16> %aext, %bext
  %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer
  %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff
  %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff
  %reduced_v = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %absel)
  ret i16 %reduced_v
}

define i32 @uabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: uabd16b_rdx_i32:
; CHECK: uabd.16b
  %aext = zext <16 x i8> %a to <16 x i32>
  %bext = zext <16 x i8> %b to <16 x i32>
  %abdiff = sub nsw <16 x i32> %aext, %bext
  %abcmp = icmp slt <16 x i32> %abdiff, zeroinitializer
  %ababs = sub nsw <16 x i32> zeroinitializer, %abdiff
  %absel = select <16 x i1> %abcmp, <16 x i32> %ababs, <16 x i32> %abdiff
  %reduced_v = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %absel)
  ret i32 %reduced_v
}

define i32 @sabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: sabd16b_rdx_i32:
; CHECK: sabd.16b
  %aext = sext <16 x i8> %a to <16 x i32>
  %bext = sext <16 x i8> %b to <16 x i32>
  %abdiff = sub nsw <16 x i32> %aext, %bext
  %abcmp = icmp slt <16 x i32> %abdiff, zeroinitializer
  %ababs = sub nsw <16 x i32> zeroinitializer, %abdiff
  %absel = select <16 x i1> %abcmp, <16 x i32> %ababs, <16 x i32> %abdiff
  %reduced_v = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %absel)
  ret i32 %reduced_v
}
187
188
189declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
190declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
191
; Labels anchored with ':' to match the convention used elsewhere in this file.
define i32 @uabd8h_rdx(<8 x i16>* %a, <8 x i16>* %b) {
; CHECK-LABEL: uabd8h_rdx:
; CHECK: uabd.8h
  %aload = load <8 x i16>, <8 x i16>* %a, align 1
  %bload = load <8 x i16>, <8 x i16>* %b, align 1
  %aext = zext <8 x i16> %aload to <8 x i32>
  %bext = zext <8 x i16> %bload to <8 x i32>
  %abdiff = sub nsw <8 x i32> %aext, %bext
  %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
  %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
  %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
  %reduced_v = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %absel)
  ret i32 %reduced_v
}

define i32 @sabd8h_rdx(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: sabd8h_rdx:
; CHECK: sabd.8h
  %aext = sext <8 x i16> %a to <8 x i32>
  %bext = sext <8 x i16> %b to <8 x i32>
  %abdiff = sub nsw <8 x i32> %aext, %bext
  %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
  %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
  %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
  %reduced_v = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %absel)
  ret i32 %reduced_v
}
219
; Widening abs-difference feeding a reduction; label anchored with ':' and the
; mis-indented %abdiff line realigned with the rest of the body.
define i32 @uabdl4s_rdx_i32(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: uabdl4s_rdx_i32:
; DAG: uabdl.4s

; GISel doesn't match this pattern yet.
; GISEL: addv.4s
  %aext = zext <4 x i16> %a to <4 x i32>
  %bext = zext <4 x i16> %b to <4 x i32>
  %abdiff = sub nsw <4 x i32> %aext, %bext
  %abcmp = icmp slt <4 x i32> %abdiff, zeroinitializer
  %ababs = sub nsw <4 x i32> zeroinitializer, %abdiff
  %absel = select <4 x i1> %abcmp, <4 x i32> %ababs, <4 x i32> %abdiff
  %reduced_v = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %absel)
  ret i32 %reduced_v
}
235
236declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
237declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
238
; These two previously used plain "; CHECK: <name>" for the function name, so
; FileCheck never segmented the output at these functions; use CHECK-LABEL
; with an anchoring ':' like the rest of the file.
define i64 @uabd4s_rdx(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
; CHECK-LABEL: uabd4s_rdx:
; CHECK: uabd.4s
  %aload = load <4 x i32>, <4 x i32>* %a, align 1
  %bload = load <4 x i32>, <4 x i32>* %b, align 1
  %aext = zext <4 x i32> %aload to <4 x i64>
  %bext = zext <4 x i32> %bload to <4 x i64>
  %abdiff = sub nsw <4 x i64> %aext, %bext
  %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
  %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
  %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
  %reduced_v = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %absel)
  ret i64 %reduced_v
}

define i64 @sabd4s_rdx(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: sabd4s_rdx:
; CHECK: sabd.4s
  %aext = sext <4 x i32> %a to <4 x i64>
  %bext = sext <4 x i32> %b to <4 x i64>
  %abdiff = sub nsw <4 x i64> %aext, %bext
  %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
  %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
  %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
  %reduced_v = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %absel)
  ret i64 %reduced_v
}
266
; Label anchored with ':' to match the file's CHECK-LABEL convention.
define i64 @uabdl2d_rdx_i64(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: uabdl2d_rdx_i64:
; DAG: uabdl.2d

; GISel doesn't match this pattern yet
; GISEL: addp.2d
  %aext = zext <2 x i32> %a to <2 x i64>
  %bext = zext <2 x i32> %b to <2 x i64>
  %abdiff = sub nsw <2 x i64> %aext, %bext
  %abcmp = icmp slt <2 x i64> %abdiff, zeroinitializer
  %ababs = sub nsw <2 x i64> zeroinitializer, %abdiff
  %absel = select <2 x i1> %abcmp, <2 x i64> %ababs, <2 x i64> %abdiff
  %reduced_v = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %absel)
  ret i64 %reduced_v
}
282
; Direct fabd intrinsic selection.
define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fabd_2s:
;CHECK: fabd.2s
  %a = load <2 x float>, <2 x float>* %A
  %b = load <2 x float>, <2 x float>* %B
  %r = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %a, <2 x float> %b)
  ret <2 x float> %r
}

define <4 x float> @fabd_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: fabd_4s:
;CHECK: fabd.4s
  %a = load <4 x float>, <4 x float>* %A
  %b = load <4 x float>, <4 x float>* %B
  %r = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %a, <4 x float> %b)
  ret <4 x float> %r
}

define <2 x double> @fabd_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: fabd_2d:
;CHECK: fabd.2d
  %a = load <2 x double>, <2 x double>* %A
  %b = load <2 x double>, <2 x double>* %B
  %r = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %a, <2 x double> %b)
  ret <2 x double> %r
}
309
310declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
311declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
312declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone
313
; fabs(fsub(a, b)) should also fold to a single fabd.
define <2 x float> @fabd_2s_from_fsub_fabs(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fabd_2s_from_fsub_fabs:
;CHECK: fabd.2s
  %a = load <2 x float>, <2 x float>* %A
  %b = load <2 x float>, <2 x float>* %B
  %diff = fsub <2 x float> %a, %b
  %mag = call <2 x float> @llvm.fabs.v2f32(<2 x float> %diff)
  ret <2 x float> %mag
}

define <4 x float> @fabd_4s_from_fsub_fabs(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: fabd_4s_from_fsub_fabs:
;CHECK: fabd.4s
  %a = load <4 x float>, <4 x float>* %A
  %b = load <4 x float>, <4 x float>* %B
  %diff = fsub <4 x float> %a, %b
  %mag = call <4 x float> @llvm.fabs.v4f32(<4 x float> %diff)
  ret <4 x float> %mag
}

define <2 x double> @fabd_2d_from_fsub_fabs(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: fabd_2d_from_fsub_fabs:
;CHECK: fabd.2d
  %a = load <2 x double>, <2 x double>* %A
  %b = load <2 x double>, <2 x double>* %B
  %diff = fsub <2 x double> %a, %b
  %mag = call <2 x double> @llvm.fabs.v2f64(<2 x double> %diff)
  ret <2 x double> %mag
}
343
344declare <2 x float> @llvm.fabs.v2f32(<2 x float>) nounwind readnone
345declare <4 x float> @llvm.fabs.v4f32(<4 x float>) nounwind readnone
346declare <2 x double> @llvm.fabs.v2f64(<2 x double>) nounwind readnone
347
; Plain (non-widening) signed absolute-difference forms.
define <8 x i8> @sabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sabd_8b:
;CHECK: sabd.8b
  %a = load <8 x i8>, <8 x i8>* %A
  %b = load <8 x i8>, <8 x i8>* %B
  %r = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b)
  ret <8 x i8> %r
}

define <16 x i8> @sabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: sabd_16b:
;CHECK: sabd.16b
  %a = load <16 x i8>, <16 x i8>* %A
  %b = load <16 x i8>, <16 x i8>* %B
  %r = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %a, <16 x i8> %b)
  ret <16 x i8> %r
}

define <4 x i16> @sabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sabd_4h:
;CHECK: sabd.4h
  %a = load <4 x i16>, <4 x i16>* %A
  %b = load <4 x i16>, <4 x i16>* %B
  %r = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b)
  ret <4 x i16> %r
}

define <8 x i16> @sabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sabd_8h:
;CHECK: sabd.8h
  %a = load <8 x i16>, <8 x i16>* %A
  %b = load <8 x i16>, <8 x i16>* %B
  %r = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %a, <8 x i16> %b)
  ret <8 x i16> %r
}

define <2 x i32> @sabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sabd_2s:
;CHECK: sabd.2s
  %a = load <2 x i32>, <2 x i32>* %A
  %b = load <2 x i32>, <2 x i32>* %B
  %r = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %a, <2 x i32> %b)
  ret <2 x i32> %r
}

define <4 x i32> @sabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sabd_4s:
;CHECK: sabd.4s
  %a = load <4 x i32>, <4 x i32>* %A
  %b = load <4 x i32>, <4 x i32>* %B
  %r = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %a, <4 x i32> %b)
  ret <4 x i32> %r
}
401
402declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
403declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
404declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
405declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
406declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
407declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
408
; Plain (non-widening) unsigned absolute-difference forms.
define <8 x i8> @uabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uabd_8b:
;CHECK: uabd.8b
  %a = load <8 x i8>, <8 x i8>* %A
  %b = load <8 x i8>, <8 x i8>* %B
  %r = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b)
  ret <8 x i8> %r
}

define <16 x i8> @uabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uabd_16b:
;CHECK: uabd.16b
  %a = load <16 x i8>, <16 x i8>* %A
  %b = load <16 x i8>, <16 x i8>* %B
  %r = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %a, <16 x i8> %b)
  ret <16 x i8> %r
}

define <4 x i16> @uabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uabd_4h:
;CHECK: uabd.4h
  %a = load <4 x i16>, <4 x i16>* %A
  %b = load <4 x i16>, <4 x i16>* %B
  %r = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %a, <4 x i16> %b)
  ret <4 x i16> %r
}

define <8 x i16> @uabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uabd_8h:
;CHECK: uabd.8h
  %a = load <8 x i16>, <8 x i16>* %A
  %b = load <8 x i16>, <8 x i16>* %B
  %r = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %a, <8 x i16> %b)
  ret <8 x i16> %r
}

define <2 x i32> @uabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uabd_2s:
;CHECK: uabd.2s
  %a = load <2 x i32>, <2 x i32>* %A
  %b = load <2 x i32>, <2 x i32>* %B
  %r = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %a, <2 x i32> %b)
  ret <2 x i32> %r
}

define <4 x i32> @uabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uabd_4s:
;CHECK: uabd.4s
  %a = load <4 x i32>, <4 x i32>* %A
  %b = load <4 x i32>, <4 x i32>* %B
  %r = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %a, <4 x i32> %b)
  ret <4 x i32> %r
}
462
463declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
464declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
465declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
466declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
467declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
468declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
469
; Saturating absolute value.
define <8 x i8> @sqabs_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqabs_8b:
;CHECK: sqabs.8b
  %x = load <8 x i8>, <8 x i8>* %A
  %r = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %x)
  ret <8 x i8> %r
}

define <16 x i8> @sqabs_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqabs_16b:
;CHECK: sqabs.16b
  %x = load <16 x i8>, <16 x i8>* %A
  %r = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %x)
  ret <16 x i8> %r
}

define <4 x i16> @sqabs_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqabs_4h:
;CHECK: sqabs.4h
  %x = load <4 x i16>, <4 x i16>* %A
  %r = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %x)
  ret <4 x i16> %r
}

define <8 x i16> @sqabs_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqabs_8h:
;CHECK: sqabs.8h
  %x = load <8 x i16>, <8 x i16>* %A
  %r = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %x)
  ret <8 x i16> %r
}

define <2 x i32> @sqabs_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqabs_2s:
;CHECK: sqabs.2s
  %x = load <2 x i32>, <2 x i32>* %A
  %r = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %x)
  ret <2 x i32> %r
}

define <4 x i32> @sqabs_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqabs_4s:
;CHECK: sqabs.4s
  %x = load <4 x i32>, <4 x i32>* %A
  %r = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %x)
  ret <4 x i32> %r
}
517
518declare <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
519declare <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
520declare <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
521declare <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
522declare <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
523declare <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone
524
; Saturating negate.
define <8 x i8> @sqneg_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqneg_8b:
;CHECK: sqneg.8b
  %x = load <8 x i8>, <8 x i8>* %A
  %r = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %x)
  ret <8 x i8> %r
}

define <16 x i8> @sqneg_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqneg_16b:
;CHECK: sqneg.16b
  %x = load <16 x i8>, <16 x i8>* %A
  %r = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %x)
  ret <16 x i8> %r
}

define <4 x i16> @sqneg_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqneg_4h:
;CHECK: sqneg.4h
  %x = load <4 x i16>, <4 x i16>* %A
  %r = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %x)
  ret <4 x i16> %r
}

define <8 x i16> @sqneg_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqneg_8h:
;CHECK: sqneg.8h
  %x = load <8 x i16>, <8 x i16>* %A
  %r = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %x)
  ret <8 x i16> %r
}

define <2 x i32> @sqneg_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqneg_2s:
;CHECK: sqneg.2s
  %x = load <2 x i32>, <2 x i32>* %A
  %r = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %x)
  ret <2 x i32> %r
}

define <4 x i32> @sqneg_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqneg_4s:
;CHECK: sqneg.4s
  %x = load <4 x i32>, <4 x i32>* %A
  %r = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %x)
  ret <4 x i32> %r
}
572
573declare <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
574declare <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
575declare <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
576declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
577declare <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
578declare <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone
579
; Integer absolute value via the target intrinsic.
define <8 x i8> @abs_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: abs_8b:
;CHECK: abs.8b
  %x = load <8 x i8>, <8 x i8>* %A
  %r = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %x)
  ret <8 x i8> %r
}

define <16 x i8> @abs_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: abs_16b:
;CHECK: abs.16b
  %x = load <16 x i8>, <16 x i8>* %A
  %r = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %x)
  ret <16 x i8> %r
}

define <4 x i16> @abs_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: abs_4h:
;CHECK: abs.4h
  %x = load <4 x i16>, <4 x i16>* %A
  %r = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %x)
  ret <4 x i16> %r
}

define <8 x i16> @abs_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: abs_8h:
;CHECK: abs.8h
  %x = load <8 x i16>, <8 x i16>* %A
  %r = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %x)
  ret <8 x i16> %r
}

define <2 x i32> @abs_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: abs_2s:
;CHECK: abs.2s
  %x = load <2 x i32>, <2 x i32>* %A
  %r = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %x)
  ret <2 x i32> %r
}

define <4 x i32> @abs_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: abs_4s:
;CHECK: abs.4s
  %x = load <4 x i32>, <4 x i32>* %A
  %r = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %x)
  ret <4 x i32> %r
}
627
; Scalar (d-register) abs, both as <1 x i64> and as a plain i64.
define <1 x i64> @abs_1d(<1 x i64> %A) nounwind {
; CHECK-LABEL: abs_1d:
; CHECK: abs d0, d0
  %r = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %A)
  ret <1 x i64> %r
}

define i64 @abs_1d_honestly(i64 %A) nounwind {
; CHECK-LABEL: abs_1d_honestly:
; CHECK: abs d0, d0
  %r = call i64 @llvm.aarch64.neon.abs.i64(i64 %A)
  ret i64 %r
}
641
642declare <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8>) nounwind readnone
643declare <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8>) nounwind readnone
644declare <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16>) nounwind readnone
645declare <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16>) nounwind readnone
646declare <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32>) nounwind readnone
647declare <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32>) nounwind readnone
648declare <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64>) nounwind readnone
649declare i64 @llvm.aarch64.neon.abs.i64(i64) nounwind readnone
650
; FALLBACK-NOT: remark:{{.*}} sabal8h
; Widening abs-difference accumulated into a third operand selects sabal.
define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B,  <8 x i16>* %C) nounwind {
;CHECK-LABEL: sabal8h:
;CHECK: sabal.8h
  %a = load <8 x i8>, <8 x i8>* %A
  %b = load <8 x i8>, <8 x i8>* %B
  %acc = load <8 x i16>, <8 x i16>* %C
  %abd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b)
  %wide = zext <8 x i8> %abd to <8 x i16>
  %sum = add <8 x i16> %acc, %wide
  ret <8 x i16> %sum
}

; FALLBACK-NOT: remark:{{.*}} sabal4s
define <4 x i32> @sabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sabal4s:
;CHECK: sabal.4s
  %a = load <4 x i16>, <4 x i16>* %A
  %b = load <4 x i16>, <4 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %abd = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b)
  %wide = zext <4 x i16> %abd to <4 x i32>
  %sum = add <4 x i32> %acc, %wide
  ret <4 x i32> %sum
}
676
; FALLBACK-NOT: remark:{{.*}} sabal2d
; A dead duplicate of the zext (%tmp4.1.1) was removed; it had no users and
; only cluttered the pattern being tested.
define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sabal2d:
;CHECK: sabal.2d
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp2 = load <2 x i32>, <2 x i32>* %B
        %tmp3 = load <2 x i64>, <2 x i64>* %C
        %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
        %tmp5 = add <2 x i64> %tmp3, %tmp4.1
        ret <2 x i64> %tmp5
}
690
; High-half accumulating variants (sabal2 pattern, printed as sabal.<T>).
define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: sabal2_8h:
;CHECK: sabal.8h
  %a = load <16 x i8>, <16 x i8>* %A
  %b = load <16 x i8>, <16 x i8>* %B
  %acc = load <8 x i16>, <8 x i16>* %C
  %ahi = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %bhi = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %abd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %ahi, <8 x i8> %bhi)
  %wide = zext <8 x i8> %abd to <8 x i16>
  %sum = add <8 x i16> %acc, %wide
  ret <8 x i16> %sum
}

define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sabal2_4s:
;CHECK: sabal.4s
  %a = load <8 x i16>, <8 x i16>* %A
  %b = load <8 x i16>, <8 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %ahi = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %bhi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %abd = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %ahi, <4 x i16> %bhi)
  %wide = zext <4 x i16> %abd to <4 x i32>
  %sum = add <4 x i32> %acc, %wide
  ret <4 x i32> %sum
}

define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sabal2_2d:
;CHECK: sabal.2d
  %a = load <4 x i32>, <4 x i32>* %A
  %b = load <4 x i32>, <4 x i32>* %B
  %acc = load <2 x i64>, <2 x i64>* %C
  %ahi = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %bhi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %abd = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %ahi, <2 x i32> %bhi)
  %wide = zext <2 x i32> %abd to <2 x i64>
  %sum = add <2 x i64> %acc, %wide
  ret <2 x i64> %sum
}
732
733; FALLBACK-NOT: remark:{{.*}} uabal8h
734define <8 x i16> @uabal8h(<8 x i8>* %A, <8 x i8>* %B,  <8 x i16>* %C) nounwind {
735;CHECK-LABEL: uabal8h:
736;CHECK: uabal.8h
737        %tmp1 = load <8 x i8>, <8 x i8>* %A
738        %tmp2 = load <8 x i8>, <8 x i8>* %B
739        %tmp3 = load <8 x i16>, <8 x i16>* %C
740        %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
741        %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
742        %tmp5 = add <8 x i16> %tmp3, %tmp4.1
743        ret <8 x i16> %tmp5
744}
745
746; FALLBACK-NOT: remark:{{.*}} uabal8s
; uabd.v4i16 + zext to <4 x i32> + accumulator add must select uabal.4s.
747define <4 x i32> @uabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
748;CHECK-LABEL: uabal4s:
749;CHECK: uabal.4s
750        %tmp1 = load <4 x i16>, <4 x i16>* %A
751        %tmp2 = load <4 x i16>, <4 x i16>* %B
752        %tmp3 = load <4 x i32>, <4 x i32>* %C
753        %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
754        %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
755        %tmp5 = add <4 x i32> %tmp3, %tmp4.1
756        ret <4 x i32> %tmp5
757}
758
759; FALLBACK-NOT: remark:{{.*}} uabal2d
; uabd.v2i32 + zext to <2 x i64> + accumulator add must select uabal.2d.
760define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
761;CHECK-LABEL: uabal2d:
762;CHECK: uabal.2d
763        %tmp1 = load <2 x i32>, <2 x i32>* %A
764        %tmp2 = load <2 x i32>, <2 x i32>* %B
765        %tmp3 = load <2 x i64>, <2 x i64>* %C
766        %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
767        %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
768        %tmp5 = add <2 x i64> %tmp3, %tmp4.1
769        ret <2 x i64> %tmp5
770}
771
; High-half variant: uabd of the top <8 x i8> lanes, widened and accumulated,
; must still select an accumulating uabal.8h.
772define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
773;CHECK-LABEL: uabal2_8h:
774;CHECK: uabal.8h
775        %load1 = load <16 x i8>, <16 x i8>* %A
776        %load2 = load <16 x i8>, <16 x i8>* %B
777        %tmp3 = load <8 x i16>, <8 x i16>* %C
778        %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
779        %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
780        %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
781        %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
782        %tmp5 = add <8 x i16> %tmp3, %tmp4.1
783        ret <8 x i16> %tmp5
784}
785
; High-half variant: uabd of the top <4 x i16> lanes, widened and accumulated,
; must still select an accumulating uabal.4s.
786define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
787;CHECK-LABEL: uabal2_4s:
788;CHECK: uabal.4s
789        %load1 = load <8 x i16>, <8 x i16>* %A
790        %load2 = load <8 x i16>, <8 x i16>* %B
791        %tmp3 = load <4 x i32>, <4 x i32>* %C
792        %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
793        %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
794        %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
795        %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
796        %tmp5 = add <4 x i32> %tmp3, %tmp4.1
797        ret <4 x i32> %tmp5
798}
799
; High-half variant: uabd of the top <2 x i32> lanes, widened and accumulated,
; must still select an accumulating uabal.2d.
800define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
801;CHECK-LABEL: uabal2_2d:
802;CHECK: uabal.2d
803        %load1 = load <4 x i32>, <4 x i32>* %A
804        %load2 = load <4 x i32>, <4 x i32>* %B
805        %tmp3 = load <2 x i64>, <2 x i64>* %C
806        %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
807        %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
808        %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
809        %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
810        %tmp5 = add <2 x i64> %tmp3, %tmp4.1
811        ret <2 x i64> %tmp5
812}
813
; Non-widening accumulate: sabd.v8i8 + same-width add must select saba.8b.
814define <8 x i8> @saba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
815;CHECK-LABEL: saba_8b:
816;CHECK: saba.8b
817        %tmp1 = load <8 x i8>, <8 x i8>* %A
818        %tmp2 = load <8 x i8>, <8 x i8>* %B
819        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
820        %tmp4 = load <8 x i8>, <8 x i8>* %C
821        %tmp5 = add <8 x i8> %tmp3, %tmp4
822        ret <8 x i8> %tmp5
823}
824
; Non-widening accumulate: sabd.v16i8 + same-width add must select saba.16b.
825define <16 x i8> @saba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
826;CHECK-LABEL: saba_16b:
827;CHECK: saba.16b
828        %tmp1 = load <16 x i8>, <16 x i8>* %A
829        %tmp2 = load <16 x i8>, <16 x i8>* %B
830        %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
831        %tmp4 = load <16 x i8>, <16 x i8>* %C
832        %tmp5 = add <16 x i8> %tmp3, %tmp4
833        ret <16 x i8> %tmp5
834}
835
; Non-widening accumulate: sabd.v4i16 + same-width add must select saba.4h.
836define <4 x i16> @saba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
837;CHECK-LABEL: saba_4h:
838;CHECK: saba.4h
839        %tmp1 = load <4 x i16>, <4 x i16>* %A
840        %tmp2 = load <4 x i16>, <4 x i16>* %B
841        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
842        %tmp4 = load <4 x i16>, <4 x i16>* %C
843        %tmp5 = add <4 x i16> %tmp3, %tmp4
844        ret <4 x i16> %tmp5
845}
846
; Non-widening accumulate: sabd.v8i16 + same-width add must select saba.8h.
847define <8 x i16> @saba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
848;CHECK-LABEL: saba_8h:
849;CHECK: saba.8h
850        %tmp1 = load <8 x i16>, <8 x i16>* %A
851        %tmp2 = load <8 x i16>, <8 x i16>* %B
852        %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
853        %tmp4 = load <8 x i16>, <8 x i16>* %C
854        %tmp5 = add <8 x i16> %tmp3, %tmp4
855        ret <8 x i16> %tmp5
856}
857
; Non-widening accumulate: sabd.v2i32 + same-width add must select saba.2s.
858define <2 x i32> @saba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
859;CHECK-LABEL: saba_2s:
860;CHECK: saba.2s
861        %tmp1 = load <2 x i32>, <2 x i32>* %A
862        %tmp2 = load <2 x i32>, <2 x i32>* %B
863        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
864        %tmp4 = load <2 x i32>, <2 x i32>* %C
865        %tmp5 = add <2 x i32> %tmp3, %tmp4
866        ret <2 x i32> %tmp5
867}
868
; Non-widening accumulate: sabd.v4i32 + same-width add must select saba.4s.
869define <4 x i32> @saba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
870;CHECK-LABEL: saba_4s:
871;CHECK: saba.4s
872        %tmp1 = load <4 x i32>, <4 x i32>* %A
873        %tmp2 = load <4 x i32>, <4 x i32>* %B
874        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
875        %tmp4 = load <4 x i32>, <4 x i32>* %C
876        %tmp5 = add <4 x i32> %tmp3, %tmp4
877        ret <4 x i32> %tmp5
878}
879
; Unsigned counterpart: uabd.v8i8 + same-width add must select uaba.8b.
880define <8 x i8> @uaba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
881;CHECK-LABEL: uaba_8b:
882;CHECK: uaba.8b
883        %tmp1 = load <8 x i8>, <8 x i8>* %A
884        %tmp2 = load <8 x i8>, <8 x i8>* %B
885        %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
886        %tmp4 = load <8 x i8>, <8 x i8>* %C
887        %tmp5 = add <8 x i8> %tmp3, %tmp4
888        ret <8 x i8> %tmp5
889}
890
; Unsigned counterpart: uabd.v16i8 + same-width add must select uaba.16b.
891define <16 x i8> @uaba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
892;CHECK-LABEL: uaba_16b:
893;CHECK: uaba.16b
894        %tmp1 = load <16 x i8>, <16 x i8>* %A
895        %tmp2 = load <16 x i8>, <16 x i8>* %B
896        %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
897        %tmp4 = load <16 x i8>, <16 x i8>* %C
898        %tmp5 = add <16 x i8> %tmp3, %tmp4
899        ret <16 x i8> %tmp5
900}
901
; Unsigned counterpart: uabd.v4i16 + same-width add must select uaba.4h.
902define <4 x i16> @uaba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
903;CHECK-LABEL: uaba_4h:
904;CHECK: uaba.4h
905        %tmp1 = load <4 x i16>, <4 x i16>* %A
906        %tmp2 = load <4 x i16>, <4 x i16>* %B
907        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
908        %tmp4 = load <4 x i16>, <4 x i16>* %C
909        %tmp5 = add <4 x i16> %tmp3, %tmp4
910        ret <4 x i16> %tmp5
911}
912
; Unsigned counterpart: uabd.v8i16 + same-width add must select uaba.8h.
913define <8 x i16> @uaba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
914;CHECK-LABEL: uaba_8h:
915;CHECK: uaba.8h
916        %tmp1 = load <8 x i16>, <8 x i16>* %A
917        %tmp2 = load <8 x i16>, <8 x i16>* %B
918        %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
919        %tmp4 = load <8 x i16>, <8 x i16>* %C
920        %tmp5 = add <8 x i16> %tmp3, %tmp4
921        ret <8 x i16> %tmp5
922}
923
; Unsigned counterpart: uabd.v2i32 + same-width add must select uaba.2s.
924define <2 x i32> @uaba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
925;CHECK-LABEL: uaba_2s:
926;CHECK: uaba.2s
927        %tmp1 = load <2 x i32>, <2 x i32>* %A
928        %tmp2 = load <2 x i32>, <2 x i32>* %B
929        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
930        %tmp4 = load <2 x i32>, <2 x i32>* %C
931        %tmp5 = add <2 x i32> %tmp3, %tmp4
932        ret <2 x i32> %tmp5
933}
934
; Unsigned counterpart: uabd.v4i32 + same-width add must select uaba.4s.
935define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
936;CHECK-LABEL: uaba_4s:
937;CHECK: uaba.4s
938        %tmp1 = load <4 x i32>, <4 x i32>* %A
939        %tmp2 = load <4 x i32>, <4 x i32>* %B
940        %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
941        %tmp4 = load <4 x i32>, <4 x i32>* %C
942        %tmp5 = add <4 x i32> %tmp3, %tmp4
943        ret <4 x i32> %tmp5
944}
945
946; Scalar FABD
; Scalar fabd intrinsic (f32) must select the s-register fabd form.
947define float @fabds(float %a, float %b) nounwind {
948; CHECK-LABEL: fabds:
949; CHECK: fabd s0, s0, s1
950  %vabd.i = tail call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) nounwind
951  ret float %vabd.i
952}
953
; Scalar fabd intrinsic (f64) must select the d-register fabd form.
954define double @fabdd(double %a, double %b) nounwind {
955; CHECK-LABEL: fabdd:
956; CHECK: fabd d0, d0, d1
957  %vabd.i = tail call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) nounwind
958  ret double %vabd.i
959}
960
961declare double @llvm.aarch64.sisd.fabd.f64(double, double) nounwind readnone
962declare float @llvm.aarch64.sisd.fabd.f32(float, float) nounwind readnone
963
; fabs(fsub a, b) on f32 must be combined into a single fabd instruction.
964define float @fabds_from_fsub_fabs(float %a, float %b) nounwind {
965; CHECK-LABEL: fabds_from_fsub_fabs:
966; CHECK: fabd s0, s0, s1
967  %sub = fsub float %a, %b
968  %abs = tail call float @llvm.fabs.f32(float %sub)
969  ret float %abs
970}
971
; fabs(fsub a, b) on f64 must be combined into a single fabd instruction.
972define double @fabdd_from_fsub_fabs(double %a, double %b) nounwind {
973; CHECK-LABEL: fabdd_from_fsub_fabs:
974; CHECK: fabd d0, d0, d1
975  %sub = fsub double %a, %b
976  %abs = tail call double @llvm.fabs.f64(double %sub)
977  ret double %abs
978}
979
980declare float @llvm.fabs.f32(float) nounwind readnone
981declare double @llvm.fabs.f64(double) nounwind readnone
982
; uabd of the low half against a splatted scalar, then zext: must select
; uabdl.2d directly, with no separate ext.16b to materialize the half.
983define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
984; CHECK-LABEL: uabdl_from_extract_dup:
985; CHECK-NOT: ext.16b
986; CHECK: uabdl.2d
987  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
988  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
989
990  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
991
992  %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
993  %res1 = zext <2 x i32> %res to <2 x i64>
994  ret <2 x i64> %res1
995}
996
; Same as above but extracting the high half (lanes 2,3): must select
; uabdl2.2d directly, with no separate ext.16b.
997define <2 x i64> @uabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
998; CHECK-LABEL: uabdl2_from_extract_dup:
999; CHECK-NOT: ext.16b
1000; CHECK: uabdl2.2d
1001  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
1002  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
1003
1004  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1005
1006  %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
1007  %res1 = zext <2 x i32> %res to <2 x i64>
1008  ret <2 x i64> %res1
1009}
1010
; Signed variant: sabd of the low half against a splatted scalar, then zext:
; must select sabdl.2d directly, with no separate ext.16b.
1011define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
1012; CHECK-LABEL: sabdl_from_extract_dup:
1013; CHECK-NOT: ext.16b
1014; CHECK: sabdl.2d
1015  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
1016  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
1017
1018  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
1019
1020  %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
1021  %res1 = zext <2 x i32> %res to <2 x i64>
1022  ret <2 x i64> %res1
1023}
1024
; Signed variant on the high half (lanes 2,3): must select sabdl2.2d
; directly, with no separate ext.16b.
1025define <2 x i64> @sabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
1026; CHECK-LABEL: sabdl2_from_extract_dup:
1027; CHECK-NOT: ext.16b
1028; CHECK: sabdl2.2d
1029  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
1030  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
1031
1032  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1033
1034  %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
1035  %res1 = zext <2 x i32> %res to <2 x i64>
1036  ret <2 x i64> %res1
1037}
1038
; Integer abs idiom (neg + icmp sge + select): DAG ISel matches a single
; abs.2s, while GlobalISel currently emits the expanded neg/cmge/bif form.
1039define <2 x i32> @abspattern1(<2 x i32> %a) nounwind {
1040; CHECK-LABEL: abspattern1:
1041; DAG: abs.2s
1042; DAG-NEXT: ret
1043
1044; GISEL-DAG: neg.2s
1045; GISEL-DAG: cmge.2s
1046; GISEL: bif.8b
1047        %tmp1neg = sub <2 x i32> zeroinitializer, %a
1048        %b = icmp sge <2 x i32> %a, zeroinitializer
1049        %abs = select <2 x i1> %b, <2 x i32> %a, <2 x i32> %tmp1neg
1050        ret <2 x i32> %abs
1051}
1052
; abs idiom with icmp sgt: DAG ISel still matches abs.4h; GlobalISel emits
; the expanded neg/cmgt/bif sequence.
1053define <4 x i16> @abspattern2(<4 x i16> %a) nounwind {
1054; CHECK-LABEL: abspattern2:
1055; DAG: abs.4h
1056; DAG-NEXT: ret
1057
1058; For GlobalISel, this generates terrible code until we can pattern match this to abs.
1059; GISEL-DAG: neg.4h
1060; GISEL-DAG: cmgt.4h
1061; GISEL: bif.8b
1062        %tmp1neg = sub <4 x i16> zeroinitializer, %a
1063        %b = icmp sgt <4 x i16> %a, zeroinitializer
1064        %abs = select <4 x i1> %b, <4 x i16> %a, <4 x i16> %tmp1neg
1065        ret <4 x i16> %abs
1066}
1067
; abs idiom with inverted select (icmp slt, negated value first): DAG ISel
; matches abs.8b; GlobalISel emits neg/cmgt with bit instead of bif.
1068define <8 x i8> @abspattern3(<8 x i8> %a) nounwind {
1069; CHECK-LABEL: abspattern3:
1070; DAG: abs.8b
1071; DAG-NEXT: ret
1072
1073; GISEL-DAG: neg.8b
1074; GISEL-DAG: cmgt.8b
1075; GISEL: bit.8b
1076        %tmp1neg = sub <8 x i8> zeroinitializer, %a
1077        %b = icmp slt <8 x i8> %a, zeroinitializer
1078        %abs = select <8 x i1> %b, <8 x i8> %tmp1neg, <8 x i8> %a
1079        ret <8 x i8> %abs
1080}
1081
; 128-bit abs idiom (icmp sge): DAG ISel matches abs.4s; GlobalISel emits
; cmge + bif.16b.
1082define <4 x i32> @abspattern4(<4 x i32> %a) nounwind {
1083; CHECK-LABEL: abspattern4:
1084; DAG: abs.4s
1085; DAG-NEXT: ret
1086
1087; GISEL: cmge.4s
1088; GISEL: bif.16b
1089        %tmp1neg = sub <4 x i32> zeroinitializer, %a
1090        %b = icmp sge <4 x i32> %a, zeroinitializer
1091        %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
1092        ret <4 x i32> %abs
1093}
1094
; 128-bit abs idiom (icmp sgt): DAG ISel matches abs.8h; GlobalISel emits
; cmgt/neg + bif.16b.
1095define <8 x i16> @abspattern5(<8 x i16> %a) nounwind {
1096; CHECK-LABEL: abspattern5:
1097; DAG: abs.8h
1098; DAG-NEXT: ret
1099
1100; GISEL-DAG: cmgt.8h
1101; GISEL-DAG: neg.8h
1102; GISEL: bif.16b
1103        %tmp1neg = sub <8 x i16> zeroinitializer, %a
1104        %b = icmp sgt <8 x i16> %a, zeroinitializer
1105        %abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg
1106        ret <8 x i16> %abs
1107}
1108
; 128-bit abs idiom with inverted select (icmp slt): DAG ISel matches
; abs.16b; GlobalISel emits cmgt + bit.16b.
1109define <16 x i8> @abspattern6(<16 x i8> %a) nounwind {
1110; CHECK-LABEL: abspattern6:
1111; DAG: abs.16b
1112; DAG-NEXT: ret
1113
1114; GISEL: cmgt.16b
1115; GISEL: bit.16b
1116        %tmp1neg = sub <16 x i8> zeroinitializer, %a
1117        %b = icmp slt <16 x i8> %a, zeroinitializer
1118        %abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a
1119        ret <16 x i8> %abs
1120}
1121
; 64-bit-element abs idiom (icmp sle, negated value selected): DAG ISel
; matches abs.2d; GlobalISel emits neg/cmge + bit.16b.
1122define <2 x i64> @abspattern7(<2 x i64> %a) nounwind {
1123; CHECK-LABEL: abspattern7:
1124; DAG: abs.2d
1125; DAG-NEXT: ret
1126
1127; GISEL-DAG: neg.2d
1128; GISEL-DAG: cmge.2d
1129; GISEL: bit.16b
1130        %tmp1neg = sub <2 x i64> zeroinitializer, %a
1131        %b = icmp sle <2 x i64> %a, zeroinitializer
1132        %abs = select <2 x i1> %b, <2 x i64> %tmp1neg, <2 x i64> %a
1133        ret <2 x i64> %abs
1134}
1135