1; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
2
3
; sabd intrinsic on two loaded <8 x i8> vectors, zero-extended to <8 x i16>;
; the pair is expected to select sabdl.8h.
define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: sabdl8h:
; CHECK: sabdl.8h
  %lhs = load <8 x i8>, <8 x i8>* %A
  %rhs = load <8 x i8>, <8 x i8>* %B
  %absdiff = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
  %wide = zext <8 x i8> %absdiff to <8 x i16>
  ret <8 x i16> %wide
}
13
; sabd intrinsic on two loaded <4 x i16> vectors, zero-extended to <4 x i32>;
; the pair is expected to select sabdl.4s.
define <4 x i32> @sabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: sabdl4s:
; CHECK: sabdl.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %rhs = load <4 x i16>, <4 x i16>* %B
  %absdiff = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
  %wide = zext <4 x i16> %absdiff to <4 x i32>
  ret <4 x i32> %wide
}
23
; sabd intrinsic on two loaded <2 x i32> vectors, zero-extended to <2 x i64>;
; the pair is expected to select sabdl.2d.
define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: sabdl2d:
; CHECK: sabdl.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %rhs = load <2 x i32>, <2 x i32>* %B
  %absdiff = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
  %wide = zext <2 x i32> %absdiff to <2 x i64>
  ret <2 x i64> %wide
}
33
; Takes elements 8-15 (the high half) of two loaded <16 x i8> vectors, applies
; the sabd intrinsic and zero-extends the result; expected to select sabdl2.8h.
define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: sabdl2_8h:
; CHECK: sabdl2.8h
  %vecA = load <16 x i8>, <16 x i8>* %A
  %vecB = load <16 x i8>, <16 x i8>* %B
  %hiA = shufflevector <16 x i8> %vecA, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %hiB = shufflevector <16 x i8> %vecB, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %absdiff = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %hiA, <8 x i8> %hiB)
  %wide = zext <8 x i8> %absdiff to <8 x i16>
  ret <8 x i16> %wide
}
45
; Takes elements 4-7 (the high half) of two loaded <8 x i16> vectors, applies
; the sabd intrinsic and zero-extends the result; expected to select sabdl2.4s.
define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: sabdl2_4s:
; CHECK: sabdl2.4s
  %vecA = load <8 x i16>, <8 x i16>* %A
  %vecB = load <8 x i16>, <8 x i16>* %B
  %hiA = shufflevector <8 x i16> %vecA, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %hiB = shufflevector <8 x i16> %vecB, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %absdiff = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %hiA, <4 x i16> %hiB)
  %wide = zext <4 x i16> %absdiff to <4 x i32>
  ret <4 x i32> %wide
}
57
; Takes elements 2-3 (the high half) of two loaded <4 x i32> vectors, applies
; the sabd intrinsic and zero-extends the result; expected to select sabdl2.2d.
define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: sabdl2_2d:
; CHECK: sabdl2.2d
  %vecA = load <4 x i32>, <4 x i32>* %A
  %vecB = load <4 x i32>, <4 x i32>* %B
  %hiA = shufflevector <4 x i32> %vecA, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %hiB = shufflevector <4 x i32> %vecB, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %absdiff = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %hiA, <2 x i32> %hiB)
  %wide = zext <2 x i32> %absdiff to <2 x i64>
  ret <2 x i64> %wide
}
69
; uabd intrinsic on two loaded <8 x i8> vectors, zero-extended to <8 x i16>;
; the pair is expected to select uabdl.8h.
define <8 x i16> @uabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: uabdl8h:
; CHECK: uabdl.8h
  %lhs = load <8 x i8>, <8 x i8>* %A
  %rhs = load <8 x i8>, <8 x i8>* %B
  %absdiff = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
  %wide = zext <8 x i8> %absdiff to <8 x i16>
  ret <8 x i16> %wide
}
79
; uabd intrinsic on two loaded <4 x i16> vectors, zero-extended to <4 x i32>;
; the pair is expected to select uabdl.4s.
define <4 x i32> @uabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: uabdl4s:
; CHECK: uabdl.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %rhs = load <4 x i16>, <4 x i16>* %B
  %absdiff = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
  %wide = zext <4 x i16> %absdiff to <4 x i32>
  ret <4 x i32> %wide
}
89
; uabd intrinsic on two loaded <2 x i32> vectors, zero-extended to <2 x i64>;
; the pair is expected to select uabdl.2d.
define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: uabdl2d:
; CHECK: uabdl.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %rhs = load <2 x i32>, <2 x i32>* %B
  %absdiff = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
  %wide = zext <2 x i32> %absdiff to <2 x i64>
  ret <2 x i64> %wide
}
99
; Takes elements 8-15 (the high half) of two loaded <16 x i8> vectors, applies
; the uabd intrinsic and zero-extends the result; expected to select uabdl2.8h.
define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: uabdl2_8h:
; CHECK: uabdl2.8h
  %vecA = load <16 x i8>, <16 x i8>* %A
  %vecB = load <16 x i8>, <16 x i8>* %B
  %hiA = shufflevector <16 x i8> %vecA, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %hiB = shufflevector <16 x i8> %vecB, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %absdiff = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %hiA, <8 x i8> %hiB)
  %wide = zext <8 x i8> %absdiff to <8 x i16>
  ret <8 x i16> %wide
}
112
; Takes elements 4-7 (the high half) of two loaded <8 x i16> vectors, applies
; the uabd intrinsic and zero-extends the result; expected to select uabdl2.4s.
define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: uabdl2_4s:
; CHECK: uabdl2.4s
  %vecA = load <8 x i16>, <8 x i16>* %A
  %vecB = load <8 x i16>, <8 x i16>* %B
  %hiA = shufflevector <8 x i16> %vecA, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %hiB = shufflevector <8 x i16> %vecB, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %absdiff = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %hiA, <4 x i16> %hiB)
  %wide = zext <4 x i16> %absdiff to <4 x i32>
  ret <4 x i32> %wide
}
124
; Takes elements 2-3 (the high half) of two loaded <4 x i32> vectors, applies
; the uabd intrinsic and zero-extends the result; expected to select uabdl2.2d.
define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: uabdl2_2d:
; CHECK: uabdl2.2d
  %vecA = load <4 x i32>, <4 x i32>* %A
  %vecB = load <4 x i32>, <4 x i32>* %B
  %hiA = shufflevector <4 x i32> %vecA, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %hiB = shufflevector <4 x i32> %vecB, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %absdiff = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %hiA, <2 x i32> %hiB)
  %wide = zext <2 x i32> %absdiff to <2 x i64>
  ret <2 x i64> %wide
}
136
; Open-coded absolute difference (zext, sub, compare-with-zero, negate, select)
; on <16 x i8> inputs widened to <16 x i16>, followed by a shuffle/add reduction
; that halves the active lanes each step (8, 4, 2, 1) and returns lane 0.
; Expected to select uabdl2.8h followed by uabdl.8h.
; The label below carries the trailing colon, like the other labels in this
; file, so it is anchored to the function's label.
define i16 @uabdl8h_log2_shuffle(<16 x i8>* %a, <16 x i8>* %b) {
; CHECK-LABEL: uabdl8h_log2_shuffle:
; CHECK: uabdl2.8h
; CHECK: uabdl.8h
  %aload = load <16 x i8>, <16 x i8>* %a, align 1
  %bload = load <16 x i8>, <16 x i8>* %b, align 1
  ; Widen both inputs so the subtraction below cannot wrap.
  %aext = zext <16 x i8> %aload to <16 x i16>
  %bext = zext <16 x i8> %bload to <16 x i16>
  ; abs(a - b): negate the difference in lanes where it is negative.
  %abdiff = sub nsw <16 x i16> %aext, %bext
  %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer
  %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff
  %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff
  ; Step 1: add lanes 8-15 onto lanes 0-7.
  %rdx.shuf = shufflevector <16 x i16> %absel, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin1.rdx = add <16 x i16> %absel, %rdx.shuf
  ; Step 2: add lanes 4-7 onto lanes 0-3.
  %rdx.shufx = shufflevector <16 x i16> %bin1.rdx, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <16 x i16> %bin1.rdx, %rdx.shufx
  ; Step 3: add lanes 2-3 onto lanes 0-1.
  %rdx.shuf136 = shufflevector <16 x i16> %bin.rdx, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx137 = add <16 x i16> %bin.rdx, %rdx.shuf136
  ; Step 4: add lane 1 onto lane 0.
  %rdx.shuf138 = shufflevector <16 x i16> %bin.rdx137, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx139 = add <16 x i16> %bin.rdx137, %rdx.shuf138
  %reduced_v = extractelement <16 x i16> %bin.rdx139, i16 0
  ret i16 %reduced_v
}
160
; Open-coded absolute difference (zext, sub, compare-with-zero, negate, select)
; on <8 x i16> inputs widened to <8 x i32>, followed by a shuffle/add reduction
; that halves the active lanes each step (4, 2, 1) and returns lane 0.
; Expected to select uabdl2.4s followed by uabdl.4s.
; The label below carries the trailing colon, like the other labels in this
; file, so it is anchored to the function's label.
define i32 @uabdl4s_log2_shuffle(<8 x i16>* %a, <8 x i16>* %b) {
; CHECK-LABEL: uabdl4s_log2_shuffle:
; CHECK: uabdl2.4s
; CHECK: uabdl.4s
  %aload = load <8 x i16>, <8 x i16>* %a, align 1
  %bload = load <8 x i16>, <8 x i16>* %b, align 1
  ; Widen both inputs so the subtraction below cannot wrap.
  %aext = zext <8 x i16> %aload to <8 x i32>
  %bext = zext <8 x i16> %bload to <8 x i32>
  ; abs(a - b): negate the difference in lanes where it is negative.
  %abdiff = sub nsw <8 x i32> %aext, %bext
  %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
  %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
  %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
  ; Step 1: add lanes 4-7 onto lanes 0-3.
  %rdx.shuf = shufflevector <8 x i32> %absel, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %absel, %rdx.shuf
  ; Step 2: add lanes 2-3 onto lanes 0-1.
  %rdx.shuf136 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx137 = add <8 x i32> %bin.rdx, %rdx.shuf136
  ; Step 3: add lane 1 onto lane 0.
  %rdx.shuf138 = shufflevector <8 x i32> %bin.rdx137, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx139 = add <8 x i32> %bin.rdx137, %rdx.shuf138
  %reduced_v = extractelement <8 x i32> %bin.rdx139, i32 0
  ret i32 %reduced_v
}
182
; Open-coded absolute difference (zext, sub, compare-with-zero, negate, select)
; on <4 x i32> inputs widened to <4 x i64>, followed by a shuffle/add reduction
; that halves the active lanes each step (2, 1) and returns lane 0.
; Expected to select uabdl2.2d followed by uabdl.2d.
; The function name is matched with a label directive (with trailing colon),
; as in the rest of this file, so the following patterns are scoped to this
; function. %h is not used by the body.
define i64 @uabdl2d_log2_shuffle(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
; CHECK-LABEL: uabdl2d_log2_shuffle:
; CHECK: uabdl2.2d
; CHECK: uabdl.2d
  %aload = load <4 x i32>, <4 x i32>* %a, align 1
  %bload = load <4 x i32>, <4 x i32>* %b, align 1
  ; Widen both inputs so the subtraction below cannot wrap.
  %aext = zext <4 x i32> %aload to <4 x i64>
  %bext = zext <4 x i32> %bload to <4 x i64>
  ; abs(a - b): negate the difference in lanes where it is negative.
  %abdiff = sub nsw <4 x i64> %aext, %bext
  %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
  %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
  %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
  ; Step 1: add lanes 2-3 onto lanes 0-1.
  %rdx.shuf136 = shufflevector <4 x i64> %absel, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx137 = add <4 x i64> %absel, %rdx.shuf136
  ; Step 2: add lane 1 onto lane 0.
  %rdx.shuf138 = shufflevector <4 x i64> %bin.rdx137, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx139 = add <4 x i64> %bin.rdx137, %rdx.shuf138
  %reduced_v = extractelement <4 x i64> %bin.rdx139, i16 0
  ret i64 %reduced_v
}
202
; fabd intrinsic on two loaded <2 x float> vectors; expected to select fabd.2s.
define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
; CHECK-LABEL: fabd_2s:
; CHECK: fabd.2s
  %lhs = load <2 x float>, <2 x float>* %A
  %rhs = load <2 x float>, <2 x float>* %B
  %absdiff = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %lhs, <2 x float> %rhs)
  ret <2 x float> %absdiff
}
211
; fabd intrinsic on two loaded <4 x float> vectors; expected to select fabd.4s.
define <4 x float> @fabd_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: fabd_4s:
; CHECK: fabd.4s
  %lhs = load <4 x float>, <4 x float>* %A
  %rhs = load <4 x float>, <4 x float>* %B
  %absdiff = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %lhs, <4 x float> %rhs)
  ret <4 x float> %absdiff
}
220
; fabd intrinsic on two loaded <2 x double> vectors; expected to select fabd.2d.
define <2 x double> @fabd_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
; CHECK-LABEL: fabd_2d:
; CHECK: fabd.2d
  %lhs = load <2 x double>, <2 x double>* %A
  %rhs = load <2 x double>, <2 x double>* %B
  %absdiff = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %lhs, <2 x double> %rhs)
  ret <2 x double> %absdiff
}
229
230declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
231declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
232declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone
233
; sabd intrinsic on two loaded <8 x i8> vectors; expected to select sabd.8b.
define <8 x i8> @sabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: sabd_8b:
; CHECK: sabd.8b
  %lhs = load <8 x i8>, <8 x i8>* %A
  %rhs = load <8 x i8>, <8 x i8>* %B
  %absdiff = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
  ret <8 x i8> %absdiff
}
242
; sabd intrinsic on two loaded <16 x i8> vectors; expected to select sabd.16b.
define <16 x i8> @sabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: sabd_16b:
; CHECK: sabd.16b
  %lhs = load <16 x i8>, <16 x i8>* %A
  %rhs = load <16 x i8>, <16 x i8>* %B
  %absdiff = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
  ret <16 x i8> %absdiff
}
251
; sabd intrinsic on two loaded <4 x i16> vectors; expected to select sabd.4h.
define <4 x i16> @sabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: sabd_4h:
; CHECK: sabd.4h
  %lhs = load <4 x i16>, <4 x i16>* %A
  %rhs = load <4 x i16>, <4 x i16>* %B
  %absdiff = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
  ret <4 x i16> %absdiff
}
260
; sabd intrinsic on two loaded <8 x i16> vectors; expected to select sabd.8h.
define <8 x i16> @sabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: sabd_8h:
; CHECK: sabd.8h
  %lhs = load <8 x i16>, <8 x i16>* %A
  %rhs = load <8 x i16>, <8 x i16>* %B
  %absdiff = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
  ret <8 x i16> %absdiff
}
269
; sabd intrinsic on two loaded <2 x i32> vectors; expected to select sabd.2s.
define <2 x i32> @sabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: sabd_2s:
; CHECK: sabd.2s
  %lhs = load <2 x i32>, <2 x i32>* %A
  %rhs = load <2 x i32>, <2 x i32>* %B
  %absdiff = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
  ret <2 x i32> %absdiff
}
278
; sabd intrinsic on two loaded <4 x i32> vectors; expected to select sabd.4s.
define <4 x i32> @sabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: sabd_4s:
; CHECK: sabd.4s
  %lhs = load <4 x i32>, <4 x i32>* %A
  %rhs = load <4 x i32>, <4 x i32>* %B
  %absdiff = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
  ret <4 x i32> %absdiff
}
287
288declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
289declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
290declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
291declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
292declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
293declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
294
; uabd intrinsic on two loaded <8 x i8> vectors; expected to select uabd.8b.
define <8 x i8> @uabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: uabd_8b:
; CHECK: uabd.8b
  %lhs = load <8 x i8>, <8 x i8>* %A
  %rhs = load <8 x i8>, <8 x i8>* %B
  %absdiff = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
  ret <8 x i8> %absdiff
}
303
; uabd intrinsic on two loaded <16 x i8> vectors; expected to select uabd.16b.
define <16 x i8> @uabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: uabd_16b:
; CHECK: uabd.16b
  %lhs = load <16 x i8>, <16 x i8>* %A
  %rhs = load <16 x i8>, <16 x i8>* %B
  %absdiff = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
  ret <16 x i8> %absdiff
}
312
; uabd intrinsic on two loaded <4 x i16> vectors; expected to select uabd.4h.
define <4 x i16> @uabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: uabd_4h:
; CHECK: uabd.4h
  %lhs = load <4 x i16>, <4 x i16>* %A
  %rhs = load <4 x i16>, <4 x i16>* %B
  %absdiff = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
  ret <4 x i16> %absdiff
}
321
; uabd intrinsic on two loaded <8 x i16> vectors; expected to select uabd.8h.
define <8 x i16> @uabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: uabd_8h:
; CHECK: uabd.8h
  %lhs = load <8 x i16>, <8 x i16>* %A
  %rhs = load <8 x i16>, <8 x i16>* %B
  %absdiff = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
  ret <8 x i16> %absdiff
}
330
; uabd intrinsic on two loaded <2 x i32> vectors; expected to select uabd.2s.
define <2 x i32> @uabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: uabd_2s:
; CHECK: uabd.2s
  %lhs = load <2 x i32>, <2 x i32>* %A
  %rhs = load <2 x i32>, <2 x i32>* %B
  %absdiff = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
  ret <2 x i32> %absdiff
}
339
; uabd intrinsic on two loaded <4 x i32> vectors; expected to select uabd.4s.
define <4 x i32> @uabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: uabd_4s:
; CHECK: uabd.4s
  %lhs = load <4 x i32>, <4 x i32>* %A
  %rhs = load <4 x i32>, <4 x i32>* %B
  %absdiff = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
  ret <4 x i32> %absdiff
}
348
349declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
350declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
351declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
352declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
353declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
354declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
355
; sqabs intrinsic on a loaded <8 x i8> vector; expected to select sqabs.8b.
define <8 x i8> @sqabs_8b(<8 x i8>* %A) nounwind {
; CHECK-LABEL: sqabs_8b:
; CHECK: sqabs.8b
  %src = load <8 x i8>, <8 x i8>* %A
  %res = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %src)
  ret <8 x i8> %res
}
363
; sqabs intrinsic on a loaded <16 x i8> vector; expected to select sqabs.16b.
define <16 x i8> @sqabs_16b(<16 x i8>* %A) nounwind {
; CHECK-LABEL: sqabs_16b:
; CHECK: sqabs.16b
  %src = load <16 x i8>, <16 x i8>* %A
  %res = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %src)
  ret <16 x i8> %res
}
371
; sqabs intrinsic on a loaded <4 x i16> vector; expected to select sqabs.4h.
define <4 x i16> @sqabs_4h(<4 x i16>* %A) nounwind {
; CHECK-LABEL: sqabs_4h:
; CHECK: sqabs.4h
  %src = load <4 x i16>, <4 x i16>* %A
  %res = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %src)
  ret <4 x i16> %res
}
379
; sqabs intrinsic on a loaded <8 x i16> vector; expected to select sqabs.8h.
define <8 x i16> @sqabs_8h(<8 x i16>* %A) nounwind {
; CHECK-LABEL: sqabs_8h:
; CHECK: sqabs.8h
  %src = load <8 x i16>, <8 x i16>* %A
  %res = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %src)
  ret <8 x i16> %res
}
387
; sqabs intrinsic on a loaded <2 x i32> vector; expected to select sqabs.2s.
define <2 x i32> @sqabs_2s(<2 x i32>* %A) nounwind {
; CHECK-LABEL: sqabs_2s:
; CHECK: sqabs.2s
  %src = load <2 x i32>, <2 x i32>* %A
  %res = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %src)
  ret <2 x i32> %res
}
395
; sqabs intrinsic on a loaded <4 x i32> vector; expected to select sqabs.4s.
define <4 x i32> @sqabs_4s(<4 x i32>* %A) nounwind {
; CHECK-LABEL: sqabs_4s:
; CHECK: sqabs.4s
  %src = load <4 x i32>, <4 x i32>* %A
  %res = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %src)
  ret <4 x i32> %res
}
403
404declare <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
405declare <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
406declare <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
407declare <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
408declare <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
409declare <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone
410
; sqneg intrinsic on a loaded <8 x i8> vector; expected to select sqneg.8b.
define <8 x i8> @sqneg_8b(<8 x i8>* %A) nounwind {
; CHECK-LABEL: sqneg_8b:
; CHECK: sqneg.8b
  %src = load <8 x i8>, <8 x i8>* %A
  %res = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %src)
  ret <8 x i8> %res
}
418
; sqneg intrinsic on a loaded <16 x i8> vector; expected to select sqneg.16b.
define <16 x i8> @sqneg_16b(<16 x i8>* %A) nounwind {
; CHECK-LABEL: sqneg_16b:
; CHECK: sqneg.16b
  %src = load <16 x i8>, <16 x i8>* %A
  %res = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %src)
  ret <16 x i8> %res
}
426
; sqneg intrinsic on a loaded <4 x i16> vector; expected to select sqneg.4h.
define <4 x i16> @sqneg_4h(<4 x i16>* %A) nounwind {
; CHECK-LABEL: sqneg_4h:
; CHECK: sqneg.4h
  %src = load <4 x i16>, <4 x i16>* %A
  %res = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %src)
  ret <4 x i16> %res
}
434
; sqneg intrinsic on a loaded <8 x i16> vector; expected to select sqneg.8h.
define <8 x i16> @sqneg_8h(<8 x i16>* %A) nounwind {
; CHECK-LABEL: sqneg_8h:
; CHECK: sqneg.8h
  %src = load <8 x i16>, <8 x i16>* %A
  %res = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %src)
  ret <8 x i16> %res
}
442
; sqneg intrinsic on a loaded <2 x i32> vector; expected to select sqneg.2s.
define <2 x i32> @sqneg_2s(<2 x i32>* %A) nounwind {
; CHECK-LABEL: sqneg_2s:
; CHECK: sqneg.2s
  %src = load <2 x i32>, <2 x i32>* %A
  %res = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %src)
  ret <2 x i32> %res
}
450
; sqneg intrinsic on a loaded <4 x i32> vector; expected to select sqneg.4s.
define <4 x i32> @sqneg_4s(<4 x i32>* %A) nounwind {
; CHECK-LABEL: sqneg_4s:
; CHECK: sqneg.4s
  %src = load <4 x i32>, <4 x i32>* %A
  %res = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %src)
  ret <4 x i32> %res
}
458
459declare <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
460declare <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
461declare <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
462declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
463declare <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
464declare <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone
465
; abs intrinsic on a loaded <8 x i8> vector; expected to select abs.8b.
define <8 x i8> @abs_8b(<8 x i8>* %A) nounwind {
; CHECK-LABEL: abs_8b:
; CHECK: abs.8b
  %src = load <8 x i8>, <8 x i8>* %A
  %res = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %src)
  ret <8 x i8> %res
}
473
; abs intrinsic on a loaded <16 x i8> vector; expected to select abs.16b.
define <16 x i8> @abs_16b(<16 x i8>* %A) nounwind {
; CHECK-LABEL: abs_16b:
; CHECK: abs.16b
  %src = load <16 x i8>, <16 x i8>* %A
  %res = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %src)
  ret <16 x i8> %res
}
481
; abs intrinsic on a loaded <4 x i16> vector; expected to select abs.4h.
define <4 x i16> @abs_4h(<4 x i16>* %A) nounwind {
; CHECK-LABEL: abs_4h:
; CHECK: abs.4h
  %src = load <4 x i16>, <4 x i16>* %A
  %res = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %src)
  ret <4 x i16> %res
}
489
; abs intrinsic on a loaded <8 x i16> vector; expected to select abs.8h.
define <8 x i16> @abs_8h(<8 x i16>* %A) nounwind {
; CHECK-LABEL: abs_8h:
; CHECK: abs.8h
  %src = load <8 x i16>, <8 x i16>* %A
  %res = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %src)
  ret <8 x i16> %res
}
497
; abs intrinsic on a loaded <2 x i32> vector; expected to select abs.2s.
define <2 x i32> @abs_2s(<2 x i32>* %A) nounwind {
; CHECK-LABEL: abs_2s:
; CHECK: abs.2s
  %src = load <2 x i32>, <2 x i32>* %A
  %res = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %src)
  ret <2 x i32> %res
}
505
; abs intrinsic on a loaded <4 x i32> vector; expected to select abs.4s.
define <4 x i32> @abs_4s(<4 x i32>* %A) nounwind {
; CHECK-LABEL: abs_4s:
; CHECK: abs.4s
  %src = load <4 x i32>, <4 x i32>* %A
  %res = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %src)
  ret <4 x i32> %res
}
513
; abs intrinsic on a <1 x i64> argument passed by value; expected to select
; the scalar form "abs d0, d0".
define <1 x i64> @abs_1d(<1 x i64> %A) nounwind {
; CHECK-LABEL: abs_1d:
; CHECK: abs d0, d0
  %res = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %A)
  ret <1 x i64> %res
}
520
; abs intrinsic on a plain i64 argument (no vector type involved); expected to
; select the scalar form "abs d0, d0".
define i64 @abs_1d_honestly(i64 %A) nounwind {
; CHECK-LABEL: abs_1d_honestly:
; CHECK: abs d0, d0
  %res = call i64 @llvm.aarch64.neon.abs.i64(i64 %A)
  ret i64 %res
}
527
528declare <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8>) nounwind readnone
529declare <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8>) nounwind readnone
530declare <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16>) nounwind readnone
531declare <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16>) nounwind readnone
532declare <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32>) nounwind readnone
533declare <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32>) nounwind readnone
534declare <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64>) nounwind readnone
535declare i64 @llvm.aarch64.neon.abs.i64(i64) nounwind readnone
536
; sabd intrinsic on two loaded <8 x i8> vectors, zero-extended and added to a
; loaded <8 x i16> accumulator; expected to select sabal.8h.
define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
; CHECK-LABEL: sabal8h:
; CHECK: sabal.8h
  %lhs = load <8 x i8>, <8 x i8>* %A
  %rhs = load <8 x i8>, <8 x i8>* %B
  %acc = load <8 x i16>, <8 x i16>* %C
  %absdiff = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
  %wide = zext <8 x i8> %absdiff to <8 x i16>
  %sum = add <8 x i16> %acc, %wide
  ret <8 x i16> %sum
}
548
; sabd intrinsic on two loaded <4 x i16> vectors, zero-extended and added to a
; loaded <4 x i32> accumulator; expected to select sabal.4s.
define <4 x i32> @sabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
; CHECK-LABEL: sabal4s:
; CHECK: sabal.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %rhs = load <4 x i16>, <4 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %absdiff = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
  %wide = zext <4 x i16> %absdiff to <4 x i32>
  %sum = add <4 x i32> %acc, %wide
  ret <4 x i32> %sum
}
560
; sabd intrinsic on two loaded <2 x i32> vectors, zero-extended and added to a
; loaded <2 x i64> accumulator; expected to select sabal.2d.
; The unused duplicate zext of the sabd result was dropped; it fed no
; instruction and matches none of the sibling sabal tests.
define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
; CHECK-LABEL: sabal2d:
; CHECK: sabal.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}
573
; sabd intrinsic on elements 8-15 (the high half) of two loaded <16 x i8>
; vectors, zero-extended and added to a loaded <8 x i16> accumulator; expected
; to select sabal2.8h.
define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
; CHECK-LABEL: sabal2_8h:
; CHECK: sabal2.8h
  %vecA = load <16 x i8>, <16 x i8>* %A
  %vecB = load <16 x i8>, <16 x i8>* %B
  %acc = load <8 x i16>, <8 x i16>* %C
  %hiA = shufflevector <16 x i8> %vecA, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %hiB = shufflevector <16 x i8> %vecB, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %absdiff = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %hiA, <8 x i8> %hiB)
  %wide = zext <8 x i8> %absdiff to <8 x i16>
  %sum = add <8 x i16> %acc, %wide
  ret <8 x i16> %sum
}
587
; sabd intrinsic on elements 4-7 (the high half) of two loaded <8 x i16>
; vectors, zero-extended and added to a loaded <4 x i32> accumulator; expected
; to select sabal2.4s.
define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
; CHECK-LABEL: sabal2_4s:
; CHECK: sabal2.4s
  %vecA = load <8 x i16>, <8 x i16>* %A
  %vecB = load <8 x i16>, <8 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %hiA = shufflevector <8 x i16> %vecA, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %hiB = shufflevector <8 x i16> %vecB, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %absdiff = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %hiA, <4 x i16> %hiB)
  %wide = zext <4 x i16> %absdiff to <4 x i32>
  %sum = add <4 x i32> %acc, %wide
  ret <4 x i32> %sum
}
601
; sabd intrinsic on elements 2-3 (the high half) of two loaded <4 x i32>
; vectors, zero-extended and added to a loaded <2 x i64> accumulator; expected
; to select sabal2.2d.
define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
; CHECK-LABEL: sabal2_2d:
; CHECK: sabal2.2d
  %vecA = load <4 x i32>, <4 x i32>* %A
  %vecB = load <4 x i32>, <4 x i32>* %B
  %acc = load <2 x i64>, <2 x i64>* %C
  %hiA = shufflevector <4 x i32> %vecA, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %hiB = shufflevector <4 x i32> %vecB, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %absdiff = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %hiA, <2 x i32> %hiB)
  %wide = zext <2 x i32> %absdiff to <2 x i64>
  %sum = add <2 x i64> %acc, %wide
  ret <2 x i64> %sum
}
615
; uabd intrinsic on two loaded <8 x i8> vectors, zero-extended and added to a
; loaded <8 x i16> accumulator; expected to select uabal.8h.
define <8 x i16> @uabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
; CHECK-LABEL: uabal8h:
; CHECK: uabal.8h
  %lhs = load <8 x i8>, <8 x i8>* %A
  %rhs = load <8 x i8>, <8 x i8>* %B
  %acc = load <8 x i16>, <8 x i16>* %C
  %absdiff = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
  %wide = zext <8 x i8> %absdiff to <8 x i16>
  %sum = add <8 x i16> %acc, %wide
  ret <8 x i16> %sum
}
627
; uabd intrinsic on two loaded <4 x i16> vectors, zero-extended and added to a
; loaded <4 x i32> accumulator; expected to select uabal.4s.
define <4 x i32> @uabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
; CHECK-LABEL: uabal4s:
; CHECK: uabal.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %rhs = load <4 x i16>, <4 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %absdiff = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
  %wide = zext <4 x i16> %absdiff to <4 x i32>
  %sum = add <4 x i32> %acc, %wide
  ret <4 x i32> %sum
}
639
; uabd intrinsic on two loaded <2 x i32> vectors, zero-extended and added to a
; loaded <2 x i64> accumulator; expected to select uabal.2d.
define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
; CHECK-LABEL: uabal2d:
; CHECK: uabal.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %rhs = load <2 x i32>, <2 x i32>* %B
  %acc = load <2 x i64>, <2 x i64>* %C
  %absdiff = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
  %wide = zext <2 x i32> %absdiff to <2 x i64>
  %sum = add <2 x i64> %acc, %wide
  ret <2 x i64> %sum
}
651
; uabd intrinsic on elements 8-15 (the high half) of two loaded <16 x i8>
; vectors, zero-extended and added to a loaded <8 x i16> accumulator; expected
; to select uabal2.8h.
define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
; CHECK-LABEL: uabal2_8h:
; CHECK: uabal2.8h
  %vecA = load <16 x i8>, <16 x i8>* %A
  %vecB = load <16 x i8>, <16 x i8>* %B
  %acc = load <8 x i16>, <8 x i16>* %C
  %hiA = shufflevector <16 x i8> %vecA, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %hiB = shufflevector <16 x i8> %vecB, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %absdiff = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %hiA, <8 x i8> %hiB)
  %wide = zext <8 x i8> %absdiff to <8 x i16>
  %sum = add <8 x i16> %acc, %wide
  ret <8 x i16> %sum
}
665
; uabd intrinsic on elements 4-7 (the high half) of two loaded <8 x i16>
; vectors, zero-extended and added to a loaded <4 x i32> accumulator; expected
; to select uabal2.4s.
define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
; CHECK-LABEL: uabal2_4s:
; CHECK: uabal2.4s
  %vecA = load <8 x i16>, <8 x i16>* %A
  %vecB = load <8 x i16>, <8 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %hiA = shufflevector <8 x i16> %vecA, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %hiB = shufflevector <8 x i16> %vecB, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %absdiff = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %hiA, <4 x i16> %hiB)
  %wide = zext <4 x i16> %absdiff to <4 x i32>
  %sum = add <4 x i32> %acc, %wide
  ret <4 x i32> %sum
}
679
; uabd intrinsic on elements 2-3 (the high half) of two loaded <4 x i32>
; vectors, zero-extended and added to a loaded <2 x i64> accumulator; expected
; to select uabal2.2d.
define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
; CHECK-LABEL: uabal2_2d:
; CHECK: uabal2.2d
  %vecA = load <4 x i32>, <4 x i32>* %A
  %vecB = load <4 x i32>, <4 x i32>* %B
  %acc = load <2 x i64>, <2 x i64>* %C
  %hiA = shufflevector <4 x i32> %vecA, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %hiB = shufflevector <4 x i32> %vecB, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %absdiff = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %hiA, <2 x i32> %hiB)
  %wide = zext <2 x i32> %absdiff to <2 x i64>
  %sum = add <2 x i64> %acc, %wide
  ret <2 x i64> %sum
}
693
; sabd intrinsic on two loaded <8 x i8> vectors, added to a third loaded
; vector of the same type; expected to select saba.8b.
define <8 x i8> @saba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: saba_8b:
; CHECK: saba.8b
  %lhs = load <8 x i8>, <8 x i8>* %A
  %rhs = load <8 x i8>, <8 x i8>* %B
  %absdiff = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
  %acc = load <8 x i8>, <8 x i8>* %C
  %sum = add <8 x i8> %absdiff, %acc
  ret <8 x i8> %sum
}
704
; sabd intrinsic on two loaded <16 x i8> vectors, added to a third loaded
; vector of the same type; expected to select saba.16b.
define <16 x i8> @saba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
; CHECK-LABEL: saba_16b:
; CHECK: saba.16b
  %lhs = load <16 x i8>, <16 x i8>* %A
  %rhs = load <16 x i8>, <16 x i8>* %B
  %absdiff = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
  %acc = load <16 x i8>, <16 x i8>* %C
  %sum = add <16 x i8> %absdiff, %acc
  ret <16 x i8> %sum
}
715
; Signed absolute difference of the <4 x i16> vectors at %A and %B, added to
; the vector loaded from %C. The test expects saba.4h in the generated code.
define <4 x i16> @saba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK-LABEL: saba_4h:
;CHECK: saba.4h
  %lhs = load <4 x i16>, <4 x i16>* %A
  %rhs = load <4 x i16>, <4 x i16>* %B
  %absdiff = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
  %acc = load <4 x i16>, <4 x i16>* %C
  %sum = add <4 x i16> %absdiff, %acc
  ret <4 x i16> %sum
}
726
; Signed absolute difference of the <8 x i16> vectors at %A and %B, added to
; the vector loaded from %C. The test expects saba.8h in the generated code.
define <8 x i16> @saba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: saba_8h:
;CHECK: saba.8h
  %lhs = load <8 x i16>, <8 x i16>* %A
  %rhs = load <8 x i16>, <8 x i16>* %B
  %absdiff = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
  %acc = load <8 x i16>, <8 x i16>* %C
  %sum = add <8 x i16> %absdiff, %acc
  ret <8 x i16> %sum
}
737
; Signed absolute difference of the <2 x i32> vectors at %A and %B, added to
; the vector loaded from %C. The test expects saba.2s in the generated code.
define <2 x i32> @saba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK-LABEL: saba_2s:
;CHECK: saba.2s
  %lhs = load <2 x i32>, <2 x i32>* %A
  %rhs = load <2 x i32>, <2 x i32>* %B
  %absdiff = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
  %acc = load <2 x i32>, <2 x i32>* %C
  %sum = add <2 x i32> %absdiff, %acc
  ret <2 x i32> %sum
}
748
; Signed absolute difference of the <4 x i32> vectors at %A and %B, added to
; the vector loaded from %C. The test expects saba.4s in the generated code.
define <4 x i32> @saba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: saba_4s:
;CHECK: saba.4s
  %lhs = load <4 x i32>, <4 x i32>* %A
  %rhs = load <4 x i32>, <4 x i32>* %B
  %absdiff = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
  %acc = load <4 x i32>, <4 x i32>* %C
  %sum = add <4 x i32> %absdiff, %acc
  ret <4 x i32> %sum
}
759
; Unsigned absolute difference of the <8 x i8> vectors at %A and %B, added to
; the vector loaded from %C. The test expects uaba.8b in the generated code.
define <8 x i8> @uaba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK-LABEL: uaba_8b:
;CHECK: uaba.8b
  %lhs = load <8 x i8>, <8 x i8>* %A
  %rhs = load <8 x i8>, <8 x i8>* %B
  %absdiff = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
  %acc = load <8 x i8>, <8 x i8>* %C
  %sum = add <8 x i8> %absdiff, %acc
  ret <8 x i8> %sum
}
770
; Unsigned absolute difference of the <16 x i8> vectors at %A and %B, added to
; the vector loaded from %C. The test expects uaba.16b in the generated code.
define <16 x i8> @uaba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
;CHECK-LABEL: uaba_16b:
;CHECK: uaba.16b
  %lhs = load <16 x i8>, <16 x i8>* %A
  %rhs = load <16 x i8>, <16 x i8>* %B
  %absdiff = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
  %acc = load <16 x i8>, <16 x i8>* %C
  %sum = add <16 x i8> %absdiff, %acc
  ret <16 x i8> %sum
}
781
; Unsigned absolute difference of the <4 x i16> vectors at %A and %B, added to
; the vector loaded from %C. The test expects uaba.4h in the generated code.
define <4 x i16> @uaba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK-LABEL: uaba_4h:
;CHECK: uaba.4h
  %lhs = load <4 x i16>, <4 x i16>* %A
  %rhs = load <4 x i16>, <4 x i16>* %B
  %absdiff = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
  %acc = load <4 x i16>, <4 x i16>* %C
  %sum = add <4 x i16> %absdiff, %acc
  ret <4 x i16> %sum
}
792
; Unsigned absolute difference of the <8 x i16> vectors at %A and %B, added to
; the vector loaded from %C. The test expects uaba.8h in the generated code.
define <8 x i16> @uaba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uaba_8h:
;CHECK: uaba.8h
  %lhs = load <8 x i16>, <8 x i16>* %A
  %rhs = load <8 x i16>, <8 x i16>* %B
  %absdiff = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
  %acc = load <8 x i16>, <8 x i16>* %C
  %sum = add <8 x i16> %absdiff, %acc
  ret <8 x i16> %sum
}
803
; Unsigned absolute difference of the <2 x i32> vectors at %A and %B, added to
; the vector loaded from %C. The test expects uaba.2s in the generated code.
define <2 x i32> @uaba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK-LABEL: uaba_2s:
;CHECK: uaba.2s
  %lhs = load <2 x i32>, <2 x i32>* %A
  %rhs = load <2 x i32>, <2 x i32>* %B
  %absdiff = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
  %acc = load <2 x i32>, <2 x i32>* %C
  %sum = add <2 x i32> %absdiff, %acc
  ret <2 x i32> %sum
}
814
; Unsigned absolute difference of the <4 x i32> vectors at %A and %B, added to
; the vector loaded from %C. The test expects uaba.4s in the generated code.
define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uaba_4s:
;CHECK: uaba.4s
  %lhs = load <4 x i32>, <4 x i32>* %A
  %rhs = load <4 x i32>, <4 x i32>* %B
  %absdiff = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
  %acc = load <4 x i32>, <4 x i32>* %C
  %sum = add <4 x i32> %absdiff, %acc
  ret <4 x i32> %sum
}
825
826; Scalar FABD
; Single-precision scalar absolute difference through the sisd.fabd intrinsic.
; The test expects the operands to stay in s0/s1 with the result in s0.
define float @fabds(float %a, float %b) nounwind {
; CHECK-LABEL: fabds:
; CHECK: fabd s0, s0, s1
    %absdiff = tail call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) nounwind
    ret float %absdiff
}
833
; Double-precision scalar absolute difference through the sisd.fabd intrinsic.
; The test expects the operands to stay in d0/d1 with the result in d0.
define double @fabdd(double %a, double %b) nounwind {
; CHECK-LABEL: fabdd:
; CHECK: fabd d0, d0, d1
    %absdiff = tail call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) nounwind
    ret double %absdiff
}
840
; Scalar floating-point absolute-difference intrinsics called by fabds (f32)
; and fabdd (f64).
declare float @llvm.aarch64.sisd.fabd.f32(float, float) nounwind readnone
declare double @llvm.aarch64.sisd.fabd.f64(double, double) nounwind readnone
843
; Widening unsigned absolute difference between the high two lanes of %lhs and
; a two-lane splat of the scalar %rhs. The test expects uabdl2.2d with no
; ext.16b appearing before it.
define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uabdl_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: uabdl2.2d
    ; Build the splat by inserting %rhs into both lanes.
    %splat.lane0 = insertelement <2 x i32> undef, i32 %rhs, i32 0
    %splat = insertelement <2 x i32> %splat.lane0, i32 %rhs, i32 1

    ; Lanes 2-3 are the upper half of the 4-lane input.
    %lhs.hi = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

    %absdiff = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.hi, <2 x i32> %splat) nounwind
    %absdiff.wide = zext <2 x i32> %absdiff to <2 x i64>
    ret <2 x i64> %absdiff.wide
}
857
; Widening signed absolute difference between the high two lanes of %lhs and
; a two-lane splat of the scalar %rhs. The test expects sabdl2.2d with no
; ext.16b appearing before it.
define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: sabdl_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: sabdl2.2d
    ; Build the splat by inserting %rhs into both lanes.
    %splat.lane0 = insertelement <2 x i32> undef, i32 %rhs, i32 0
    %splat = insertelement <2 x i32> %splat.lane0, i32 %rhs, i32 1

    ; Lanes 2-3 are the upper half of the 4-lane input.
    %lhs.hi = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

    %absdiff = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.hi, <2 x i32> %splat) nounwind
    ; The sabd result is widened with zext, not sext.
    %absdiff.wide = zext <2 x i32> %absdiff to <2 x i64>
    ret <2 x i64> %absdiff.wide
}
871
; Open-coded absolute value on <2 x i32>: keep %a where %a >= 0 (signed),
; otherwise take 0 - %a. The test expects abs.2s immediately followed by ret.
define <2 x i32> @abspattern1(<2 x i32> %a) nounwind {
; CHECK-LABEL: abspattern1:
; CHECK: abs.2s
; CHECK-NEXT: ret
  %negated = sub <2 x i32> zeroinitializer, %a
  %is.nonneg = icmp sge <2 x i32> %a, zeroinitializer
  %result = select <2 x i1> %is.nonneg, <2 x i32> %a, <2 x i32> %negated
  ret <2 x i32> %result
}
881
; Open-coded absolute value on <4 x i16>: keep %a where %a > 0 (signed),
; otherwise take 0 - %a. The test expects abs.4h immediately followed by ret.
define <4 x i16> @abspattern2(<4 x i16> %a) nounwind {
; CHECK-LABEL: abspattern2:
; CHECK: abs.4h
; CHECK-NEXT: ret
  %negated = sub <4 x i16> zeroinitializer, %a
  %is.pos = icmp sgt <4 x i16> %a, zeroinitializer
  %result = select <4 x i1> %is.pos, <4 x i16> %a, <4 x i16> %negated
  ret <4 x i16> %result
}
891
; Open-coded absolute value on <8 x i8>: take 0 - %a where %a < 0 (signed),
; otherwise keep %a. The test expects abs.8b immediately followed by ret.
define <8 x i8> @abspattern3(<8 x i8> %a) nounwind {
; CHECK-LABEL: abspattern3:
; CHECK: abs.8b
; CHECK-NEXT: ret
  %negated = sub <8 x i8> zeroinitializer, %a
  %is.neg = icmp slt <8 x i8> %a, zeroinitializer
  %result = select <8 x i1> %is.neg, <8 x i8> %negated, <8 x i8> %a
  ret <8 x i8> %result
}
901
; Open-coded absolute value on <4 x i32>: keep %a where %a >= 0 (signed),
; otherwise take 0 - %a. The test expects abs.4s immediately followed by ret.
define <4 x i32> @abspattern4(<4 x i32> %a) nounwind {
; CHECK-LABEL: abspattern4:
; CHECK: abs.4s
; CHECK-NEXT: ret
  %negated = sub <4 x i32> zeroinitializer, %a
  %is.nonneg = icmp sge <4 x i32> %a, zeroinitializer
  %result = select <4 x i1> %is.nonneg, <4 x i32> %a, <4 x i32> %negated
  ret <4 x i32> %result
}
911
; Open-coded absolute value on <8 x i16>: keep %a where %a > 0 (signed),
; otherwise take 0 - %a. The test expects abs.8h immediately followed by ret.
define <8 x i16> @abspattern5(<8 x i16> %a) nounwind {
; CHECK-LABEL: abspattern5:
; CHECK: abs.8h
; CHECK-NEXT: ret
  %negated = sub <8 x i16> zeroinitializer, %a
  %is.pos = icmp sgt <8 x i16> %a, zeroinitializer
  %result = select <8 x i1> %is.pos, <8 x i16> %a, <8 x i16> %negated
  ret <8 x i16> %result
}
921
; Open-coded absolute value on <16 x i8>: take 0 - %a where %a < 0 (signed),
; otherwise keep %a. The test expects abs.16b immediately followed by ret.
define <16 x i8> @abspattern6(<16 x i8> %a) nounwind {
; CHECK-LABEL: abspattern6:
; CHECK: abs.16b
; CHECK-NEXT: ret
  %negated = sub <16 x i8> zeroinitializer, %a
  %is.neg = icmp slt <16 x i8> %a, zeroinitializer
  %result = select <16 x i1> %is.neg, <16 x i8> %negated, <16 x i8> %a
  ret <16 x i8> %result
}
931
; Open-coded absolute value on <2 x i64>: take 0 - %a where %a <= 0 (signed),
; otherwise keep %a. The test expects abs.2d immediately followed by ret.
define <2 x i64> @abspattern7(<2 x i64> %a) nounwind {
; CHECK-LABEL: abspattern7:
; CHECK: abs.2d
; CHECK-NEXT: ret
  %negated = sub <2 x i64> zeroinitializer, %a
  %is.nonpos = icmp sle <2 x i64> %a, zeroinitializer
  %result = select <2 x i1> %is.nonpos, <2 x i64> %negated, <2 x i64> %a
  ret <2 x i64> %result
}
941