; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck -check-prefixes=CHECK,DAG %s
; RUN: llc < %s -global-isel -global-isel-abort=2 -pass-remarks-missed=gisel* -mtriple=arm64-eabi -aarch64-neon-syntax=apple 2>&1 | FileCheck %s --check-prefixes=FALLBACK,CHECK,GISEL

; FALLBACK-NOT: remark:{{.*}} G_ZEXT
; FALLBACK-NOT: remark:{{.*}} sabdl8h
define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sabdl8h:
;CHECK: sabdl.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

; FALLBACK-NOT: remark:{{.*}} sabdl4s
define <4 x i32> @sabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sabdl4s:
;CHECK: sabdl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

; FALLBACK-NOT: remark:{{.*}} sabdl2d
define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sabdl2d:
;CHECK: sabdl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}

define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: sabdl2_8h:
;CHECK: sabdl.8h
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sabdl2_4s:
;CHECK: sabdl.4s
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sabdl2_2d:
;CHECK: sabdl.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}

; FALLBACK-NOT: remark:{{.*}} uabdl8h
define <8 x i16> @uabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uabdl8h:
;CHECK: uabdl.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

; FALLBACK-NOT: remark:{{.*}} uabdl4s
define <4 x i32> @uabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uabdl4s:
;CHECK: uabdl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

; FALLBACK-NOT: remark:{{.*}} uabdl2d
define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uabdl2d:
;CHECK: uabdl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}

define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uabdl2_8h:
;CHECK: uabdl.8h
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>

  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uabdl2_4s:
;CHECK: uabdl.4s
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uabdl2_2d:
;CHECK: uabdl.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}

declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)

define i16 @uabd16b_rdx(<16 x i8>* %a, <16 x i8>* %b) {
; CHECK-LABEL: uabd16b_rdx
; CHECK: uabd.16b
  %aload = load <16 x i8>, <16 x i8>* %a, align 1
  %bload = load <16 x i8>, <16 x i8>* %b, align 1
  %aext = zext <16 x i8> %aload to <16 x i16>
  %bext = zext <16 x i8> %bload to <16 x i16>
  %abdiff = sub nsw <16 x i16> %aext, %bext
  %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer
  %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff
  %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff
  %reduced_v = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %absel)
  ret i16 %reduced_v
}

define i32 @uabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: uabd16b_rdx_i32
; CHECK: uabd.16b
  %aext = zext <16 x i8> %a to <16 x i32>
  %bext = zext <16 x i8> %b to <16 x i32>
  %abdiff = sub nsw <16 x i32> %aext, %bext
  %abcmp = icmp slt <16 x i32> %abdiff, zeroinitializer
  %ababs = sub nsw <16 x i32> zeroinitializer, %abdiff
  %absel = select <16 x i1> %abcmp, <16 x i32> %ababs, <16 x i32> %abdiff
  %reduced_v = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %absel)
  ret i32 %reduced_v
}

define i32 @sabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: sabd16b_rdx_i32
; CHECK: sabd.16b
  %aext = sext <16 x i8> %a to <16 x i32>
  %bext = sext <16 x i8> %b to <16 x i32>
  %abdiff = sub nsw <16 x i32> %aext, %bext
  %abcmp = icmp slt <16 x i32> %abdiff, zeroinitializer
  %ababs = sub nsw <16 x i32> zeroinitializer, %abdiff
  %absel = select <16 x i1> %abcmp, <16 x i32> %ababs, <16 x i32> %abdiff
  %reduced_v = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %absel)
  ret i32 %reduced_v
}


declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)

define i32 @uabd8h_rdx(<8 x i16>* %a, <8 x i16>* %b) {
; CHECK-LABEL: uabd8h_rdx
; CHECK: uabd.8h
  %aload = load <8 x i16>, <8 x i16>* %a, align 1
  %bload = load <8 x i16>, <8 x i16>* %b, align 1
  %aext = zext <8 x i16> %aload to <8 x i32>
  %bext = zext <8 x i16> %bload to <8 x i32>
  %abdiff = sub nsw <8 x i32> %aext, %bext
  %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
  %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
  %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
  %reduced_v = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %absel)
  ret i32 %reduced_v
}

define i32 @sabd8h_rdx(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: sabd8h_rdx
; CHECK: sabd.8h
  %aext = sext <8 x i16> %a to <8 x i32>
  %bext = sext <8 x i16> %b to <8 x i32>
  %abdiff = sub nsw <8 x i32> %aext, %bext
  %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
  %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
  %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
  %reduced_v = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %absel)
  ret i32 %reduced_v
}

define i32 @uabdl4s_rdx_i32(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: uabdl4s_rdx_i32
; DAG: uabdl.4s

; GISel doesn't match this pattern yet.
; GISEL: addv.4s
  %aext = zext <4 x i16> %a to <4 x i32>
  %bext = zext <4 x i16> %b to <4 x i32>
  %abdiff = sub nsw <4 x i32> %aext, %bext
  %abcmp = icmp slt <4 x i32> %abdiff, zeroinitializer
  %ababs = sub nsw <4 x i32> zeroinitializer, %abdiff
  %absel = select <4 x i1> %abcmp, <4 x i32> %ababs, <4 x i32> %abdiff
  %reduced_v = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %absel)
  ret i32 %reduced_v
}

declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)

define i64 @uabd4s_rdx(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
; CHECK-LABEL: uabd4s_rdx
; CHECK: uabd.4s
  %aload = load <4 x i32>, <4 x i32>* %a, align 1
  %bload = load <4 x i32>, <4 x i32>* %b, align 1
  %aext = zext <4 x i32> %aload to <4 x i64>
  %bext = zext <4 x i32> %bload to <4 x i64>
  %abdiff = sub nsw <4 x i64> %aext, %bext
  %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
  %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
  %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
  %reduced_v = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %absel)
  ret i64 %reduced_v
}

define i64 @sabd4s_rdx(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: sabd4s_rdx
; CHECK: sabd.4s
  %aext = sext <4 x i32> %a to <4 x i64>
  %bext = sext <4 x i32> %b to <4 x i64>
  %abdiff = sub nsw <4 x i64> %aext, %bext
  %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
  %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
  %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
  %reduced_v = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %absel)
  ret i64 %reduced_v
}

define i64 @uabdl2d_rdx_i64(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: uabdl2d_rdx_i64
; DAG: uabdl.2d

; GISel doesn't match this pattern yet
; GISEL: addp.2d
  %aext = zext <2 x i32> %a to <2 x i64>
  %bext = zext <2 x i32> %b to <2 x i64>
  %abdiff = sub nsw <2 x i64> %aext, %bext
  %abcmp = icmp slt <2 x i64> %abdiff, zeroinitializer
  %ababs = sub nsw <2 x i64> zeroinitializer, %abdiff
  %absel = select <2 x i1> %abcmp, <2 x i64> %ababs, <2 x i64> %abdiff
  %reduced_v = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %absel)
  ret i64 %reduced_v
}

define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fabd_2s:
;CHECK: fabd.2s
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %tmp3 = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
  ret <2 x float> %tmp3
}

define <4 x float> @fabd_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: fabd_4s:
;CHECK: fabd.4s
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
  ret <4 x float> %tmp3
}

define <2 x double> @fabd_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: fabd_2d:
;CHECK: fabd.2d
  %tmp1 = load <2 x double>, <2 x double>* %A
  %tmp2 = load <2 x double>, <2 x double>* %B
  %tmp3 = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
  ret <2 x double> %tmp3
}

declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone

define <2 x float> @fabd_2s_from_fsub_fabs(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fabd_2s_from_fsub_fabs:
;CHECK: fabd.2s
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %sub = fsub <2 x float> %tmp1, %tmp2
  %abs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %sub)
  ret <2 x float> %abs
}

define <4 x float> @fabd_4s_from_fsub_fabs(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: fabd_4s_from_fsub_fabs:
;CHECK: fabd.4s
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %sub = fsub <4 x float> %tmp1, %tmp2
  %abs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %sub)
  ret <4 x float> %abs
}

define <2 x double> @fabd_2d_from_fsub_fabs(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: fabd_2d_from_fsub_fabs:
;CHECK: fabd.2d
  %tmp1 = load <2 x double>, <2 x double>* %A
  %tmp2 = load <2 x double>, <2 x double>* %B
  %sub = fsub <2 x double> %tmp1, %tmp2
  %abs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %sub)
  ret <2 x double> %abs
}

declare <2 x float> @llvm.fabs.v2f32(<2 x float>) nounwind readnone
declare <4 x float> @llvm.fabs.v4f32(<4 x float>) nounwind readnone
declare <2 x double> @llvm.fabs.v2f64(<2 x double>) nounwind readnone

define <8 x i8> @sabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sabd_8b:
;CHECK: sabd.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @sabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: sabd_16b:
;CHECK: sabd.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @sabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sabd_4h:
;CHECK: sabd.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sabd_8h:
;CHECK: sabd.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sabd_2s:
;CHECK: sabd.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sabd_4s:
;CHECK: sabd.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

define <8 x i8> @uabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uabd_8b:
;CHECK: uabd.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @uabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uabd_16b:
;CHECK: uabd.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @uabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uabd_4h:
;CHECK: uabd.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @uabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uabd_8h:
;CHECK: uabd.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @uabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uabd_2s:
;CHECK: uabd.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @uabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uabd_4s:
;CHECK: uabd.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

define <8 x i8> @sqabs_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqabs_8b:
;CHECK: sqabs.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp3
}

define <16 x i8> @sqabs_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqabs_16b:
;CHECK: sqabs.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp3
}

define <4 x i16> @sqabs_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqabs_4h:
;CHECK: sqabs.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sqabs_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqabs_8h:
;CHECK: sqabs.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sqabs_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqabs_2s:
;CHECK: sqabs.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sqabs_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqabs_4s:
;CHECK: sqabs.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone

define <8 x i8> @sqneg_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqneg_8b:
;CHECK: sqneg.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp3
}

define <16 x i8> @sqneg_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqneg_16b:
;CHECK: sqneg.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp3
}

define <4 x i16> @sqneg_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqneg_4h:
;CHECK: sqneg.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sqneg_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqneg_8h:
;CHECK: sqneg.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sqneg_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqneg_2s:
;CHECK: sqneg.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sqneg_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqneg_4s:
;CHECK: sqneg.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone

define <8 x i8> @abs_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: abs_8b:
;CHECK: abs.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp3
}

define <16 x i8> @abs_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: abs_16b:
;CHECK: abs.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp3
}

define <4 x i16> @abs_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: abs_4h:
;CHECK: abs.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp3
}

define <8 x i16> @abs_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: abs_8h:
;CHECK: abs.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp3
}

define <2 x i32> @abs_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: abs_2s:
;CHECK: abs.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp3
}

define <4 x i32> @abs_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: abs_4s:
;CHECK: abs.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp3
}

define <1 x i64> @abs_1d(<1 x i64> %A) nounwind {
; CHECK-LABEL: abs_1d:
; CHECK: abs d0, d0
  %abs = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %A)
  ret <1 x i64> %abs
}

define i64 @abs_1d_honestly(i64 %A) nounwind {
; CHECK-LABEL: abs_1d_honestly:
; CHECK: abs d0, d0
  %abs = call i64 @llvm.aarch64.neon.abs.i64(i64 %A)
  ret i64 %abs
}

declare <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32>) nounwind readnone
declare <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64>) nounwind readnone
declare i64 @llvm.aarch64.neon.abs.i64(i64) nounwind readnone

; FALLBACK-NOT: remark:{{.*}} sabal8h
define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: sabal8h:
;CHECK: sabal.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i16>, <8 x i16>* %C
  %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}

; FALLBACK-NOT: remark:{{.*}} sabal4s
define <4 x i32> @sabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sabal4s:
;CHECK: sabal.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}

; FALLBACK-NOT: remark:{{.*}} sabal2d
define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sabal2d:
;CHECK: sabal.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp4.1.1 = zext <2 x i32> %tmp4 to <2 x i64> ; NOTE(review): dead duplicate of %tmp4.1 — candidate for removal
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}

define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: sabal2_8h:
;CHECK: sabal.8h
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = load <8 x i16>, <8 x i16>* %C
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}

define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sabal2_4s:
;CHECK: sabal.4s
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}

define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sabal2_2d:
;CHECK: sabal.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}

; FALLBACK-NOT: remark:{{.*}} uabal8h
define <8 x i16> @uabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uabal8h:
;CHECK: uabal.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i16>, <8 x i16>* %C
  %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}

; FALLBACK-NOT: remark:{{.*}} uabal4s
define <4 x i32> @uabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uabal4s:
;CHECK: uabal.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}

; FALLBACK-NOT: remark:{{.*}} uabal2d
define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: uabal2d:
;CHECK: uabal.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}

define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uabal2_8h:
;CHECK: uabal.8h
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = load <8 x i16>, <8 x i16>* %C
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}

define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uabal2_4s:
;CHECK: uabal.4s
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}

define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: uabal2_2d:
;CHECK: uabal.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}

define <8 x i8> @saba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK-LABEL: saba_8b:
;CHECK: saba.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = load <8 x i8>, <8 x i8>* %C
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <16 x i8> @saba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
;CHECK-LABEL: saba_16b:
;CHECK: saba.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  %tmp4 = load <16 x i8>, <16 x i8>* %C
  %tmp5 = add <16 x i8> %tmp3, %tmp4
  ret <16 x i8> %tmp5
}

define <4 x i16> @saba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK-LABEL: saba_4h:
;CHECK: saba.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = load <4 x i16>, <4 x i16>* %C
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

define <8 x i16> @saba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: saba_8h:
;CHECK: saba.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  %tmp4 = load <8 x i16>, <8 x i16>* %C
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <2 x i32> @saba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK-LABEL: saba_2s:
;CHECK: saba.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = load <2 x i32>, <2 x i32>* %C
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}

define <4 x i32> @saba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: saba_4s:
;CHECK: saba.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4
x i32>* %B 874 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) 875 %tmp4 = load <4 x i32>, <4 x i32>* %C 876 %tmp5 = add <4 x i32> %tmp3, %tmp4 877 ret <4 x i32> %tmp5 878} 879 880define <8 x i8> @uaba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { 881;CHECK-LABEL: uaba_8b: 882;CHECK: uaba.8b 883 %tmp1 = load <8 x i8>, <8 x i8>* %A 884 %tmp2 = load <8 x i8>, <8 x i8>* %B 885 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) 886 %tmp4 = load <8 x i8>, <8 x i8>* %C 887 %tmp5 = add <8 x i8> %tmp3, %tmp4 888 ret <8 x i8> %tmp5 889} 890 891define <16 x i8> @uaba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind { 892;CHECK-LABEL: uaba_16b: 893;CHECK: uaba.16b 894 %tmp1 = load <16 x i8>, <16 x i8>* %A 895 %tmp2 = load <16 x i8>, <16 x i8>* %B 896 %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) 897 %tmp4 = load <16 x i8>, <16 x i8>* %C 898 %tmp5 = add <16 x i8> %tmp3, %tmp4 899 ret <16 x i8> %tmp5 900} 901 902define <4 x i16> @uaba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { 903;CHECK-LABEL: uaba_4h: 904;CHECK: uaba.4h 905 %tmp1 = load <4 x i16>, <4 x i16>* %A 906 %tmp2 = load <4 x i16>, <4 x i16>* %B 907 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) 908 %tmp4 = load <4 x i16>, <4 x i16>* %C 909 %tmp5 = add <4 x i16> %tmp3, %tmp4 910 ret <4 x i16> %tmp5 911} 912 913define <8 x i16> @uaba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind { 914;CHECK-LABEL: uaba_8h: 915;CHECK: uaba.8h 916 %tmp1 = load <8 x i16>, <8 x i16>* %A 917 %tmp2 = load <8 x i16>, <8 x i16>* %B 918 %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) 919 %tmp4 = load <8 x i16>, <8 x i16>* %C 920 %tmp5 = add <8 x i16> %tmp3, %tmp4 921 ret <8 x i16> %tmp5 922} 923 924define <2 x i32> @uaba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { 925;CHECK-LABEL: uaba_2s: 
926;CHECK: uaba.2s 927 %tmp1 = load <2 x i32>, <2 x i32>* %A 928 %tmp2 = load <2 x i32>, <2 x i32>* %B 929 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) 930 %tmp4 = load <2 x i32>, <2 x i32>* %C 931 %tmp5 = add <2 x i32> %tmp3, %tmp4 932 ret <2 x i32> %tmp5 933} 934 935define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind { 936;CHECK-LABEL: uaba_4s: 937;CHECK: uaba.4s 938 %tmp1 = load <4 x i32>, <4 x i32>* %A 939 %tmp2 = load <4 x i32>, <4 x i32>* %B 940 %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) 941 %tmp4 = load <4 x i32>, <4 x i32>* %C 942 %tmp5 = add <4 x i32> %tmp3, %tmp4 943 ret <4 x i32> %tmp5 944} 945 946; Scalar FABD 947define float @fabds(float %a, float %b) nounwind { 948; CHECK-LABEL: fabds: 949; CHECK: fabd s0, s0, s1 950 %vabd.i = tail call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) nounwind 951 ret float %vabd.i 952} 953 954define double @fabdd(double %a, double %b) nounwind { 955; CHECK-LABEL: fabdd: 956; CHECK: fabd d0, d0, d1 957 %vabd.i = tail call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) nounwind 958 ret double %vabd.i 959} 960 961declare double @llvm.aarch64.sisd.fabd.f64(double, double) nounwind readnone 962declare float @llvm.aarch64.sisd.fabd.f32(float, float) nounwind readnone 963 964define float @fabds_from_fsub_fabs(float %a, float %b) nounwind { 965; CHECK-LABEL: fabds_from_fsub_fabs: 966; CHECK: fabd s0, s0, s1 967 %sub = fsub float %a, %b 968 %abs = tail call float @llvm.fabs.f32(float %sub) 969 ret float %abs 970} 971 972define double @fabdd_from_fsub_fabs(double %a, double %b) nounwind { 973; CHECK-LABEL: fabdd_from_fsub_fabs: 974; CHECK: fabd d0, d0, d1 975 %sub = fsub double %a, %b 976 %abs = tail call double @llvm.fabs.f64(double %sub) 977 ret double %abs 978} 979 980declare float @llvm.fabs.f32(float) nounwind readnone 981declare double @llvm.fabs.f64(double) nounwind readnone 982 983define <2 x 
i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { 984; CHECK-LABEL: uabdl_from_extract_dup: 985; CHECK-NOT: ext.16b 986; CHECK: uabdl.2d 987 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 988 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 989 990 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1> 991 992 %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind 993 %res1 = zext <2 x i32> %res to <2 x i64> 994 ret <2 x i64> %res1 995} 996 997define <2 x i64> @uabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { 998; CHECK-LABEL: uabdl2_from_extract_dup: 999; CHECK-NOT: ext.16b 1000; CHECK: uabdl2.2d 1001 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 1002 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 1003 1004 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1005 1006 %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind 1007 %res1 = zext <2 x i32> %res to <2 x i64> 1008 ret <2 x i64> %res1 1009} 1010 1011define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { 1012; CHECK-LABEL: sabdl_from_extract_dup: 1013; CHECK-NOT: ext.16b 1014; CHECK: sabdl.2d 1015 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 1016 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 1017 1018 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1> 1019 1020 %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind 1021 %res1 = zext <2 x i32> %res to <2 x i64> 1022 ret <2 x i64> %res1 1023} 1024 1025define <2 x i64> @sabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { 1026; CHECK-LABEL: sabdl2_from_extract_dup: 1027; CHECK-NOT: ext.16b 1028; CHECK: sabdl2.2d 1029 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 1030 %rhsvec = insertelement 
<2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 1031 1032 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1033 1034 %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind 1035 %res1 = zext <2 x i32> %res to <2 x i64> 1036 ret <2 x i64> %res1 1037} 1038 1039define <2 x i32> @abspattern1(<2 x i32> %a) nounwind { 1040; CHECK-LABEL: abspattern1: 1041; DAG: abs.2s 1042; DAG-NEXT: ret 1043 1044; GISEL-DAG: neg.2s 1045; GISEL-DAG: cmge.2s 1046; GISEL: bif.8b 1047 %tmp1neg = sub <2 x i32> zeroinitializer, %a 1048 %b = icmp sge <2 x i32> %a, zeroinitializer 1049 %abs = select <2 x i1> %b, <2 x i32> %a, <2 x i32> %tmp1neg 1050 ret <2 x i32> %abs 1051} 1052 1053define <4 x i16> @abspattern2(<4 x i16> %a) nounwind { 1054; CHECK-LABEL: abspattern2: 1055; DAG: abs.4h 1056; DAG-NEXT: ret 1057 1058; For GlobalISel, this generates terrible code until we can pattern match this to abs. 1059; GISEL-DAG: neg.4h 1060; GISEL-DAG: cmgt.4h 1061; GISEL: bif.8b 1062 %tmp1neg = sub <4 x i16> zeroinitializer, %a 1063 %b = icmp sgt <4 x i16> %a, zeroinitializer 1064 %abs = select <4 x i1> %b, <4 x i16> %a, <4 x i16> %tmp1neg 1065 ret <4 x i16> %abs 1066} 1067 1068define <8 x i8> @abspattern3(<8 x i8> %a) nounwind { 1069; CHECK-LABEL: abspattern3: 1070; DAG: abs.8b 1071; DAG-NEXT: ret 1072 1073; GISEL-DAG: neg.8b 1074; GISEL-DAG: cmgt.8b 1075; GISEL: bit.8b 1076 %tmp1neg = sub <8 x i8> zeroinitializer, %a 1077 %b = icmp slt <8 x i8> %a, zeroinitializer 1078 %abs = select <8 x i1> %b, <8 x i8> %tmp1neg, <8 x i8> %a 1079 ret <8 x i8> %abs 1080} 1081 1082define <4 x i32> @abspattern4(<4 x i32> %a) nounwind { 1083; CHECK-LABEL: abspattern4: 1084; DAG: abs.4s 1085; DAG-NEXT: ret 1086 1087; GISEL: cmge.4s 1088; GISEL: bif.16b 1089 %tmp1neg = sub <4 x i32> zeroinitializer, %a 1090 %b = icmp sge <4 x i32> %a, zeroinitializer 1091 %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg 1092 ret <4 x i32> %abs 1093} 1094 
; 128-bit abs patterns: same negate/compare/select idiom as abspattern1-4,
; matched to abs.8h / abs.16b / abs.2d by SelectionDAG.
define <8 x i16> @abspattern5(<8 x i16> %a) nounwind {
; CHECK-LABEL: abspattern5:
; DAG: abs.8h
; DAG-NEXT: ret

; GISEL-DAG: cmgt.8h
; GISEL-DAG: neg.8h
; GISEL: bif.16b
  %tmp1neg = sub <8 x i16> zeroinitializer, %a
  %b = icmp sgt <8 x i16> %a, zeroinitializer
  %abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg
  ret <8 x i16> %abs
}

; Inverted form (slt, negated value in the "true" arm of the select), so
; GlobalISel selects bit rather than bif.
define <16 x i8> @abspattern6(<16 x i8> %a) nounwind {
; CHECK-LABEL: abspattern6:
; DAG: abs.16b
; DAG-NEXT: ret

; GISEL: cmgt.16b
; GISEL: bit.16b
  %tmp1neg = sub <16 x i8> zeroinitializer, %a
  %b = icmp slt <16 x i8> %a, zeroinitializer
  %abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a
  ret <16 x i8> %abs
}

; Non-strict compare (sle) must also be recognized as abs.
define <2 x i64> @abspattern7(<2 x i64> %a) nounwind {
; CHECK-LABEL: abspattern7:
; DAG: abs.2d
; DAG-NEXT: ret

; GISEL-DAG: neg.2d
; GISEL-DAG: cmge.2d
; GISEL: bit.16b
  %tmp1neg = sub <2 x i64> zeroinitializer, %a
  %b = icmp sle <2 x i64> %a, zeroinitializer
  %abs = select <2 x i1> %b, <2 x i64> %tmp1neg, <2 x i64> %a
  ret <2 x i64> %abs
}