; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s

; Codegen tests for AArch64 NEON absolute-value / absolute-difference
; instructions, checked in Apple NEON syntax (mnemonic.arrangement).
; Every function is fenced by a label directive on "<name>:" so that the
; instruction checks following it can only match inside that function.

;===----------------------------------------------------------------------===;
; Signed absolute difference long: zext(sabd(a, b)) should select sabdl.
;===----------------------------------------------------------------------===;

define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sabdl8h:
;CHECK: sabdl.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

define <4 x i32> @sabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sabdl4s:
;CHECK: sabdl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sabdl2d:
;CHECK: sabdl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}

; Same pattern, but both operands are the upper half of a 128-bit load
; (extracted with shufflevector); the "2" form sabdl2 is expected.

define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: sabdl2_8h:
;CHECK: sabdl2.8h
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sabdl2_4s:
;CHECK: sabdl2.4s
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sabdl2_2d:
;CHECK: sabdl2.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}

;===----------------------------------------------------------------------===;
; Unsigned absolute difference long: zext(uabd(a, b)) should select uabdl.
;===----------------------------------------------------------------------===;

define <8 x i16> @uabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uabdl8h:
;CHECK: uabdl.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

define <4 x i32> @uabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uabdl4s:
;CHECK: uabdl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uabdl2d:
;CHECK: uabdl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}

; High-half variants: uabdl2 is expected.

define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uabdl2_8h:
;CHECK: uabdl2.8h
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>

  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uabdl2_4s:
;CHECK: uabdl2.4s
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uabdl2_2d:
;CHECK: uabdl2.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}

;===----------------------------------------------------------------------===;
; Absolute difference written in generic IR (zext, sub, icmp/select abs) and
; then summed with a log2 shuffle/add reduction tree. Both halves of the wide
; vector are expected to use the widening instructions: uabdl2 and uabdl.
;===----------------------------------------------------------------------===;

define i16 @uabdl8h_log2_shuffle(<16 x i8>* %a, <16 x i8>* %b) {
; CHECK-LABEL: uabdl8h_log2_shuffle:
; CHECK: uabdl2.8h
; CHECK: uabdl.8h
  %aload = load <16 x i8>, <16 x i8>* %a, align 1
  %bload = load <16 x i8>, <16 x i8>* %b, align 1
  %aext = zext <16 x i8> %aload to <16 x i16>
  %bext = zext <16 x i8> %bload to <16 x i16>
  %abdiff = sub nsw <16 x i16> %aext, %bext
  %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer
  %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff
  %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff
  %rdx.shuf = shufflevector <16 x i16> %absel, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin1.rdx = add <16 x i16> %absel, %rdx.shuf
  %rdx.shufx = shufflevector <16 x i16> %bin1.rdx, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <16 x i16> %bin1.rdx, %rdx.shufx
  %rdx.shuf136 = shufflevector <16 x i16> %bin.rdx, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx137 = add <16 x i16> %bin.rdx, %rdx.shuf136
  %rdx.shuf138 = shufflevector <16 x i16> %bin.rdx137, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx139 = add <16 x i16> %bin.rdx137, %rdx.shuf138
  %reduced_v = extractelement <16 x i16> %bin.rdx139, i16 0
  ret i16 %reduced_v
}

define i32 @uabdl4s_log2_shuffle(<8 x i16>* %a, <8 x i16>* %b) {
; CHECK-LABEL: uabdl4s_log2_shuffle:
; CHECK: uabdl2.4s
; CHECK: uabdl.4s
  %aload = load <8 x i16>, <8 x i16>* %a, align 1
  %bload = load <8 x i16>, <8 x i16>* %b, align 1
  %aext = zext <8 x i16> %aload to <8 x i32>
  %bext = zext <8 x i16> %bload to <8 x i32>
  %abdiff = sub nsw <8 x i32> %aext, %bext
  %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
  %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
  %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
  %rdx.shuf = shufflevector <8 x i32> %absel, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %absel, %rdx.shuf
  %rdx.shuf136 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx137 = add <8 x i32> %bin.rdx, %rdx.shuf136
  %rdx.shuf138 = shufflevector <8 x i32> %bin.rdx137, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx139 = add <8 x i32> %bin.rdx137, %rdx.shuf138
  %reduced_v = extractelement <8 x i32> %bin.rdx139, i32 0
  ret i32 %reduced_v
}

; Note: %h is unused; it is kept so the function signature stays as it was.
define i64 @uabdl2d_log2_shuffle(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
; CHECK-LABEL: uabdl2d_log2_shuffle:
; CHECK: uabdl2.2d
; CHECK: uabdl.2d
  %aload = load <4 x i32>, <4 x i32>* %a, align 1
  %bload = load <4 x i32>, <4 x i32>* %b, align 1
  %aext = zext <4 x i32> %aload to <4 x i64>
  %bext = zext <4 x i32> %bload to <4 x i64>
  %abdiff = sub nsw <4 x i64> %aext, %bext
  %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
  %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
  %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
  %rdx.shuf136 = shufflevector <4 x i64> %absel, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx137 = add <4 x i64> %absel, %rdx.shuf136
  %rdx.shuf138 = shufflevector <4 x i64> %bin.rdx137, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx139 = add <4 x i64> %bin.rdx137, %rdx.shuf138
  %reduced_v = extractelement <4 x i64> %bin.rdx139, i16 0
  ret i64 %reduced_v
}

;===----------------------------------------------------------------------===;
; Vector floating-point absolute difference: the fabd intrinsic should select
; fabd with the matching arrangement.
;===----------------------------------------------------------------------===;

define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fabd_2s:
;CHECK: fabd.2s
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %tmp3 = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
  ret <2 x float> %tmp3
}

define <4 x float> @fabd_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: fabd_4s:
;CHECK: fabd.4s
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
  ret <4 x float> %tmp3
}

define <2 x double> @fabd_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: fabd_2d:
;CHECK: fabd.2d
  %tmp1 = load <2 x double>, <2 x double>* %A
  %tmp2 = load <2 x double>, <2 x double>* %B
  %tmp3 = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
  ret <2 x double> %tmp3
}

declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone

;===----------------------------------------------------------------------===;
; Signed absolute difference (non-widening): sabd intrinsic -> sabd.
;===----------------------------------------------------------------------===;

define <8 x i8> @sabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sabd_8b:
;CHECK: sabd.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @sabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: sabd_16b:
;CHECK: sabd.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @sabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sabd_4h:
;CHECK: sabd.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sabd_8h:
;CHECK: sabd.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sabd_2s:
;CHECK: sabd.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sabd_4s:
;CHECK: sabd.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

;===----------------------------------------------------------------------===;
; Unsigned absolute difference (non-widening): uabd intrinsic -> uabd.
;===----------------------------------------------------------------------===;

define <8 x i8> @uabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uabd_8b:
;CHECK: uabd.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @uabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uabd_16b:
;CHECK: uabd.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @uabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uabd_4h:
;CHECK: uabd.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @uabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uabd_8h:
;CHECK: uabd.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @uabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uabd_2s:
;CHECK: uabd.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @uabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uabd_4s:
;CHECK: uabd.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

;===----------------------------------------------------------------------===;
; sqabs intrinsic -> sqabs, one test per arrangement.
;===----------------------------------------------------------------------===;

define <8 x i8> @sqabs_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqabs_8b:
;CHECK: sqabs.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp3
}

define <16 x i8> @sqabs_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqabs_16b:
;CHECK: sqabs.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp3
}

define <4 x i16> @sqabs_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqabs_4h:
;CHECK: sqabs.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sqabs_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqabs_8h:
;CHECK: sqabs.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sqabs_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqabs_2s:
;CHECK: sqabs.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sqabs_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqabs_4s:
;CHECK: sqabs.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone

;===----------------------------------------------------------------------===;
; sqneg intrinsic -> sqneg, one test per arrangement.
;===----------------------------------------------------------------------===;

define <8 x i8> @sqneg_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqneg_8b:
;CHECK: sqneg.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp3
}

define <16 x i8> @sqneg_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqneg_16b:
;CHECK: sqneg.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp3
}

define <4 x i16> @sqneg_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqneg_4h:
;CHECK: sqneg.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sqneg_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqneg_8h:
;CHECK: sqneg.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sqneg_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqneg_2s:
;CHECK: sqneg.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sqneg_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqneg_4s:
;CHECK: sqneg.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone

;===----------------------------------------------------------------------===;
; abs intrinsic -> abs. Vector arrangements first, then the <1 x i64> and
; plain i64 forms, both of which are expected to use the scalar "abs d0, d0".
;===----------------------------------------------------------------------===;

define <8 x i8> @abs_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: abs_8b:
;CHECK: abs.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp3
}

define <16 x i8> @abs_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: abs_16b:
;CHECK: abs.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp3
}

define <4 x i16> @abs_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: abs_4h:
;CHECK: abs.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp3
}

define <8 x i16> @abs_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: abs_8h:
;CHECK: abs.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp3
}

define <2 x i32> @abs_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: abs_2s:
;CHECK: abs.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp3
}

define <4 x i32> @abs_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: abs_4s:
;CHECK: abs.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp3
}

define <1 x i64> @abs_1d(<1 x i64> %A) nounwind {
; CHECK-LABEL: abs_1d:
; CHECK: abs d0, d0
  %abs = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %A)
  ret <1 x i64> %abs
}

define i64 @abs_1d_honestly(i64 %A) nounwind {
; CHECK-LABEL: abs_1d_honestly:
; CHECK: abs d0, d0
  %abs = call i64 @llvm.aarch64.neon.abs.i64(i64 %A)
  ret i64 %abs
}

declare <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32>) nounwind readnone
declare <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64>) nounwind readnone
declare i64 @llvm.aarch64.neon.abs.i64(i64) nounwind readnone

;===----------------------------------------------------------------------===;
; Signed absolute difference and accumulate long:
; acc + zext(sabd(a, b)) should select sabal (sabal2 for high halves).
;===----------------------------------------------------------------------===;

define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: sabal8h:
;CHECK: sabal.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i16>, <8 x i16>* %C
  %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}

define <4 x i32> @sabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sabal4s:
;CHECK: sabal.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}

define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sabal2d:
;CHECK: sabal.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}

define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: sabal2_8h:
;CHECK: sabal2.8h
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = load <8 x i16>, <8 x i16>* %C
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}

define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sabal2_4s:
;CHECK: sabal2.4s
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}

define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sabal2_2d:
;CHECK: sabal2.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}

;===----------------------------------------------------------------------===;
; Unsigned absolute difference and accumulate long:
; acc + zext(uabd(a, b)) should select uabal (uabal2 for high halves).
;===----------------------------------------------------------------------===;

define <8 x i16> @uabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uabal8h:
;CHECK: uabal.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i16>, <8 x i16>* %C
  %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}

define <4 x i32> @uabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uabal4s:
;CHECK: uabal.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}

define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: uabal2d:
;CHECK: uabal.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}

define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uabal2_8h:
;CHECK: uabal2.8h
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = load <8 x i16>, <8 x i16>* %C
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}

define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uabal2_4s:
;CHECK: uabal2.4s
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}

define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: uabal2_2d:
;CHECK: uabal2.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}

;===----------------------------------------------------------------------===;
; Signed absolute difference and accumulate (non-widening):
; sabd(a, b) + acc should select saba.
;===----------------------------------------------------------------------===;

define <8 x i8> @saba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK-LABEL: saba_8b:
;CHECK: saba.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = load <8 x i8>, <8 x i8>* %C
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <16 x i8> @saba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
;CHECK-LABEL: saba_16b:
;CHECK: saba.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  %tmp4 = load <16 x i8>, <16 x i8>* %C
  %tmp5 = add <16 x i8> %tmp3, %tmp4
  ret <16 x i8> %tmp5
}

define <4 x i16> @saba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK-LABEL: saba_4h:
;CHECK: saba.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = load <4 x i16>, <4 x i16>* %C
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

define <8 x i16> @saba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: saba_8h:
;CHECK: saba.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  %tmp4 = load <8 x i16>, <8 x i16>* %C
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <2 x i32> @saba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK-LABEL: saba_2s:
;CHECK: saba.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = load <2 x i32>, <2 x i32>* %C
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}

define <4 x i32> @saba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: saba_4s:
;CHECK: saba.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  %tmp4 = load <4 x i32>, <4 x i32>* %C
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

;===----------------------------------------------------------------------===;
; Unsigned absolute difference and accumulate (non-widening):
; uabd(a, b) + acc should select uaba.
;===----------------------------------------------------------------------===;

define <8 x i8> @uaba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK-LABEL: uaba_8b:
;CHECK: uaba.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = load <8 x i8>, <8 x i8>* %C
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <16 x i8> @uaba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
;CHECK-LABEL: uaba_16b:
;CHECK: uaba.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  %tmp4 = load <16 x i8>, <16 x i8>* %C
  %tmp5 = add <16 x i8> %tmp3, %tmp4
  ret <16 x i8> %tmp5
}

define <4 x i16> @uaba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK-LABEL: uaba_4h:
;CHECK: uaba.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = load <4 x i16>, <4 x i16>* %C
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

define <8 x i16> @uaba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uaba_8h:
;CHECK: uaba.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  %tmp4 = load <8 x i16>, <8 x i16>* %C
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <2 x i32> @uaba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK-LABEL: uaba_2s:
;CHECK: uaba.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = load <2 x i32>, <2 x i32>* %C
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}

define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uaba_4s:
;CHECK: uaba.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  %tmp4 = load <4 x i32>, <4 x i32>* %C
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

; Scalar FABD
; The sisd.fabd intrinsics on float/double arguments are expected to select
; the scalar register forms "fabd s0, s0, s1" and "fabd d0, d0, d1".
define float @fabds(float %a, float %b) nounwind {
; CHECK-LABEL: fabds:
; CHECK: fabd s0, s0, s1
  %vabd.i = tail call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) nounwind
  ret float %vabd.i
}

define double @fabdd(double %a, double %b) nounwind {
; CHECK-LABEL: fabdd:
; CHECK: fabd d0, d0, d1
  %vabd.i = tail call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) nounwind
  ret double %vabd.i
}

declare double @llvm.aarch64.sisd.fabd.f64(double, double) nounwind readnone
declare float @llvm.aarch64.sisd.fabd.f32(float, float) nounwind readnone

;===----------------------------------------------------------------------===;
; High-half extract on one side, scalar splat (two insertelements) on the
; other. The "2" form is expected, and no ext.16b may appear before it.
;===----------------------------------------------------------------------===;

define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uabdl_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: uabdl2.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  %res1 = zext <2 x i32> %res to <2 x i64>
  ret <2 x i64> %res1
}

define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: sabdl_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: sabdl2.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  %res1 = zext <2 x i32> %res to <2 x i64>
  ret <2 x i64> %res1
}

;===----------------------------------------------------------------------===;
; Integer abs written as sub/icmp/select with different predicates
; (sge, sgt, slt, sle). Each is expected to become a single abs that is
; immediately followed by ret.
;===----------------------------------------------------------------------===;

define <2 x i32> @abspattern1(<2 x i32> %a) nounwind {
; CHECK-LABEL: abspattern1:
; CHECK: abs.2s
; CHECK-NEXT: ret
  %tmp1neg = sub <2 x i32> zeroinitializer, %a
  %b = icmp sge <2 x i32> %a, zeroinitializer
  %abs = select <2 x i1> %b, <2 x i32> %a, <2 x i32> %tmp1neg
  ret <2 x i32> %abs
}

define <4 x i16> @abspattern2(<4 x i16> %a) nounwind {
; CHECK-LABEL: abspattern2:
; CHECK: abs.4h
; CHECK-NEXT: ret
  %tmp1neg = sub <4 x i16> zeroinitializer, %a
  %b = icmp sgt <4 x i16> %a, zeroinitializer
  %abs = select <4 x i1> %b, <4 x i16> %a, <4 x i16> %tmp1neg
  ret <4 x i16> %abs
}

define <8 x i8> @abspattern3(<8 x i8> %a) nounwind {
; CHECK-LABEL: abspattern3:
; CHECK: abs.8b
; CHECK-NEXT: ret
  %tmp1neg = sub <8 x i8> zeroinitializer, %a
  %b = icmp slt <8 x i8> %a, zeroinitializer
  %abs = select <8 x i1> %b, <8 x i8> %tmp1neg, <8 x i8> %a
  ret <8 x i8> %abs
}

define <4 x i32> @abspattern4(<4 x i32> %a) nounwind {
; CHECK-LABEL: abspattern4:
; CHECK: abs.4s
; CHECK-NEXT: ret
  %tmp1neg = sub <4 x i32> zeroinitializer, %a
  %b = icmp sge <4 x i32> %a, zeroinitializer
  %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
  ret <4 x i32> %abs
}

define <8 x i16> @abspattern5(<8 x i16> %a) nounwind {
; CHECK-LABEL: abspattern5:
; CHECK: abs.8h
; CHECK-NEXT: ret
  %tmp1neg = sub <8 x i16> zeroinitializer, %a
  %b = icmp sgt <8 x i16> %a, zeroinitializer
  %abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg
  ret <8 x i16> %abs
}

define <16 x i8> @abspattern6(<16 x i8> %a) nounwind {
; CHECK-LABEL: abspattern6:
; CHECK: abs.16b
; CHECK-NEXT: ret
  %tmp1neg = sub <16 x i8> zeroinitializer, %a
  %b = icmp slt <16 x i8> %a, zeroinitializer
  %abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a
  ret <16 x i8> %abs
}

define <2 x i64> @abspattern7(<2 x i64> %a) nounwind {
; CHECK-LABEL: abspattern7:
; CHECK: abs.2d
; CHECK-NEXT: ret
  %tmp1neg = sub <2 x i64> zeroinitializer, %a
  %b = icmp sle <2 x i64> %a, zeroinitializer
  %abs = select <2 x i1> %b, <2 x i64> %tmp1neg, <2 x i64> %a
  ret <2 x i64> %abs
}