; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s

; Instruction-selection tests for the AArch64 NEON vector-add family.
;
; The assembly is printed in Apple NEON syntax, so every expected mnemonic
; carries its arrangement as a suffix (for example "addhn.8b").  An explicit
; target triple is given on the command line above so that the result does not
; depend on the default triple of the machine running the test.
;
; Naming convention used by the functions below: the suffix of each function
; name is the arrangement of the instruction it expects, and a "2" in the name
; marks the variants that are expected to select the second-half ("...2")
; form of the instruction.

;-----------------------------------------------------------------------------
; addhn / addhn2 selected from the llvm.aarch64.neon.addhn.* intrinsics.
;-----------------------------------------------------------------------------

define <8 x i8> @addhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: addhn8b:
;CHECK: addhn.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i8> %tmp3
}

define <4 x i16> @addhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: addhn4h:
;CHECK: addhn.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i16> %tmp3
}

define <2 x i32> @addhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: addhn2s:
;CHECK: addhn.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i64>, <2 x i64>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
  ret <2 x i32> %tmp3
}

; The next three functions concatenate two narrowed results with a
; shufflevector; the expected output is the plain instruction immediately
; followed by its second-half form.

define <16 x i8> @addhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
;CHECK-LABEL: addhn2_16b:
;CHECK: addhn.8b
;CHECK-NEXT: addhn2.16b
  %vaddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %vaddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %res = shufflevector <8 x i8> %vaddhn2.i, <8 x i8> %vaddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %res
}

define <8 x i16> @addhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
;CHECK-LABEL: addhn2_8h:
;CHECK: addhn.4h
;CHECK-NEXT: addhn2.8h
  %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %vaddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %res = shufflevector <4 x i16> %vaddhn2.i, <4 x i16> %vaddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %res
}

define <4 x i32> @addhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
;CHECK-LABEL: addhn2_4s:
;CHECK: addhn.2s
;CHECK-NEXT: addhn2.4s
  %vaddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %vaddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %res = shufflevector <2 x i32> %vaddhn2.i, <2 x i32> %vaddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %res
}

declare <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone

;-----------------------------------------------------------------------------
; raddhn / raddhn2 selected from the llvm.aarch64.neon.raddhn.* intrinsics.
; Same structure as the addhn group above.
;-----------------------------------------------------------------------------

define <8 x i8> @raddhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: raddhn8b:
;CHECK: raddhn.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i8> %tmp3
}

define <4 x i16> @raddhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: raddhn4h:
;CHECK: raddhn.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i16> %tmp3
}

define <2 x i32> @raddhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: raddhn2s:
;CHECK: raddhn.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i64>, <2 x i64>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
  ret <2 x i32> %tmp3
}

define <16 x i8> @raddhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
;CHECK-LABEL: raddhn2_16b:
;CHECK: raddhn.8b
;CHECK-NEXT: raddhn2.16b
  %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %vraddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %res = shufflevector <8 x i8> %vraddhn2.i, <8 x i8> %vraddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %res
}

define <8 x i16> @raddhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
;CHECK-LABEL: raddhn2_8h:
;CHECK: raddhn.4h
;CHECK-NEXT: raddhn2.8h
  %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %vraddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %res = shufflevector <4 x i16> %vraddhn2.i, <4 x i16> %vraddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %res
}

define <4 x i32> @raddhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
;CHECK-LABEL: raddhn2_4s:
;CHECK: raddhn.2s
;CHECK-NEXT: raddhn2.4s
  %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %vraddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %res = shufflevector <2 x i32> %vraddhn2.i, <2 x i32> %vraddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %res
}

declare <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone

;-----------------------------------------------------------------------------
; saddl / saddl2: an add whose two operands are both sign-extended.
;-----------------------------------------------------------------------------

define <8 x i16> @saddl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: saddl8h:
;CHECK: saddl.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @saddl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: saddl4s:
;CHECK: saddl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @saddl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: saddl2d:
;CHECK: saddl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

; In the three functions below each operand is bitcast to <2 x i64>, element 1
; is extracted with a shufflevector, and the result is bitcast back to the
; narrow element type before being extended.  The directives require the
; second-half instruction on v0/v1 directly after the label and a ret directly
; after that, so no separate extraction instruction may be emitted.

define <8 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECK-LABEL: saddl2_8h:
; CHECK-NEXT: saddl2.8h v0, v0, v1
; CHECK-NEXT: ret
  %tmp = bitcast <16 x i8> %a to <2 x i64>
  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
  %vmovl.i.i.i = sext <8 x i8> %tmp1 to <8 x i16>
  %tmp2 = bitcast <16 x i8> %b to <2 x i64>
  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <8 x i8>
  %vmovl.i.i5.i = sext <8 x i8> %tmp3 to <8 x i16>
  %add.i = add <8 x i16> %vmovl.i.i.i, %vmovl.i.i5.i
  ret <8 x i16> %add.i
}

define <4 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind {
; CHECK-LABEL: saddl2_4s:
; CHECK-NEXT: saddl2.4s v0, v0, v1
; CHECK-NEXT: ret
  %tmp = bitcast <8 x i16> %a to <2 x i64>
  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
  %vmovl.i.i.i = sext <4 x i16> %tmp1 to <4 x i32>
  %tmp2 = bitcast <8 x i16> %b to <2 x i64>
  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <4 x i16>
  %vmovl.i.i5.i = sext <4 x i16> %tmp3 to <4 x i32>
  %add.i = add <4 x i32> %vmovl.i.i.i, %vmovl.i.i5.i
  ret <4 x i32> %add.i
}

define <2 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind {
; CHECK-LABEL: saddl2_2d:
; CHECK-NEXT: saddl2.2d v0, v0, v1
; CHECK-NEXT: ret
  %tmp = bitcast <4 x i32> %a to <2 x i64>
  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
  %vmovl.i.i.i = sext <2 x i32> %tmp1 to <2 x i64>
  %tmp2 = bitcast <4 x i32> %b to <2 x i64>
  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <2 x i32>
  %vmovl.i.i5.i = sext <2 x i32> %tmp3 to <2 x i64>
  %add.i = add <2 x i64> %vmovl.i.i.i, %vmovl.i.i5.i
  ret <2 x i64> %add.i
}

;-----------------------------------------------------------------------------
; uaddl / uaddl2: as the saddl group, with zero-extension instead of
; sign-extension.
;-----------------------------------------------------------------------------

define <8 x i16> @uaddl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uaddl8h:
;CHECK: uaddl.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @uaddl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uaddl4s:
;CHECK: uaddl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @uaddl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uaddl2d:
;CHECK: uaddl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}


define <8 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECK-LABEL: uaddl2_8h:
; CHECK-NEXT: uaddl2.8h v0, v0, v1
; CHECK-NEXT: ret
  %tmp = bitcast <16 x i8> %a to <2 x i64>
  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
  %vmovl.i.i.i = zext <8 x i8> %tmp1 to <8 x i16>
  %tmp2 = bitcast <16 x i8> %b to <2 x i64>
  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <8 x i8>
  %vmovl.i.i5.i = zext <8 x i8> %tmp3 to <8 x i16>
  %add.i = add <8 x i16> %vmovl.i.i.i, %vmovl.i.i5.i
  ret <8 x i16> %add.i
}

define <4 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind {
; CHECK-LABEL: uaddl2_4s:
; CHECK-NEXT: uaddl2.4s v0, v0, v1
; CHECK-NEXT: ret
  %tmp = bitcast <8 x i16> %a to <2 x i64>
  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
  %vmovl.i.i.i = zext <4 x i16> %tmp1 to <4 x i32>
  %tmp2 = bitcast <8 x i16> %b to <2 x i64>
  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <4 x i16>
  %vmovl.i.i5.i = zext <4 x i16> %tmp3 to <4 x i32>
  %add.i = add <4 x i32> %vmovl.i.i.i, %vmovl.i.i5.i
  ret <4 x i32> %add.i
}

define <2 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind {
; CHECK-LABEL: uaddl2_2d:
; CHECK-NEXT: uaddl2.2d v0, v0, v1
; CHECK-NEXT: ret
  %tmp = bitcast <4 x i32> %a to <2 x i64>
  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
  %vmovl.i.i.i = zext <2 x i32> %tmp1 to <2 x i64>
  %tmp2 = bitcast <4 x i32> %b to <2 x i64>
  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <2 x i32>
  %vmovl.i.i5.i = zext <2 x i32> %tmp3 to <2 x i64>
  %add.i = add <2 x i64> %vmovl.i.i.i, %vmovl.i.i5.i
  ret <2 x i64> %add.i
}

;-----------------------------------------------------------------------------
; uaddw / uaddw2: an add where only the second operand is zero-extended.
; The "2" variants extend the upper half of a full-width vector, selected
; with a shufflevector.
;-----------------------------------------------------------------------------

define <8 x i16> @uaddw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uaddw8h:
;CHECK: uaddw.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp4 = add <8 x i16> %tmp1, %tmp3
  ret <8 x i16> %tmp4
}

define <4 x i32> @uaddw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uaddw4s:
;CHECK: uaddw.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp4 = add <4 x i32> %tmp1, %tmp3
  ret <4 x i32> %tmp4
}

define <2 x i64> @uaddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uaddw2d:
;CHECK: uaddw.2d
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp4 = add <2 x i64> %tmp1, %tmp3
  ret <2 x i64> %tmp4
}

define <8 x i16> @uaddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uaddw2_8h:
;CHECK: uaddw2.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A

  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %ext2 = zext <8 x i8> %high2 to <8 x i16>

  %res = add <8 x i16> %tmp1, %ext2
  ret <8 x i16> %res
}

define <4 x i32> @uaddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uaddw2_4s:
;CHECK: uaddw2.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A

  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %ext2 = zext <4 x i16> %high2 to <4 x i32>

  %res = add <4 x i32> %tmp1, %ext2
  ret <4 x i32> %res
}

define <2 x i64> @uaddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uaddw2_2d:
;CHECK: uaddw2.2d
  %tmp1 = load <2 x i64>, <2 x i64>* %A

  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %ext2 = zext <2 x i32> %high2 to <2 x i64>

  %res = add <2 x i64> %tmp1, %ext2
  ret <2 x i64> %res
}

;-----------------------------------------------------------------------------
; saddw / saddw2: as the uaddw group, with sign-extension.
;-----------------------------------------------------------------------------

define <8 x i16> @saddw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: saddw8h:
;CHECK: saddw.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp4 = add <8 x i16> %tmp1, %tmp3
  ret <8 x i16> %tmp4
}

define <4 x i32> @saddw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: saddw4s:
;CHECK: saddw.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp4 = add <4 x i32> %tmp1, %tmp3
  ret <4 x i32> %tmp4
}

define <2 x i64> @saddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: saddw2d:
;CHECK: saddw.2d
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp4 = add <2 x i64> %tmp1, %tmp3
  ret <2 x i64> %tmp4
}

define <8 x i16> @saddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: saddw2_8h:
;CHECK: saddw2.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A

  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %ext2 = sext <8 x i8> %high2 to <8 x i16>

  %res = add <8 x i16> %tmp1, %ext2
  ret <8 x i16> %res
}

define <4 x i32> @saddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: saddw2_4s:
;CHECK: saddw2.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A

  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %ext2 = sext <4 x i16> %high2 to <4 x i32>

  %res = add <4 x i32> %tmp1, %ext2
  ret <4 x i32> %res
}

define <2 x i64> @saddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: saddw2_2d:
;CHECK: saddw2.2d
  %tmp1 = load <2 x i64>, <2 x i64>* %A

  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %ext2 = sext <2 x i32> %high2 to <2 x i64>

  %res = add <2 x i64> %tmp1, %ext2
  ret <2 x i64> %res
}

;-----------------------------------------------------------------------------
; saddlp selected from the llvm.aarch64.neon.saddlp.* intrinsics.
;-----------------------------------------------------------------------------

define <4 x i16> @saddlp4h(<8 x i8>* %A) nounwind {
;CHECK-LABEL: saddlp4h:
;CHECK: saddlp.4h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
  ret <4 x i16> %tmp3
}

define <2 x i32> @saddlp2s(<4 x i16>* %A) nounwind {
;CHECK-LABEL: saddlp2s:
;CHECK: saddlp.2s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
  ret <2 x i32> %tmp3
}

define <1 x i64> @saddlp1d(<2 x i32>* %A) nounwind {
;CHECK-LABEL: saddlp1d:
;CHECK: saddlp.1d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp3 = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %tmp1)
  ret <1 x i64> %tmp3
}

define <8 x i16> @saddlp8h(<16 x i8>* %A) nounwind {
;CHECK-LABEL: saddlp8h:
;CHECK: saddlp.8h
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
  ret <8 x i16> %tmp3
}

define <4 x i32> @saddlp4s(<8 x i16>* %A) nounwind {
;CHECK-LABEL: saddlp4s:
;CHECK: saddlp.4s
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
  ret <4 x i32> %tmp3
}

define <2 x i64> @saddlp2d(<4 x i32>* %A) nounwind {
;CHECK-LABEL: saddlp2d:
;CHECK: saddlp.2d
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
  ret <2 x i64> %tmp3
}

declare <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32>) nounwind readnone

declare <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32>) nounwind readnone

;-----------------------------------------------------------------------------
; uaddlp selected from the llvm.aarch64.neon.uaddlp.* intrinsics.
;-----------------------------------------------------------------------------

define <4 x i16> @uaddlp4h(<8 x i8>* %A) nounwind {
;CHECK-LABEL: uaddlp4h:
;CHECK: uaddlp.4h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
  ret <4 x i16> %tmp3
}

define <2 x i32> @uaddlp2s(<4 x i16>* %A) nounwind {
;CHECK-LABEL: uaddlp2s:
;CHECK: uaddlp.2s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
  ret <2 x i32> %tmp3
}

define <1 x i64> @uaddlp1d(<2 x i32>* %A) nounwind {
;CHECK-LABEL: uaddlp1d:
;CHECK: uaddlp.1d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp3 = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %tmp1)
  ret <1 x i64> %tmp3
}

define <8 x i16> @uaddlp8h(<16 x i8>* %A) nounwind {
;CHECK-LABEL: uaddlp8h:
;CHECK: uaddlp.8h
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
  ret <8 x i16> %tmp3
}

define <4 x i32> @uaddlp4s(<8 x i16>* %A) nounwind {
;CHECK-LABEL: uaddlp4s:
;CHECK: uaddlp.4s
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
  ret <4 x i32> %tmp3
}

define <2 x i64> @uaddlp2d(<4 x i32>* %A) nounwind {
;CHECK-LABEL: uaddlp2d:
;CHECK: uaddlp.2d
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
  ret <2 x i64> %tmp3
}

declare <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32>) nounwind readnone

declare <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32>) nounwind readnone

;-----------------------------------------------------------------------------
; sadalp: the saddlp intrinsic followed by a plain vector add of a second
; loaded operand is expected to fold into a single sadalp.
;-----------------------------------------------------------------------------

define <4 x i16> @sadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sadalp4h:
;CHECK: sadalp.4h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
  %tmp4 = load <4 x i16>, <4 x i16>* %B
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

define <2 x i32> @sadalp2s(<4 x i16>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sadalp2s:
;CHECK: sadalp.2s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
  %tmp4 = load <2 x i32>, <2 x i32>* %B
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}

define <8 x i16> @sadalp8h(<16 x i8>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sadalp8h:
;CHECK: sadalp.8h
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
  %tmp4 = load <8 x i16>, <8 x i16>* %B
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @sadalp4s(<8 x i16>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sadalp4s:
;CHECK: sadalp.4s
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
  %tmp4 = load <4 x i32>, <4 x i32>* %B
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @sadalp2d(<4 x i32>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: sadalp2d:
;CHECK: sadalp.2d
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
  %tmp4 = load <2 x i64>, <2 x i64>* %B
  %tmp5 = add <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

;-----------------------------------------------------------------------------
; uadalp: the uaddlp intrinsic followed by a plain vector add is expected to
; fold into a single uadalp.
;-----------------------------------------------------------------------------

define <4 x i16> @uadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uadalp4h:
;CHECK: uadalp.4h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
  %tmp4 = load <4 x i16>, <4 x i16>* %B
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

define <2 x i32> @uadalp2s(<4 x i16>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uadalp2s:
;CHECK: uadalp.2s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
  %tmp4 = load <2 x i32>, <2 x i32>* %B
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}

define <8 x i16> @uadalp8h(<16 x i8>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uadalp8h:
;CHECK: uadalp.8h
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
  %tmp4 = load <8 x i16>, <8 x i16>* %B
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @uadalp4s(<8 x i16>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uadalp4s:
;CHECK: uadalp.4s
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
  %tmp4 = load <4 x i32>, <4 x i32>* %B
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @uadalp2d(<4 x i32>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: uadalp2d:
;CHECK: uadalp.2d
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
  %tmp4 = load <2 x i64>, <2 x i64>* %B
  %tmp5 = add <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

;-----------------------------------------------------------------------------
; addp selected from the integer llvm.aarch64.neon.addp.* intrinsics.
;-----------------------------------------------------------------------------

define <8 x i8> @addp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: addp_8b:
;CHECK: addp.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @addp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: addp_16b:
;CHECK: addp.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @addp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: addp_4h:
;CHECK: addp.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @addp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: addp_8h:
;CHECK: addp.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @addp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: addp_2s:
;CHECK: addp.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @addp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: addp_4s:
;CHECK: addp.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

define <2 x i64> @addp_2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: addp_2d:
;CHECK: addp.2d
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i64>, <2 x i64>* %B
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
  ret <2 x i64> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

;-----------------------------------------------------------------------------
; faddp: the same addp intrinsic instantiated on floating-point vector types
; is expected to select faddp.
;-----------------------------------------------------------------------------

define <2 x float> @faddp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: faddp_2s:
;CHECK: faddp.2s
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %tmp3 = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
  ret <2 x float> %tmp3
}

define <4 x float> @faddp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: faddp_4s:
;CHECK: faddp.4s
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
  ret <4 x float> %tmp3
}

define <2 x double> @faddp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: faddp_2d:
;CHECK: faddp.2d
  %tmp1 = load <2 x double>, <2 x double>* %A
  %tmp2 = load <2 x double>, <2 x double>* %B
  %tmp3 = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
  ret <2 x double> %tmp3
}

declare <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double>, <2 x double>) nounwind readnone

;-----------------------------------------------------------------------------
; Widening add/sub where one operand is the upper half of a 128-bit vector
; and the other is a scalar inserted into both lanes of a <2 x i32>.  The
; second-half instruction must be selected and no "ext.16b" may appear
; between the function label and that instruction.
;
; The label patterns end in ':' like every other label in this file, so they
; can only match the actual symbol definition.
;-----------------------------------------------------------------------------

define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uaddl2_duprhs:
; CHECK-NOT: ext.16b
; CHECK: uaddl2.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
  %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>

  %res = add <2 x i64> %lhs.ext, %rhs.ext
  ret <2 x i64> %res
}

define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: saddl2_duplhs:
; CHECK-NOT: ext.16b
; CHECK: saddl2.2d
  %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
  %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1

  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
  %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>

  %res = add <2 x i64> %lhs.ext, %rhs.ext
  ret <2 x i64> %res
}

define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: usubl2_duprhs:
; CHECK-NOT: ext.16b
; CHECK: usubl2.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
  %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>

  %res = sub <2 x i64> %lhs.ext, %rhs.ext
  ret <2 x i64> %res
}

define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: ssubl2_duplhs:
; CHECK-NOT: ext.16b
; CHECK: ssubl2.2d
  %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
  %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1

  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
  %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>

  %res = sub <2 x i64> %lhs.ext, %rhs.ext
  ret <2 x i64> %res
}

;-----------------------------------------------------------------------------
; addhn / addhn2 recognised from plain IR rather than the intrinsic: an add,
; a logical shift right by half the element width, then a truncate to the
; half-width element type.  The "2" variants additionally concatenate the
; narrowed value after a caller-supplied low half.
;-----------------------------------------------------------------------------

define <8 x i8> @addhn8b_natural(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: addhn8b_natural:
;CHECK: addhn.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %sum = add <8 x i16> %tmp1, %tmp2
  %high_bits = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
  ret <8 x i8> %narrowed
}

define <4 x i16> @addhn4h_natural(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: addhn4h_natural:
;CHECK: addhn.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %sum = add <4 x i32> %tmp1, %tmp2
  %high_bits = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
  %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
  ret <4 x i16> %narrowed
}

define <2 x i32> @addhn2s_natural(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: addhn2s_natural:
;CHECK: addhn.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i64>, <2 x i64>* %B
  %sum = add <2 x i64> %tmp1, %tmp2
  %high_bits = lshr <2 x i64> %sum, <i64 32, i64 32>
  %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
  ret <2 x i32> %narrowed
}

define <16 x i8> @addhn2_16b_natural(<8 x i8> %low, <8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: addhn2_16b_natural:
;CHECK: addhn2.16b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %sum = add <8 x i16> %tmp1, %tmp2
  %high_bits = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
  %res = shufflevector <8 x i8> %low, <8 x i8> %narrowed, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %res
}

define <8 x i16> @addhn2_8h_natural(<4 x i16> %low, <4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: addhn2_8h_natural:
;CHECK: addhn2.8h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %sum = add <4 x i32> %tmp1, %tmp2
  %high_bits = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
  %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
  %res = shufflevector <4 x i16> %low, <4 x i16> %narrowed, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %res
}

define <4 x i32> @addhn2_4s_natural(<2 x i32> %low, <2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: addhn2_4s_natural:
;CHECK: addhn2.4s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i64>, <2 x i64>* %B
  %sum = add <2 x i64> %tmp1, %tmp2
  %high_bits = lshr <2 x i64> %sum, <i64 32, i64 32>
  %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
  %res = shufflevector <2 x i32> %low, <2 x i32> %narrowed, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %res
}

;-----------------------------------------------------------------------------
; subhn / subhn2 recognised from plain IR: same shape as the group above with
; sub in place of add.
;-----------------------------------------------------------------------------

define <8 x i8> @subhn8b_natural(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: subhn8b_natural:
;CHECK: subhn.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %diff = sub <8 x i16> %tmp1, %tmp2
  %high_bits = lshr <8 x i16> %diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
  ret <8 x i8> %narrowed
}

define <4 x i16> @subhn4h_natural(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: subhn4h_natural:
;CHECK: subhn.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %diff = sub <4 x i32> %tmp1, %tmp2
  %high_bits = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
  %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
  ret <4 x i16> %narrowed
}

define <2 x i32> @subhn2s_natural(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: subhn2s_natural:
;CHECK: subhn.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i64>, <2 x i64>* %B
  %diff = sub <2 x i64> %tmp1, %tmp2
  %high_bits = lshr <2 x i64> %diff, <i64 32, i64 32>
  %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
  ret <2 x i32> %narrowed
}

define <16 x i8> @subhn2_16b_natural(<8 x i8> %low, <8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: subhn2_16b_natural:
;CHECK: subhn2.16b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %diff = sub <8 x i16> %tmp1, %tmp2
  %high_bits = lshr <8 x i16> %diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
  %res = shufflevector <8 x i8> %low, <8 x i8> %narrowed, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %res
}

define <8 x i16> @subhn2_8h_natural(<4 x i16> %low, <4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: subhn2_8h_natural:
;CHECK: subhn2.8h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %diff = sub <4 x i32> %tmp1, %tmp2
  %high_bits = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
  %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
  %res = shufflevector <4 x i16> %low, <4 x i16> %narrowed, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %res
}

define <4 x i32> @subhn2_4s_natural(<2 x i32> %low, <2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: subhn2_4s_natural:
;CHECK: subhn2.4s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i64>, <2 x i64>* %B
  %diff = sub <2 x i64> %tmp1, %tmp2
  %high_bits = lshr <2 x i64> %diff, <i64 32, i64 32>
  %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
  %res = shufflevector <2 x i32> %low, <2 x i32> %narrowed, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %res
}