; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s

define <8 x i8> @vsras8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vsras8:
;CHECK: vsra.s8
	%tmp1 = load <8 x i8>* %A
	%tmp2 = load <8 x i8>* %B
	%tmp3 = ashr <8 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
	%tmp4 = add <8 x i8> %tmp1, %tmp3
	ret <8 x i8> %tmp4
}

define <4 x i16> @vsras16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vsras16:
;CHECK: vsra.s16
	%tmp1 = load <4 x i16>* %A
	%tmp2 = load <4 x i16>* %B
	%tmp3 = ashr <4 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16 >
	%tmp4 = add <4 x i16> %tmp1, %tmp3
	ret <4 x i16> %tmp4
}

define <2 x i32> @vsras32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vsras32:
;CHECK: vsra.s32
	%tmp1 = load <2 x i32>* %A
	%tmp2 = load <2 x i32>* %B
	%tmp3 = ashr <2 x i32> %tmp2, < i32 32, i32 32 >
	%tmp4 = add <2 x i32> %tmp1, %tmp3
	ret <2 x i32> %tmp4
}

define <1 x i64> @vsras64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: vsras64:
;CHECK: vsra.s64
	%tmp1 = load <1 x i64>* %A
	%tmp2 = load <1 x i64>* %B
	%tmp3 = ashr <1 x i64> %tmp2, < i64 64 >
	%tmp4 = add <1 x i64> %tmp1, %tmp3
	ret <1 x i64> %tmp4
}

define <16 x i8> @vsraQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vsraQs8:
;CHECK: vsra.s8
	%tmp1 = load <16 x i8>* %A
	%tmp2 = load <16 x i8>* %B
	%tmp3 = ashr <16 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
	%tmp4 = add <16 x i8> %tmp1, %tmp3
	ret <16 x i8> %tmp4
}

define <8 x i16> @vsraQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vsraQs16:
;CHECK: vsra.s16
	%tmp1 = load <8 x i16>* %A
	%tmp2 = load <8 x i16>* %B
	%tmp3 = ashr <8 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
	%tmp4 = add <8 x i16> %tmp1, %tmp3
	ret <8 x i16> %tmp4
}

define <4 x i32> @vsraQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vsraQs32:
;CHECK: vsra.s32
	%tmp1 = load <4 x i32>* %A
	%tmp2 = load <4 x i32>* %B
	%tmp3 = ashr <4 x i32> %tmp2, < i32 32, i32 32, i32 32, i32 32 >
	%tmp4 = add <4 x i32> %tmp1, %tmp3
	ret <4 x i32> %tmp4
}

define <2 x i64> @vsraQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vsraQs64:
;CHECK: vsra.s64
	%tmp1 = load <2 x i64>* %A
	%tmp2 = load <2 x i64>* %B
	%tmp3 = ashr <2 x i64> %tmp2, < i64 64, i64 64 >
	%tmp4 = add <2 x i64> %tmp1, %tmp3
	ret <2 x i64> %tmp4
}

define <8 x i8> @vsrau8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vsrau8:
;CHECK: vsra.u8
	%tmp1 = load <8 x i8>* %A
	%tmp2 = load <8 x i8>* %B
	%tmp3 = lshr <8 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
	%tmp4 = add <8 x i8> %tmp1, %tmp3
	ret <8 x i8> %tmp4
}

define <4 x i16> @vsrau16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vsrau16:
;CHECK: vsra.u16
	%tmp1 = load <4 x i16>* %A
	%tmp2 = load <4 x i16>* %B
	%tmp3 = lshr <4 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16 >
	%tmp4 = add <4 x i16> %tmp1, %tmp3
	ret <4 x i16> %tmp4
}

define <2 x i32> @vsrau32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vsrau32:
;CHECK: vsra.u32
	%tmp1 = load <2 x i32>* %A
	%tmp2 = load <2 x i32>* %B
	%tmp3 = lshr <2 x i32> %tmp2, < i32 32, i32 32 >
	%tmp4 = add <2 x i32> %tmp1, %tmp3
	ret <2 x i32> %tmp4
}

define <1 x i64> @vsrau64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: vsrau64:
;CHECK: vsra.u64
	%tmp1 = load <1 x i64>* %A
	%tmp2 = load <1 x i64>* %B
	%tmp3 = lshr <1 x i64> %tmp2, < i64 64 >
	%tmp4 = add <1 x i64> %tmp1, %tmp3
	ret <1 x i64> %tmp4
}

define <16 x i8> @vsraQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vsraQu8:
;CHECK: vsra.u8
	%tmp1 = load <16 x i8>* %A
	%tmp2 = load <16 x i8>* %B
	%tmp3 = lshr <16 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
	%tmp4 = add <16 x i8> %tmp1, %tmp3
	ret <16 x i8> %tmp4
}

define <8 x i16> @vsraQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vsraQu16:
;CHECK: vsra.u16
	%tmp1 = load <8 x i16>* %A
	%tmp2 = load <8 x i16>* %B
	%tmp3 = lshr <8 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
	%tmp4 = add <8 x i16> %tmp1, %tmp3
	ret <8 x i16> %tmp4
}

define <4 x i32> @vsraQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vsraQu32:
;CHECK: vsra.u32
	%tmp1 = load <4 x i32>* %A
	%tmp2 = load <4 x i32>* %B
	%tmp3 = lshr <4 x i32> %tmp2, < i32 32, i32 32, i32 32, i32 32 >
	%tmp4 = add <4 x i32> %tmp1, %tmp3
	ret <4 x i32> %tmp4
}

define <2 x i64> @vsraQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vsraQu64:
;CHECK: vsra.u64
	%tmp1 = load <2 x i64>* %A
	%tmp2 = load <2 x i64>* %B
	%tmp3 = lshr <2 x i64> %tmp2, < i64 64, i64 64 >
	%tmp4 = add <2 x i64> %tmp1, %tmp3
	ret <2 x i64> %tmp4
}

define <8 x i8> @vrsras8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vrsras8:
;CHECK: vrsra.s8
	%tmp1 = load <8 x i8>* %A
	%tmp2 = load <8 x i8>* %B
	%tmp3 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp2, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
	%tmp4 = add <8 x i8> %tmp1, %tmp3
	ret <8 x i8> %tmp4
}

define <4 x i16> @vrsras16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vrsras16:
;CHECK: vrsra.s16
	%tmp1 = load <4 x i16>* %A
	%tmp2 = load <4 x i16>* %B
	%tmp3 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp2, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
	%tmp4 = add <4 x i16> %tmp1, %tmp3
	ret <4 x i16> %tmp4
}

define <2 x i32> @vrsras32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vrsras32:
;CHECK: vrsra.s32
	%tmp1 = load <2 x i32>* %A
	%tmp2 = load <2 x i32>* %B
	%tmp3 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp2, <2 x i32> < i32 -32, i32 -32 >)
	%tmp4 = add <2 x i32> %tmp1, %tmp3
	ret <2 x i32> %tmp4
}

define <1 x i64> @vrsras64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: vrsras64:
;CHECK: vrsra.s64
	%tmp1 = load <1 x i64>* %A
	%tmp2 = load <1 x i64>* %B
	%tmp3 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp2, <1 x i64> < i64 -64 >)
	%tmp4 = add <1 x i64> %tmp1, %tmp3
	ret <1 x i64> %tmp4
}

define <8 x i8> @vrsrau8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vrsrau8:
;CHECK: vrsra.u8
	%tmp1 = load <8 x i8>* %A
	%tmp2 = load <8 x i8>* %B
	%tmp3 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp2, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
	%tmp4 = add <8 x i8> %tmp1, %tmp3
	ret <8 x i8> %tmp4
}

define <4 x i16> @vrsrau16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vrsrau16:
;CHECK: vrsra.u16
	%tmp1 = load <4 x i16>* %A
	%tmp2 = load <4 x i16>* %B
	%tmp3 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp2, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
	%tmp4 = add <4 x i16> %tmp1, %tmp3
	ret <4 x i16> %tmp4
}

define <2 x i32> @vrsrau32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vrsrau32:
;CHECK: vrsra.u32
	%tmp1 = load <2 x i32>* %A
	%tmp2 = load <2 x i32>* %B
	%tmp3 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp2, <2 x i32> < i32 -32, i32 -32 >)
	%tmp4 = add <2 x i32> %tmp1, %tmp3
	ret <2 x i32> %tmp4
}

define <1 x i64> @vrsrau64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: vrsrau64:
;CHECK: vrsra.u64
	%tmp1 = load <1 x i64>* %A
	%tmp2 = load <1 x i64>* %B
	%tmp3 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp2, <1 x i64> < i64 -64 >)
	%tmp4 = add <1 x i64> %tmp1, %tmp3
	ret <1 x i64> %tmp4
}

define <16 x i8> @vrsraQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vrsraQs8:
;CHECK: vrsra.s8
	%tmp1 = load <16 x i8>* %A
	%tmp2 = load <16 x i8>* %B
	%tmp3 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp2, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
	%tmp4 = add <16 x i8> %tmp1, %tmp3
	ret <16 x i8> %tmp4
}

define <8 x i16> @vrsraQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vrsraQs16:
;CHECK: vrsra.s16
	%tmp1 = load <8 x i16>* %A
	%tmp2 = load <8 x i16>* %B
	%tmp3 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %tmp2, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
	%tmp4 = add <8 x i16> %tmp1, %tmp3
	ret <8 x i16> %tmp4
}

define <4 x i32> @vrsraQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vrsraQs32:
;CHECK: vrsra.s32
	%tmp1 = load <4 x i32>* %A
	%tmp2 = load <4 x i32>* %B
	%tmp3 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp2, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
	%tmp4 = add <4 x i32> %tmp1, %tmp3
	ret <4 x i32> %tmp4
}

define <2 x i64> @vrsraQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vrsraQs64:
;CHECK: vrsra.s64
	%tmp1 = load <2 x i64>* %A
	%tmp2 = load <2 x i64>* %B
	%tmp3 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp2, <2 x i64> < i64 -64, i64 -64 >)
	%tmp4 = add <2 x i64> %tmp1, %tmp3
	ret <2 x i64> %tmp4
}

define <16 x i8> @vrsraQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vrsraQu8:
;CHECK: vrsra.u8
	%tmp1 = load <16 x i8>* %A
	%tmp2 = load <16 x i8>* %B
	%tmp3 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp2, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
	%tmp4 = add <16 x i8> %tmp1, %tmp3
	ret <16 x i8> %tmp4
}

define <8 x i16> @vrsraQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vrsraQu16:
;CHECK: vrsra.u16
	%tmp1 = load <8 x i16>* %A
	%tmp2 = load <8 x i16>* %B
	%tmp3 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp2, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
	%tmp4 = add <8 x i16> %tmp1, %tmp3
	ret <8 x i16> %tmp4
}

define <4 x i32> @vrsraQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vrsraQu32:
;CHECK: vrsra.u32
	%tmp1 = load <4 x i32>* %A
	%tmp2 = load <4 x i32>* %B
	%tmp3 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp2, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
	%tmp4 = add <4 x i32> %tmp1, %tmp3
	ret <4 x i32> %tmp4
}

define <2 x i64> @vrsraQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vrsraQu64:
;CHECK: vrsra.u64
	%tmp1 = load <2 x i64>* %A
	%tmp2 = load <2 x i64>* %B
	%tmp3 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp2, <2 x i64> < i64 -64, i64 -64 >)
	%tmp4 = add <2 x i64> %tmp1, %tmp3
	ret <2 x i64> %tmp4
}

declare <8 x i8>  @llvm.arm.neon.vrshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone

declare <8 x i8>  @llvm.arm.neon.vrshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone

declare <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

declare <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone