; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s

; Codegen tests for the Arm MVE ternary (multiply-accumulate) operations:
; vfma/vfmas/vfms, vmla/vmlas, vqdmlah/vqdmlash, vqrdmlah/vqrdmlash.
; The floating-point and integer mla forms are expressed as plain IR
; (llvm.fma / mul+add on a splat) and must be pattern-matched to the MVE
; instructions; the saturating forms use the llvm.arm.mve.* intrinsics.

;; --- vfmaq: vector fma(b, c, a) selects vfma qd, qn, qm -----------------

define arm_aapcs_vfpcc <8 x half> @test_vfmaq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; CHECK-LABEL: test_vfmaq_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vfma.f16 q0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %a)
  ret <8 x half> %0
}

define arm_aapcs_vfpcc <4 x float> @test_vfmaq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: test_vfmaq_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vfma.f32 q0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %c, <4 x float> %a)
  ret <4 x float> %0
}

;; --- vfmaq_n: fma with a splatted scalar multiplicand selects the
;; vector-by-scalar vfma qd, qn, rm form (the f16 scalar arrives coerced
;; inside a float argument, so it is extracted via bitcast+trunc first).

define arm_aapcs_vfpcc <8 x half> @test_vfmaq_n_f16(<8 x half> %a, <8 x half> %b, float %c.coerce) {
; CHECK-LABEL: test_vfmaq_n_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vfma.f16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast float %c.coerce to i32
  %tmp.0.extract.trunc = trunc i32 %0 to i16
  %1 = bitcast i16 %tmp.0.extract.trunc to half
  %.splatinsert = insertelement <8 x half> undef, half %1, i32 0
  %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
  %2 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %.splat, <8 x half> %a)
  ret <8 x half> %2
}

define arm_aapcs_vfpcc <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %c) {
; CHECK-LABEL: test_vfmaq_n_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vfma.f32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <4 x float> undef, float %c, i32 0
  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %.splat, <4 x float> %a)
  ret <4 x float> %0
}

;; --- vfmasq_n: fma(a, b, splat(c)) — splat is the addend — selects vfmas.

define arm_aapcs_vfpcc <8 x half> @test_vfmasq_n_f16(<8 x half> %a, <8 x half> %b, float %c.coerce) {
; CHECK-LABEL: test_vfmasq_n_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vfmas.f16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast float %c.coerce to i32
  %tmp.0.extract.trunc = trunc i32 %0 to i16
  %1 = bitcast i16 %tmp.0.extract.trunc to half
  %.splatinsert = insertelement <8 x half> undef, half %1, i32 0
  %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
  %2 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %.splat)
  ret <8 x half> %2
}

define arm_aapcs_vfpcc <4 x float> @test_vfmasq_n_f32(<4 x float> %a, <4 x float> %b, float %c) {
; CHECK-LABEL: test_vfmasq_n_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vfmas.f32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <4 x float> undef, float %c, i32 0
  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %.splat)
  ret <4 x float> %0
}

;; --- vfmsq: fma(b, fneg(c), a) selects vfms (note the swapped q2/q1
;; operand order in the selected instruction).

define arm_aapcs_vfpcc <8 x half> @test_vfmsq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; CHECK-LABEL: test_vfmsq_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vfms.f16 q0, q2, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = fneg <8 x half> %c
  %1 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %0, <8 x half> %a)
  ret <8 x half> %1
}

define arm_aapcs_vfpcc <4 x float> @test_vfmsq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: test_vfmsq_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vfms.f32 q0, q2, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = fneg <4 x float> %c
  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %0, <4 x float> %a)
  ret <4 x float> %1
}

;; --- vmlaq_n: splat(c)*b + a selects vmla (signed and unsigned lower to
;; the same .u instruction since integer mul/add are sign-agnostic).

define arm_aapcs_vfpcc <16 x i8> @test_vmlaq_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c) {
; CHECK-LABEL: test_vmlaq_n_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.u8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <16 x i8> undef, i8 %c, i32 0
  %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
  %0 = mul <16 x i8> %.splat, %b
  %1 = add <16 x i8> %0, %a
  ret <16 x i8> %1
}

define arm_aapcs_vfpcc <8 x i16> @test_vmlaq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) {
; CHECK-LABEL: test_vmlaq_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.u16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <8 x i16> undef, i16 %c, i32 0
  %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
  %0 = mul <8 x i16> %.splat, %b
  %1 = add <8 x i16> %0, %a
  ret <8 x i16> %1
}

define arm_aapcs_vfpcc <4 x i32> @test_vmlaq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) {
; CHECK-LABEL: test_vmlaq_n_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.u32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <4 x i32> undef, i32 %c, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %0 = mul <4 x i32> %.splat, %b
  %1 = add <4 x i32> %0, %a
  ret <4 x i32> %1
}

define arm_aapcs_vfpcc <16 x i8> @test_vmlaq_n_u8(<16 x i8> %a, <16 x i8> %b, i8 zeroext %c) {
; CHECK-LABEL: test_vmlaq_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.u8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <16 x i8> undef, i8 %c, i32 0
  %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
  %0 = mul <16 x i8> %.splat, %b
  %1 = add <16 x i8> %0, %a
  ret <16 x i8> %1
}

define arm_aapcs_vfpcc <8 x i16> @test_vmlaq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %c) {
; CHECK-LABEL: test_vmlaq_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.u16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <8 x i16> undef, i16 %c, i32 0
  %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
  %0 = mul <8 x i16> %.splat, %b
  %1 = add <8 x i16> %0, %a
  ret <8 x i16> %1
}

define arm_aapcs_vfpcc <4 x i32> @test_vmlaq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) {
; CHECK-LABEL: test_vmlaq_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.u32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <4 x i32> undef, i32 %c, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %0 = mul <4 x i32> %.splat, %b
  %1 = add <4 x i32> %0, %a
  ret <4 x i32> %1
}

;; --- vmlasq_n: b*a + splat(c) selects vmlas; the accumulator ends up in
;; q1 so an extra vmov q0, q1 moves the result to the return register.

define arm_aapcs_vfpcc <16 x i8> @test_vmlasq_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c) {
; CHECK-LABEL: test_vmlasq_n_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.u8 q1, q0, r0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = mul <16 x i8> %b, %a
  %.splatinsert = insertelement <16 x i8> undef, i8 %c, i32 0
  %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
  %1 = add <16 x i8> %.splat, %0
  ret <16 x i8> %1
}

define arm_aapcs_vfpcc <8 x i16> @test_vmlasq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) {
; CHECK-LABEL: test_vmlasq_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.u16 q1, q0, r0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = mul <8 x i16> %b, %a
  %.splatinsert = insertelement <8 x i16> undef, i16 %c, i32 0
  %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
  %1 = add <8 x i16> %.splat, %0
  ret <8 x i16> %1
}

define arm_aapcs_vfpcc <4 x i32> @test_vmlasq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) {
; CHECK-LABEL: test_vmlasq_n_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.u32 q1, q0, r0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = mul <4 x i32> %b, %a
  %.splatinsert = insertelement <4 x i32> undef, i32 %c, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %1 = add <4 x i32> %.splat, %0
  ret <4 x i32> %1
}

define arm_aapcs_vfpcc <16 x i8> @test_vmlasq_n_u8(<16 x i8> %a, <16 x i8> %b, i8 zeroext %c) {
; CHECK-LABEL: test_vmlasq_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.u8 q1, q0, r0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = mul <16 x i8> %b, %a
  %.splatinsert = insertelement <16 x i8> undef, i8 %c, i32 0
  %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
  %1 = add <16 x i8> %.splat, %0
  ret <16 x i8> %1
}

define arm_aapcs_vfpcc <8 x i16> @test_vmlasq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %c) {
; CHECK-LABEL: test_vmlasq_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.u16 q1, q0, r0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = mul <8 x i16> %b, %a
  %.splatinsert = insertelement <8 x i16> undef, i16 %c, i32 0
  %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
  %1 = add <8 x i16> %.splat, %0
  ret <8 x i16> %1
}

define arm_aapcs_vfpcc <4 x i32> @test_vmlasq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) {
; CHECK-LABEL: test_vmlasq_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.u32 q1, q0, r0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = mul <4 x i32> %b, %a
  %.splatinsert = insertelement <4 x i32> undef, i32 %c, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %1 = add <4 x i32> %.splat, %0
  ret <4 x i32> %1
}

;; --- vqdmlahq_n: saturating doubling multiply-accumulate intrinsics
;; (narrow scalars are passed zero-extended into the i32 intrinsic operand).

define arm_aapcs_vfpcc <16 x i8> @test_vqdmlahq_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c) {
; CHECK-LABEL: test_vqdmlahq_n_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmlah.s8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i8 %c to i32
  %1 = tail call <16 x i8> @llvm.arm.mve.vqdmlah.v16i8(<16 x i8> %a, <16 x i8> %b, i32 %0)
  ret <16 x i8> %1
}

define arm_aapcs_vfpcc <8 x i16> @test_vqdmlahq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) {
; CHECK-LABEL: test_vqdmlahq_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmlah.s16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %c to i32
  %1 = tail call <8 x i16> @llvm.arm.mve.vqdmlah.v8i16(<8 x i16> %a, <8 x i16> %b, i32 %0)
  ret <8 x i16> %1
}

define arm_aapcs_vfpcc <4 x i32> @test_vqdmlahq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) {
; CHECK-LABEL: test_vqdmlahq_n_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmlah.s32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call <4 x i32> @llvm.arm.mve.vqdmlah.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %c)
  ret <4 x i32> %0
}

;; --- vqdmlashq_n: as above but the scalar is the addend.

define arm_aapcs_vfpcc <16 x i8> @test_vqdmlashq_n_s8(<16 x i8> %m1, <16 x i8> %m2, i8 signext %add) {
; CHECK-LABEL: test_vqdmlashq_n_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmlash.s8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i8 %add to i32
  %1 = tail call <16 x i8> @llvm.arm.mve.vqdmlash.v16i8(<16 x i8> %m1, <16 x i8> %m2, i32 %0)
  ret <16 x i8> %1
}

define arm_aapcs_vfpcc <8 x i16> @test_vqdmlashq_n_s16(<8 x i16> %m1, <8 x i16> %m2, i16 signext %add) {
; CHECK-LABEL: test_vqdmlashq_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmlash.s16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %add to i32
  %1 = tail call <8 x i16> @llvm.arm.mve.vqdmlash.v8i16(<8 x i16> %m1, <8 x i16> %m2, i32 %0)
  ret <8 x i16> %1
}

define arm_aapcs_vfpcc <4 x i32> @test_vqdmlashq_n_s32(<4 x i32> %m1, <4 x i32> %m2, i32 %add) {
; CHECK-LABEL: test_vqdmlashq_n_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmlash.s32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call <4 x i32> @llvm.arm.mve.vqdmlash.v4i32(<4 x i32> %m1, <4 x i32> %m2, i32 %add)
  ret <4 x i32> %0
}

;; --- vqrdmlahq_n: rounding variants of vqdmlah.

define arm_aapcs_vfpcc <16 x i8> @test_vqrdmlahq_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c) {
; CHECK-LABEL: test_vqrdmlahq_n_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqrdmlah.s8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i8 %c to i32
  %1 = tail call <16 x i8> @llvm.arm.mve.vqrdmlah.v16i8(<16 x i8> %a, <16 x i8> %b, i32 %0)
  ret <16 x i8> %1
}

define arm_aapcs_vfpcc <8 x i16> @test_vqrdmlahq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) {
; CHECK-LABEL: test_vqrdmlahq_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqrdmlah.s16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %c to i32
  %1 = tail call <8 x i16> @llvm.arm.mve.vqrdmlah.v8i16(<8 x i16> %a, <8 x i16> %b, i32 %0)
  ret <8 x i16> %1
}

define arm_aapcs_vfpcc <4 x i32> @test_vqrdmlahq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) {
; CHECK-LABEL: test_vqrdmlahq_n_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqrdmlah.s32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call <4 x i32> @llvm.arm.mve.vqrdmlah.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %c)
  ret <4 x i32> %0
}

;; --- vqrdmlashq_n: rounding variants of vqdmlash.

define arm_aapcs_vfpcc <16 x i8> @test_vqrdmlashq_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c) {
; CHECK-LABEL: test_vqrdmlashq_n_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqrdmlash.s8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i8 %c to i32
  %1 = tail call <16 x i8> @llvm.arm.mve.vqrdmlash.v16i8(<16 x i8> %a, <16 x i8> %b, i32 %0)
  ret <16 x i8> %1
}

define arm_aapcs_vfpcc <8 x i16> @test_vqrdmlashq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) {
; CHECK-LABEL: test_vqrdmlashq_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqrdmlash.s16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %c to i32
  %1 = tail call <8 x i16> @llvm.arm.mve.vqrdmlash.v8i16(<8 x i16> %a, <8 x i16> %b, i32 %0)
  ret <8 x i16> %1
}

define arm_aapcs_vfpcc <4 x i32> @test_vqrdmlashq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) {
; CHECK-LABEL: test_vqrdmlashq_n_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqrdmlash.s32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call <4 x i32> @llvm.arm.mve.vqrdmlash.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %c)
  ret <4 x i32> %0
}

;; --- Predicated vfmaq: the predicate arrives as an i16, is moved into p0
;; via vmsr, and the fma.predicated intrinsic selects vfmat under a vpst.

define arm_aapcs_vfpcc <8 x half> @test_vfmaq_m_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i16 zeroext %p) {
; CHECK-LABEL: test_vfmaq_m_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmat.f16 q0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> %b, <8 x half> %c, <8 x half> %a, <8 x i1> %1)
  ret <8 x half> %2
}

define arm_aapcs_vfpcc <4 x float> @test_vfmaq_m_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, i16 zeroext %p) {
; CHECK-LABEL: test_vfmaq_m_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmat.f32 q0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %c, <4 x float> %a, <4 x i1> %1)
  ret <4 x float> %2
}

;; --- Predicated vfmaq_n / vfmasq_n: scalar-splat FMA under a vpst; the
;; predicate uses r0 so the scalar is extracted into r1.

define arm_aapcs_vfpcc <8 x half> @test_vfmaq_m_n_f16(<8 x half> %a, <8 x half> %b, float %c.coerce, i16 zeroext %p) {
; CHECK-LABEL: test_vfmaq_m_n_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmat.f16 q0, q1, r1
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast float %c.coerce to i32
  %tmp.0.extract.trunc = trunc i32 %0 to i16
  %1 = bitcast i16 %tmp.0.extract.trunc to half
  %.splatinsert = insertelement <8 x half> undef, half %1, i32 0
  %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
  %2 = zext i16 %p to i32
  %3 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
  %4 = tail call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> %b, <8 x half> %.splat, <8 x half> %a, <8 x i1> %3)
  ret <8 x half> %4
}

define arm_aapcs_vfpcc <4 x float> @test_vfmaq_m_n_f32(<4 x float> %a, <4 x float> %b, float %c, i16 zeroext %p) {
; CHECK-LABEL: test_vfmaq_m_n_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmat.f32 q0, q1, r1
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <4 x float> undef, float %c, i32 0
  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %.splat, <4 x float> %a, <4 x i1> %1)
  ret <4 x float> %2
}

define arm_aapcs_vfpcc <8 x half> @test_vfmasq_m_n_f16(<8 x half> %a, <8 x half> %b, float %c.coerce, i16 zeroext %p) {
; CHECK-LABEL: test_vfmasq_m_n_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmast.f16 q0, q1, r1
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast float %c.coerce to i32
  %tmp.0.extract.trunc = trunc i32 %0 to i16
  %1 = bitcast i16 %tmp.0.extract.trunc to half
  %.splatinsert = insertelement <8 x half> undef, half %1, i32 0
  %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
  %2 = zext i16 %p to i32
  %3 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
  %4 = tail call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x half> %.splat, <8 x i1> %3)
  ret <8 x half> %4
}

define arm_aapcs_vfpcc <4 x float> @test_vfmasq_m_n_f32(<4 x float> %a, <4 x float> %b, float %c, i16 zeroext %p) {
; CHECK-LABEL: test_vfmasq_m_n_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmast.f32 q0, q1, r1
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <4 x float> undef, float %c, i32 0
  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x float> %.splat, <4 x i1> %1)
  ret <4 x float> %2
}

;; --- Predicated vfmsq: fneg on the multiplicand folds into vfmst.

define arm_aapcs_vfpcc <8 x half> @test_vfmsq_m_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i16 zeroext %p) {
; CHECK-LABEL: test_vfmsq_m_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmst.f16 q0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %0 = fneg <8 x half> %c
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> %b, <8 x half> %0, <8 x half> %a, <8 x i1> %2)
  ret <8 x half> %3
}

define arm_aapcs_vfpcc <4 x float> @test_vfmsq_m_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, i16 zeroext %p) {
; CHECK-LABEL: test_vfmsq_m_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmst.f32 q0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %0 = fneg <4 x float> %c
  %1 = zext i16 %p to i32
  %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
  %3 = tail call <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %0, <4 x float> %a, <4 x i1> %2)
  ret <4 x float> %3
}

;; --- Predicated vmlaq_n via the vmla.n.predicated intrinsic.

define arm_aapcs_vfpcc <16 x i8> @test_vmlaq_m_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vmlaq_m_n_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmlat.u8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i8 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call <16 x i8> @llvm.arm.mve.vmla.n.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 %0, <16 x i1> %2)
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vmlaq_m_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vmlaq_m_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmlat.u16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call <8 x i16> @llvm.arm.mve.vmla.n.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 %0, <8 x i1> %2)
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vmlaq_m_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c, i16 zeroext %p) {
; CHECK-LABEL: test_vmlaq_m_n_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmlat.u32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %1)
  ret <4 x i32> %2
}

define arm_aapcs_vfpcc <16 x i8> @test_vmlaq_m_n_u8(<16 x i8> %a, <16 x i8> %b, i8 zeroext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vmlaq_m_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmlat.u8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i8 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call <16 x i8> @llvm.arm.mve.vmla.n.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 %0, <16 x i1> %2)
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vmlaq_m_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vmlaq_m_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmlat.u16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call <8 x i16> @llvm.arm.mve.vmla.n.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 %0, <8 x i1> %2)
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vmlaq_m_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c, i16 zeroext %p) {
; CHECK-LABEL: test_vmlaq_m_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmlat.u32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %1)
  ret <4 x i32> %2
}

;; --- Predicated vmlasq_n via the vmlas.n.predicated intrinsic.

define arm_aapcs_vfpcc <16 x i8> @test_vmlasq_m_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vmlasq_m_n_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmlast.u8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i8 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call <16 x i8> @llvm.arm.mve.vmlas.n.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 %0, <16 x i1> %2)
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vmlasq_m_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vmlasq_m_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmlast.u16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call <8 x i16> @llvm.arm.mve.vmlas.n.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 %0, <8 x i1> %2)
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vmlasq_m_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c, i16 zeroext %p) {
; CHECK-LABEL: test_vmlasq_m_n_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmlast.u32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %1)
  ret <4 x i32> %2
}

define arm_aapcs_vfpcc <16 x i8> @test_vmlasq_m_n_u8(<16 x i8> %a, <16 x i8> %b, i8 zeroext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vmlasq_m_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmlast.u8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i8 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call <16 x i8> @llvm.arm.mve.vmlas.n.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 %0, <16 x i1> %2)
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vmlasq_m_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vmlasq_m_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmlast.u16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call <8 x i16> @llvm.arm.mve.vmlas.n.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 %0, <8 x i1> %2)
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vmlasq_m_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c, i16 zeroext %p) {
; CHECK-LABEL: test_vmlasq_m_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmlast.u32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %1)
  ret <4 x i32> %2
}

;; --- Predicated saturating doubling multiply-accumulate intrinsics.

define arm_aapcs_vfpcc <16 x i8> @test_vqdmlahq_m_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vqdmlahq_m_n_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vqdmlaht.s8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i8 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call <16 x i8> @llvm.arm.mve.vqdmlah.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 %0, <16 x i1> %2)
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vqdmlahq_m_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vqdmlahq_m_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vqdmlaht.s16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call <8 x i16> @llvm.arm.mve.vqdmlah.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 %0, <8 x i1> %2)
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vqdmlahq_m_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c, i16 zeroext %p) {
; CHECK-LABEL: test_vqdmlahq_m_n_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vqdmlaht.s32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x i32> @llvm.arm.mve.vqdmlah.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %1)
  ret <4 x i32> %2
}

define arm_aapcs_vfpcc <16 x i8> @test_vqdmlashq_m_n_s8(<16 x i8> %m1, <16 x i8> %m2, i8 signext %add, i16 zeroext %p) {
; CHECK-LABEL: test_vqdmlashq_m_n_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vqdmlasht.s8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i8 %add to i32
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call <16 x i8> @llvm.arm.mve.vqdmlash.predicated.v16i8.v16i1(<16 x i8> %m1, <16 x i8> %m2, i32 %0, <16 x i1> %2)
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vqdmlashq_m_n_s16(<8 x i16> %m1, <8 x i16> %m2, i16 signext %add, i16 zeroext %p) {
; CHECK-LABEL: test_vqdmlashq_m_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vqdmlasht.s16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %add to i32
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call <8 x i16> @llvm.arm.mve.vqdmlash.predicated.v8i16.v8i1(<8 x i16> %m1, <8 x i16> %m2, i32 %0, <8 x i1> %2)
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vqdmlashq_m_n_s32(<4 x i32> %m1, <4 x i32> %m2, i32 %add, i16 zeroext %p) {
; CHECK-LABEL: test_vqdmlashq_m_n_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vqdmlasht.s32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x i32> @llvm.arm.mve.vqdmlash.predicated.v4i32.v4i1(<4 x i32> %m1, <4 x i32> %m2, i32 %add, <4 x i1> %1)
  ret <4 x i32> %2
}

;; --- Predicated rounding variants.

define arm_aapcs_vfpcc <16 x i8> @test_vqrdmlahq_m_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vqrdmlahq_m_n_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vqrdmlaht.s8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i8 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call <16 x i8> @llvm.arm.mve.vqrdmlah.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 %0, <16 x i1> %2)
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vqrdmlahq_m_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vqrdmlahq_m_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vqrdmlaht.s16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call <8 x i16> @llvm.arm.mve.vqrdmlah.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 %0, <8 x i1> %2)
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vqrdmlahq_m_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c, i16 zeroext %p) {
; CHECK-LABEL: test_vqrdmlahq_m_n_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vqrdmlaht.s32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x i32> @llvm.arm.mve.vqrdmlah.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %1)
  ret <4 x i32> %2
}

define arm_aapcs_vfpcc <16 x i8> @test_vqrdmlashq_m_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vqrdmlashq_m_n_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vqrdmlasht.s8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i8 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call <16 x i8> @llvm.arm.mve.vqrdmlash.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 %0, <16 x i1> %2)
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vqrdmlashq_m_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vqrdmlashq_m_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vqrdmlasht.s16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call <8 x i16> @llvm.arm.mve.vqrdmlash.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 %0, <8 x i1> %2)
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vqrdmlashq_m_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c, i16 zeroext %p) {
; CHECK-LABEL: test_vqrdmlashq_m_n_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vqrdmlasht.s32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x i32> @llvm.arm.mve.vqrdmlash.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %1)
  ret <4 x i32> %2
}

;; Intrinsic declarations used by the tests above.

declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)

declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x half>, <8 x i1>)
declare <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x float>, <4 x i1>)
declare <16 x i8> @llvm.arm.mve.vmla.n.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>)
declare <8 x i16> @llvm.arm.mve.vmla.n.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>)
declare <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>)
declare <16 x i8> @llvm.arm.mve.vmlas.n.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>)
declare <8 x i16> @llvm.arm.mve.vmlas.n.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>)
declare <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>)
declare <16 x i8> @llvm.arm.mve.vqdmlah.v16i8(<16 x i8>, <16 x i8>, i32)
declare <8 x i16> @llvm.arm.mve.vqdmlah.v8i16(<8 x i16>, <8 x i16>, i32)
declare <4 x i32> @llvm.arm.mve.vqdmlah.v4i32(<4 x i32>, <4 x i32>, i32)
declare <16 x i8> @llvm.arm.mve.vqdmlash.v16i8(<16 x i8>, <16 x i8>, i32)
declare <8 x i16> @llvm.arm.mve.vqdmlash.v8i16(<8 x i16>, <8 x i16>, i32)
declare <4 x i32> @llvm.arm.mve.vqdmlash.v4i32(<4 x i32>, <4 x i32>, i32)
declare <16 x i8> @llvm.arm.mve.vqrdmlah.v16i8(<16 x i8>, <16 x i8>, i32)
declare <8 x i16> @llvm.arm.mve.vqrdmlah.v8i16(<8 x i16>, <8 x i16>, i32)
declare <4 x i32> @llvm.arm.mve.vqrdmlah.v4i32(<4 x i32>, <4 x i32>, i32)
declare <16 x i8> @llvm.arm.mve.vqrdmlash.v16i8(<16 x i8>, <16 x i8>, i32)
declare <8 x i16> @llvm.arm.mve.vqrdmlash.v8i16(<8 x i16>, <8 x i16>, i32)
declare <4 x i32> @llvm.arm.mve.vqrdmlash.v4i32(<4 x i32>, <4 x i32>, i32)
declare <16 x i8> @llvm.arm.mve.vqdmlah.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>)
declare <8 x i16> @llvm.arm.mve.vqdmlah.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>)
declare <4 x i32> @llvm.arm.mve.vqdmlah.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>)
declare <16 x i8> @llvm.arm.mve.vqdmlash.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>)
declare <8 x i16> @llvm.arm.mve.vqdmlash.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>)
declare <4 x i32> @llvm.arm.mve.vqdmlash.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>)
declare <16 x i8> @llvm.arm.mve.vqrdmlah.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>)
declare <8 x i16> @llvm.arm.mve.vqrdmlah.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>)
declare <4 x i32> @llvm.arm.mve.vqrdmlah.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>)
declare <16 x i8> @llvm.arm.mve.vqrdmlash.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>)
declare <8 x i16> @llvm.arm.mve.vqrdmlash.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>)
declare <4 x i32> @llvm.arm.mve.vqrdmlash.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>)