; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s

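; The llvm.arm.mve.vidup/vddup intrinsics model the MVE VIDUP/VDDUP
; (increment/decrement and duplicate) instructions, and viwdup/vdwdup model
; the wrapping VIWDUP/VDWDUP variants, which wrap the offset at the limit
; passed in the second scalar operand. Each intrinsic returns a
; { vector, i32 } pair: element 0 is the vector of offsets generated from the
; scalar start value, and element 1 is the updated start value, which the
; instruction writes back to its scalar register operand. The test names
; mirror the corresponding <arm_mve.h> ACLE intrinsics (vidupq_n_u8 and so
; on).
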
define arm_aapcs_vfpcc <16 x i8> @test_vidupq_n_u8(i32 %a) {
; CHECK-LABEL: test_vidupq_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vidup.u8 q0, r0, #4
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32 %a, i32 4)
  %1 = extractvalue { <16 x i8>, i32 } %0, 0
  ret <16 x i8> %1
}

define arm_aapcs_vfpcc <8 x i16> @test_vidupq_n_u16(i32 %a) {
; CHECK-LABEL: test_vidupq_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vidup.u16 q0, r0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32 %a, i32 1)
  %1 = extractvalue { <8 x i16>, i32 } %0, 0
  ret <8 x i16> %1
}

define arm_aapcs_vfpcc <4 x i32> @test_vidupq_n_u32(i32 %a) {
; CHECK-LABEL: test_vidupq_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vidup.u32 q0, r0, #4
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 %a, i32 4)
  %1 = extractvalue { <4 x i32>, i32 } %0, 0
  ret <4 x i32> %1
}

define arm_aapcs_vfpcc <16 x i8> @test_vddupq_n_u8(i32 %a) {
; CHECK-LABEL: test_vddupq_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vddup.u8 q0, r0, #2
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vddup.v16i8(i32 %a, i32 2)
  %1 = extractvalue { <16 x i8>, i32 } %0, 0
  ret <16 x i8> %1
}

define arm_aapcs_vfpcc <8 x i16> @test_vddupq_n_u16(i32 %a) {
; CHECK-LABEL: test_vddupq_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vddup.u16 q0, r0, #4
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vddup.v8i16(i32 %a, i32 4)
  %1 = extractvalue { <8 x i16>, i32 } %0, 0
  ret <8 x i16> %1
}

define arm_aapcs_vfpcc <4 x i32> @test_vddupq_n_u32(i32 %a) {
; CHECK-LABEL: test_vddupq_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vddup.u32 q0, r0, #2
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vddup.v4i32(i32 %a, i32 2)
  %1 = extractvalue { <4 x i32>, i32 } %0, 0
  ret <4 x i32> %1
}

define arm_aapcs_vfpcc <16 x i8> @test_viwdupq_n_u8(i32 %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    viwdup.u8 q0, r0, r1, #4
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.v16i8(i32 %a, i32 %b, i32 4)
  %1 = extractvalue { <16 x i8>, i32 } %0, 0
  ret <16 x i8> %1
}

define arm_aapcs_vfpcc <8 x i16> @test_viwdupq_n_u16(i32 %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    viwdup.u16 q0, r0, r1, #2
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.v8i16(i32 %a, i32 %b, i32 2)
  %1 = extractvalue { <8 x i16>, i32 } %0, 0
  ret <8 x i16> %1
}

define arm_aapcs_vfpcc <4 x i32> @test_viwdupq_n_u32(i32 %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    viwdup.u32 q0, r0, r1, #8
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32 %a, i32 %b, i32 8)
  %1 = extractvalue { <4 x i32>, i32 } %0, 0
  ret <4 x i32> %1
}

define arm_aapcs_vfpcc <16 x i8> @test_vdwdupq_n_u8(i32 %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vdwdup.u8 q0, r0, r1, #4
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.v16i8(i32 %a, i32 %b, i32 4)
  %1 = extractvalue { <16 x i8>, i32 } %0, 0
  ret <16 x i8> %1
}

define arm_aapcs_vfpcc <8 x i16> @test_vdwdupq_n_u16(i32 %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vdwdup.u16 q0, r0, r1, #8
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.v8i16(i32 %a, i32 %b, i32 8)
  %1 = extractvalue { <8 x i16>, i32 } %0, 0
  ret <8 x i16> %1
}

define arm_aapcs_vfpcc <4 x i32> @test_vdwdupq_n_u32(i32 %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vdwdup.u32 q0, r0, r1, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.v4i32(i32 %a, i32 %b, i32 1)
  %1 = extractvalue { <4 x i32>, i32 } %0, 0
  ret <4 x i32> %1
}

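; The _wb_ (writeback) tests use both elements of the returned pair: the
; updated offset (element 1) is stored back through the pointer argument, so
; the start value is loaded into a scalar register, advanced by the
; instruction, and stored back out.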
define arm_aapcs_vfpcc <16 x i8> @test_vidupq_wb_u8(i32* nocapture %a) {
; CHECK-LABEL: test_vidupq_wb_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vidup.u8 q0, r2, #8
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32 %0, i32 8)
  %2 = extractvalue { <16 x i8>, i32 } %1, 1
  store i32 %2, i32* %a, align 4
  %3 = extractvalue { <16 x i8>, i32 } %1, 0
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vidupq_wb_u16(i32* nocapture %a) {
; CHECK-LABEL: test_vidupq_wb_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vidup.u16 q0, r2, #1
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32 %0, i32 1)
  %2 = extractvalue { <8 x i16>, i32 } %1, 1
  store i32 %2, i32* %a, align 4
  %3 = extractvalue { <8 x i16>, i32 } %1, 0
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vidupq_wb_u32(i32* nocapture %a) {
; CHECK-LABEL: test_vidupq_wb_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vidup.u32 q0, r2, #4
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 %0, i32 4)
  %2 = extractvalue { <4 x i32>, i32 } %1, 1
  store i32 %2, i32* %a, align 4
  %3 = extractvalue { <4 x i32>, i32 } %1, 0
  ret <4 x i32> %3
}

define arm_aapcs_vfpcc <16 x i8> @test_vddupq_wb_u8(i32* nocapture %a) {
; CHECK-LABEL: test_vddupq_wb_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vddup.u8 q0, r2, #2
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vddup.v16i8(i32 %0, i32 2)
  %2 = extractvalue { <16 x i8>, i32 } %1, 1
  store i32 %2, i32* %a, align 4
  %3 = extractvalue { <16 x i8>, i32 } %1, 0
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vddupq_wb_u16(i32* nocapture %a) {
; CHECK-LABEL: test_vddupq_wb_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vddup.u16 q0, r2, #8
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vddup.v8i16(i32 %0, i32 8)
  %2 = extractvalue { <8 x i16>, i32 } %1, 1
  store i32 %2, i32* %a, align 4
  %3 = extractvalue { <8 x i16>, i32 } %1, 0
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vddupq_wb_u32(i32* nocapture %a) {
; CHECK-LABEL: test_vddupq_wb_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vddup.u32 q0, r2, #2
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vddup.v4i32(i32 %0, i32 2)
  %2 = extractvalue { <4 x i32>, i32 } %1, 1
  store i32 %2, i32* %a, align 4
  %3 = extractvalue { <4 x i32>, i32 } %1, 0
  ret <4 x i32> %3
}

define arm_aapcs_vfpcc <16 x i8> @test_vdwdupq_wb_u8(i32* nocapture %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_wb_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vdwdup.u8 q0, r2, r1, #4
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.v16i8(i32 %0, i32 %b, i32 4)
  %2 = extractvalue { <16 x i8>, i32 } %1, 1
  store i32 %2, i32* %a, align 4
  %3 = extractvalue { <16 x i8>, i32 } %1, 0
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vdwdupq_wb_u16(i32* nocapture %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_wb_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vdwdup.u16 q0, r2, r1, #4
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.v8i16(i32 %0, i32 %b, i32 4)
  %2 = extractvalue { <8 x i16>, i32 } %1, 1
  store i32 %2, i32* %a, align 4
  %3 = extractvalue { <8 x i16>, i32 } %1, 0
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <16 x i8> @test_viwdupq_wb_u8(i32* nocapture %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_wb_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    viwdup.u8 q0, r2, r1, #1
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = tail call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.v16i8(i32 %0, i32 %b, i32 1)
  %2 = extractvalue { <16 x i8>, i32 } %1, 1
  store i32 %2, i32* %a, align 4
  %3 = extractvalue { <16 x i8>, i32 } %1, 0
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_viwdupq_wb_u16(i32* nocapture %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_wb_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    viwdup.u16 q0, r2, r1, #1
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = tail call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.v8i16(i32 %0, i32 %b, i32 1)
  %2 = extractvalue { <8 x i16>, i32 } %1, 1
  store i32 %2, i32* %a, align 4
  %3 = extractvalue { <8 x i16>, i32 } %1, 0
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_viwdupq_wb_u32(i32* nocapture %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_wb_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    viwdup.u32 q0, r2, r1, #8
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32 %0, i32 %b, i32 8)
  %2 = extractvalue { <4 x i32>, i32 } %1, 1
  store i32 %2, i32* %a, align 4
  %3 = extractvalue { <4 x i32>, i32 } %1, 0
  ret <4 x i32> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vdwdupq_wb_u32(i32* nocapture %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_wb_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vdwdup.u32 q0, r2, r1, #2
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.v4i32(i32 %0, i32 %b, i32 2)
  %2 = extractvalue { <4 x i32>, i32 } %1, 1
  store i32 %2, i32* %a, align 4
  %3 = extractvalue { <4 x i32>, i32 } %1, 0
  ret <4 x i32> %3
}

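; The _m_n_ tests cover the predicated intrinsics: the i16 mask is moved into
; P0 with VMSR, and the T-suffixed form of the instruction executes under
; VPST, taking lanes from the %inactive operand wherever the corresponding
; predicate bit is clear.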
define arm_aapcs_vfpcc <16 x i8> @test_vidupq_m_n_u8(<16 x i8> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vidupt.u8 q0, r0, #8
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
  %2 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %a, i32 8, <16 x i1> %1)
  %3 = extractvalue { <16 x i8>, i32 } %2, 0
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vidupq_m_n_u16(<8 x i16> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vidupt.u16 q0, r0, #8
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %a, i32 8, <8 x i1> %1)
  %3 = extractvalue { <8 x i16>, i32 } %2, 0
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vidupq_m_n_u32(<4 x i32> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vidupt.u32 q0, r0, #2
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %a, i32 2, <4 x i1> %1)
  %3 = extractvalue { <4 x i32>, i32 } %2, 0
  ret <4 x i32> %3
}

define arm_aapcs_vfpcc <16 x i8> @test_vddupq_m_n_u8(<16 x i8> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vddupt.u8 q0, r0, #8
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
  %2 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vddup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %a, i32 8, <16 x i1> %1)
  %3 = extractvalue { <16 x i8>, i32 } %2, 0
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vddupq_m_n_u16(<8 x i16> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vddupt.u16 q0, r0, #2
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vddup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %a, i32 2, <8 x i1> %1)
  %3 = extractvalue { <8 x i16>, i32 } %2, 0
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vddupq_m_n_u32(<4 x i32> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vddupt.u32 q0, r0, #8
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vddup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %a, i32 8, <4 x i1> %1)
  %3 = extractvalue { <4 x i32>, i32 } %2, 0
  ret <4 x i32> %3
}

define arm_aapcs_vfpcc <16 x i8> @test_viwdupq_m_n_u8(<16 x i8> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    viwdupt.u8 q0, r0, r1, #8
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
  %2 = tail call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %a, i32 %b, i32 8, <16 x i1> %1)
  %3 = extractvalue { <16 x i8>, i32 } %2, 0
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_viwdupq_m_n_u16(<8 x i16> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    viwdupt.u16 q0, r0, r1, #8
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %a, i32 %b, i32 8, <8 x i1> %1)
  %3 = extractvalue { <8 x i16>, i32 } %2, 0
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_viwdupq_m_n_u32(<4 x i32> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    viwdupt.u32 q0, r0, r1, #4
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %a, i32 %b, i32 4, <4 x i1> %1)
  %3 = extractvalue { <4 x i32>, i32 } %2, 0
  ret <4 x i32> %3
}

define arm_aapcs_vfpcc <16 x i8> @test_vdwdupq_m_n_u8(<16 x i8> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vdwdupt.u8 q0, r0, r1, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
  %2 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %a, i32 %b, i32 1, <16 x i1> %1)
  %3 = extractvalue { <16 x i8>, i32 } %2, 0
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vdwdupq_m_n_u16(<8 x i16> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vdwdupt.u16 q0, r0, r1, #2
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %a, i32 %b, i32 2, <8 x i1> %1)
  %3 = extractvalue { <8 x i16>, i32 } %2, 0
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vdwdupq_m_n_u32(<4 x i32> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vdwdupt.u32 q0, r0, r1, #4
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %a, i32 %b, i32 4, <4 x i1> %1)
  %3 = extractvalue { <4 x i32>, i32 } %2, 0
  ret <4 x i32> %3
}

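; The _m_wb_ tests combine predication with writeback: the start value is
; loaded from memory, advanced by the predicated instruction, and stored
; back, while inactive vector lanes are again taken from %inactive.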
define arm_aapcs_vfpcc <16 x i8> @test_vidupq_m_wb_u8(<16 x i8> %inactive, i32* nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_wb_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vidupt.u8 q0, r2, #8
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %0, i32 8, <16 x i1> %2)
  %4 = extractvalue { <16 x i8>, i32 } %3, 1
  store i32 %4, i32* %a, align 4
  %5 = extractvalue { <16 x i8>, i32 } %3, 0
  ret <16 x i8> %5
}

define arm_aapcs_vfpcc <8 x i16> @test_vidupq_m_wb_u16(<8 x i16> %inactive, i32* nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_wb_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vidupt.u16 q0, r2, #2
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %0, i32 2, <8 x i1> %2)
  %4 = extractvalue { <8 x i16>, i32 } %3, 1
  store i32 %4, i32* %a, align 4
  %5 = extractvalue { <8 x i16>, i32 } %3, 0
  ret <8 x i16> %5
}

define arm_aapcs_vfpcc <4 x i32> @test_vidupq_m_wb_u32(<4 x i32> %inactive, i32* nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_wb_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vidupt.u32 q0, r2, #8
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %0, i32 8, <4 x i1> %2)
  %4 = extractvalue { <4 x i32>, i32 } %3, 1
  store i32 %4, i32* %a, align 4
  %5 = extractvalue { <4 x i32>, i32 } %3, 0
  ret <4 x i32> %5
}

define arm_aapcs_vfpcc <16 x i8> @test_vddupq_m_wb_u8(<16 x i8> %inactive, i32* nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_wb_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vddupt.u8 q0, r2, #1
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vddup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %0, i32 1, <16 x i1> %2)
  %4 = extractvalue { <16 x i8>, i32 } %3, 1
  store i32 %4, i32* %a, align 4
  %5 = extractvalue { <16 x i8>, i32 } %3, 0
  ret <16 x i8> %5
}

define arm_aapcs_vfpcc <8 x i16> @test_vddupq_m_wb_u16(<8 x i16> %inactive, i32* nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_wb_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vddupt.u16 q0, r2, #1
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vddup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %0, i32 1, <8 x i1> %2)
  %4 = extractvalue { <8 x i16>, i32 } %3, 1
  store i32 %4, i32* %a, align 4
  %5 = extractvalue { <8 x i16>, i32 } %3, 0
  ret <8 x i16> %5
}

define arm_aapcs_vfpcc <4 x i32> @test_vddupq_m_wb_u32(<4 x i32> %inactive, i32* nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_wb_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vddupt.u32 q0, r2, #4
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vddup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %0, i32 4, <4 x i1> %2)
  %4 = extractvalue { <4 x i32>, i32 } %3, 1
  store i32 %4, i32* %a, align 4
  %5 = extractvalue { <4 x i32>, i32 } %3, 0
  ret <4 x i32> %5
}

define arm_aapcs_vfpcc <16 x i8> @test_viwdupq_m_wb_u8(<16 x i8> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_wb_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr.w r12, [r0]
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    viwdupt.u8 q0, r12, r1, #8
; CHECK-NEXT:    str.w r12, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %0, i32 %b, i32 8, <16 x i1> %2)
  %4 = extractvalue { <16 x i8>, i32 } %3, 1
  store i32 %4, i32* %a, align 4
  %5 = extractvalue { <16 x i8>, i32 } %3, 0
  ret <16 x i8> %5
}

define arm_aapcs_vfpcc <8 x i16> @test_viwdupq_m_wb_u16(<8 x i16> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_wb_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr.w r12, [r0]
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    viwdupt.u16 q0, r12, r1, #8
; CHECK-NEXT:    str.w r12, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %0, i32 %b, i32 8, <8 x i1> %2)
  %4 = extractvalue { <8 x i16>, i32 } %3, 1
  store i32 %4, i32* %a, align 4
  %5 = extractvalue { <8 x i16>, i32 } %3, 0
  ret <8 x i16> %5
}

define arm_aapcs_vfpcc <4 x i32> @test_viwdupq_m_wb_u32(<4 x i32> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_wb_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr.w r12, [r0]
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    viwdupt.u32 q0, r12, r1, #4
; CHECK-NEXT:    str.w r12, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %0, i32 %b, i32 4, <4 x i1> %2)
  %4 = extractvalue { <4 x i32>, i32 } %3, 1
  store i32 %4, i32* %a, align 4
  %5 = extractvalue { <4 x i32>, i32 } %3, 0
  ret <4 x i32> %5
}

define arm_aapcs_vfpcc <16 x i8> @test_vdwdupq_m_wb_u8(<16 x i8> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_wb_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr.w r12, [r0]
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vdwdupt.u8 q0, r12, r1, #1
; CHECK-NEXT:    str.w r12, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %0, i32 %b, i32 1, <16 x i1> %2)
  %4 = extractvalue { <16 x i8>, i32 } %3, 1
  store i32 %4, i32* %a, align 4
  %5 = extractvalue { <16 x i8>, i32 } %3, 0
  ret <16 x i8> %5
}

define arm_aapcs_vfpcc <8 x i16> @test_vdwdupq_m_wb_u16(<8 x i16> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_wb_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr.w r12, [r0]
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vdwdupt.u16 q0, r12, r1, #4
; CHECK-NEXT:    str.w r12, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %0, i32 %b, i32 4, <8 x i1> %2)
  %4 = extractvalue { <8 x i16>, i32 } %3, 1
  store i32 %4, i32* %a, align 4
  %5 = extractvalue { <8 x i16>, i32 } %3, 0
  ret <8 x i16> %5
}

define arm_aapcs_vfpcc <4 x i32> @test_vdwdupq_m_wb_u32(<4 x i32> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_wb_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr.w r12, [r0]
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vdwdupt.u32 q0, r12, r1, #4
; CHECK-NEXT:    str.w r12, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %0, i32 %b, i32 4, <4 x i1> %2)
  %4 = extractvalue { <4 x i32>, i32 } %3, 1
  store i32 %4, i32* %a, align 4
  %5 = extractvalue { <4 x i32>, i32 } %3, 0
  ret <4 x i32> %5
}

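; llvm.arm.mve.pred.i2v converts the scalar mask into a predicate vector.
; The unpredicated dup intrinsics take (start, step), the wrapping forms take
; (start, limit, step), and each predicated form additionally takes the
; inactive vector first and the predicate last.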
declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)

declare { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32, i32)
declare { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32, i32)
declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32, i32)
declare { <16 x i8>, i32 } @llvm.arm.mve.vddup.v16i8(i32, i32)
declare { <8 x i16>, i32 } @llvm.arm.mve.vddup.v8i16(i32, i32)
declare { <4 x i32>, i32 } @llvm.arm.mve.vddup.v4i32(i32, i32)
declare { <16 x i8>, i32 } @llvm.arm.mve.viwdup.v16i8(i32, i32, i32)
declare { <8 x i16>, i32 } @llvm.arm.mve.viwdup.v8i16(i32, i32, i32)
declare { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32, i32, i32)
declare { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.v16i8(i32, i32, i32)
declare { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.v8i16(i32, i32, i32)
declare { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.v4i32(i32, i32, i32)
declare { <16 x i8>, i32 } @llvm.arm.mve.vidup.predicated.v16i8.v16i1(<16 x i8>, i32, i32, <16 x i1>)
declare { <8 x i16>, i32 } @llvm.arm.mve.vidup.predicated.v8i16.v8i1(<8 x i16>, i32, i32, <8 x i1>)
declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.predicated.v4i32.v4i1(<4 x i32>, i32, i32, <4 x i1>)
declare { <16 x i8>, i32 } @llvm.arm.mve.vddup.predicated.v16i8.v16i1(<16 x i8>, i32, i32, <16 x i1>)
declare { <8 x i16>, i32 } @llvm.arm.mve.vddup.predicated.v8i16.v8i1(<8 x i16>, i32, i32, <8 x i1>)
declare { <4 x i32>, i32 } @llvm.arm.mve.vddup.predicated.v4i32.v4i1(<4 x i32>, i32, i32, <4 x i1>)
declare { <16 x i8>, i32 } @llvm.arm.mve.viwdup.predicated.v16i8.v16i1(<16 x i8>, i32, i32, i32, <16 x i1>)
declare { <8 x i16>, i32 } @llvm.arm.mve.viwdup.predicated.v8i16.v8i1(<8 x i16>, i32, i32, i32, <8 x i1>)
declare { <4 x i32>, i32 } @llvm.arm.mve.viwdup.predicated.v4i32.v4i1(<4 x i32>, i32, i32, i32, <4 x i1>)
declare { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.predicated.v16i8.v16i1(<16 x i8>, i32, i32, i32, <16 x i1>)
declare { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.predicated.v8i16.v8i1(<8 x i16>, i32, i32, i32, <8 x i1>)
declare { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.predicated.v4i32.v4i1(<4 x i32>, i32, i32, i32, <4 x i1>)