; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft -mattr=+neon -verify-machineinstrs | FileCheck %s

; Splat (vdup) selection tests for ARM NEON. The soft-float ABI is selected on
; the command line above, and the expected output moves each vector result
; back into r0-r3 with vmov before returning.

; Splat of a scalar argument built with a full insertelement chain (every lane
; of the result is written with %A). 64-bit results: one vdup.N d16, r0.

define <8 x i8> @v_dup8(i8 %A) nounwind {
; CHECK-LABEL: v_dup8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.8 d16, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
  %tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
  %tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
  %tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3
  %tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4
  %tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5
  %tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6
  %tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7
  ret <8 x i8> %tmp8
}

define <4 x i16> @v_dup16(i16 %A) nounwind {
; CHECK-LABEL: v_dup16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.16 d16, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
  %tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
  %tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
  %tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3
  ret <4 x i16> %tmp4
}

define <2 x i32> @v_dup32(i32 %A) nounwind {
; CHECK-LABEL: v_dup32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 d16, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
  %tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
  ret <2 x i32> %tmp2
}

define <2 x float> @v_dupfloat(float %A) nounwind {
; CHECK-LABEL: v_dupfloat:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 d16, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
  %tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
  ret <2 x float> %tmp2
}

; Same insertelement-chain splats with 128-bit results: one vdup.N q8, r0,
; and the result is returned through both d16 and d17.

define <16 x i8> @v_dupQ8(i8 %A) nounwind {
; CHECK-LABEL: v_dupQ8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.8 q8, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
  %tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
  %tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
  %tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3
  %tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4
  %tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5
  %tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6
  %tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7
  %tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8
  %tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9
  %tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10
  %tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11
  %tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12
  %tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13
  %tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14
  %tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15
  ret <16 x i8> %tmp16
}

define <8 x i16> @v_dupQ16(i16 %A) nounwind {
; CHECK-LABEL: v_dupQ16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.16 q8, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
  %tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
  %tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
  %tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3
  %tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4
  %tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5
  %tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6
  %tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7
  ret <8 x i16> %tmp8
}

define <4 x i32> @v_dupQ32(i32 %A) nounwind {
; CHECK-LABEL: v_dupQ32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 q8, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
  %tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
  %tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
  %tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3
  ret <4 x i32> %tmp4
}

define <4 x float> @v_dupQfloat(float %A) nounwind {
; CHECK-LABEL: v_dupQfloat:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 q8, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
  %tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
  %tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
  %tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3
  ret <4 x float> %tmp4
}

; Check to make sure it works with shuffles, too.
; Here the splat is written as one insertelement into lane 0 followed by a
; shufflevector with an all-zero mask; the expected output is the same single
; vdup from r0 as in the insertelement-chain tests above.

define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
; CHECK-LABEL: v_shuffledup8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.8 d16, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
  ret <8 x i8> %tmp2
}

define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
; CHECK-LABEL: v_shuffledup16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.16 d16, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %tmp2
}

define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
; CHECK-LABEL: v_shuffledup32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 d16, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0
  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
  ret <2 x i32> %tmp2
}

define <2 x float> @v_shuffledupfloat(float %A) nounwind {
; CHECK-LABEL: v_shuffledupfloat:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 d16, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = insertelement <2 x float> undef, float %A, i32 0
  %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
  ret <2 x float> %tmp2
}

define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
; CHECK-LABEL: v_shuffledupQ8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.8 q8, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %tmp2
}

define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
; CHECK-LABEL: v_shuffledupQ16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.16 q8, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %tmp2
}

define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
; CHECK-LABEL: v_shuffledupQ32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 q8, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0
  %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %tmp2
}

define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
; CHECK-LABEL: v_shuffledupQfloat:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 q8, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = insertelement <4 x float> undef, float %A, i32 0
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %tmp2
}

; Splat of lane 1 of a 64-bit vector loaded from memory. The expected output
; loads the vector with vldr and uses the lane form of vdup (d16[1]) rather
; than a splat from a core register.

define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: vduplane8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vdup.8 d16, d16[1]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
  ret <8 x i8> %tmp2
}

define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: vduplane16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vdup.16 d16, d16[1]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
  ret <4 x i16> %tmp2
}

define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: vduplane32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vdup.32 d16, d16[1]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
  ret <2 x i32> %tmp2
}

define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind {
; CHECK-LABEL: vduplanefloat:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vdup.32 d16, d16[1]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
  ret <2 x float> %tmp2
}

; Lane-1 splats where the shuffle mask is twice as long as the 64-bit source,
; so the result is a 128-bit vector: the expected output is vdup.N q8, d16[1].

define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: vduplaneQ8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vdup.8 q8, d16[1]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
  ret <16 x i8> %tmp2
}

define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: vduplaneQ16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vdup.16 q8, d16[1]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
  ret <8 x i16> %tmp2
}

define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: vduplaneQ32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vdup.32 q8, d16[1]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
  ret <4 x i32> %tmp2
}

define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind {
; CHECK-LABEL: vduplaneQfloat:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vdup.32 q8, d16[1]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
  ret <4 x float> %tmp2
}

; Splats of a 64-bit element (<2 x i64> and <2 x double>). The expected output
; contains no NEON instruction at all, only two core-register moves:
; lane 1 splats copy r2/r3 into r0/r1, lane 0 splats copy r0/r1 into r2/r3.

define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: foo:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    mov r1, r3
; CHECK-NEXT:    mov pc, lr
entry:
  %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x i64> %0
}

define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: bar:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    mov r3, r1
; CHECK-NEXT:    mov pc, lr
entry:
  %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x i64> %0
}

define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: baz:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    mov r1, r3
; CHECK-NEXT:    mov pc, lr
entry:
  %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x double> %0
}

define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: qux:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    mov r3, r1
; CHECK-NEXT:    mov pc, lr
entry:
  %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %0
}

; Radar 7373643
; Splat of the constant -128: the expected output materializes it directly
; with vmov.i8 #0x80 and stores it, with no separate vdup instruction.
define void @redundantVdup(<8 x i8>* %ptr) nounwind {
; CHECK-LABEL: redundantVdup:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov.i8 d16, #0x80
; CHECK-NEXT:    vstr d16, [r0]
; CHECK-NEXT:    mov pc, lr
  %1 = insertelement <8 x i8> undef, i8 -128, i32 0
  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
  store <8 x i8> %2, <8 x i8>* %ptr, align 8
  ret void
}

; Lanes 0-2 hold %x and lane 3 holds %y: the expected output splats %x to the
; whole register and then overwrites only the last lane with %y.
define <4 x i32> @tdupi(i32 %x, i32 %y) {
; CHECK-LABEL: tdupi:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 q8, r0
; CHECK-NEXT:    vmov.32 d17[1], r1
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %1 = insertelement <4 x i32> undef, i32 %x, i32 0
  %2 = insertelement <4 x i32> %1, i32 %x, i32 1
  %3 = insertelement <4 x i32> %2, i32 %x, i32 2
  %4 = insertelement <4 x i32> %3, i32 %y, i32 3
  ret <4 x i32> %4
}

; Float variant of tdupi: the expected output uses q0 so that the last lane
; can be written through its S-register alias (vmov s3, r1).
define <4 x float> @tdupf(float %x, float %y) {
; CHECK-LABEL: tdupf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 q0, r0
; CHECK-NEXT:    vmov s3, r1
; CHECK-NEXT:    vmov r0, r1, d0
; CHECK-NEXT:    vmov r2, r3, d1
; CHECK-NEXT:    mov pc, lr
  %1 = insertelement <4 x float> undef, float %x, i32 0
  %2 = insertelement <4 x float> %1, float %x, i32 1
  %3 = insertelement <4 x float> %2, float %x, i32 2
  %4 = insertelement <4 x float> %3, float %y, i32 3
  ret <4 x float> %4
}

; This test checks that when splatting an element from a vector into another,
; the value isn't moved out to GPRs first.
define <4 x i32> @tduplane(<4 x i32> %invec) {
; CHECK-LABEL: tduplane:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    mov r0, #255
; CHECK-NEXT:    vdup.32 q8, d16[1]
; CHECK-NEXT:    vmov.32 d17[1], r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %in = extractelement <4 x i32> %invec, i32 1
  %1 = insertelement <4 x i32> undef, i32 %in, i32 0
  %2 = insertelement <4 x i32> %1, i32 %in, i32 1
  %3 = insertelement <4 x i32> %2, i32 %in, i32 2
  %4 = insertelement <4 x i32> %3, i32 255, i32 3
  ret <4 x i32> %4
}

; Splat of a constant-index extractelement from a 128-bit vector into a 64-bit
; result. The expected output moves only the 64-bit half that contains the
; requested lane into d16 and splats from the matching D-register lane
; (lane 3 of a 4 x 32-bit vector becomes d16[1] of the upper half).

define <2 x float> @check_f32(<4 x float> %v) nounwind {
; CHECK-LABEL: check_f32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d16, r2, r3
; CHECK-NEXT:    vdup.32 d16, d16[1]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %x = extractelement <4 x float> %v, i32 3
  %1 = insertelement <2 x float> undef, float %x, i32 0
  %2 = insertelement <2 x float> %1, float %x, i32 1
  ret <2 x float> %2
}

define <2 x i32> @check_i32(<4 x i32> %v) nounwind {
; CHECK-LABEL: check_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d16, r2, r3
; CHECK-NEXT:    vdup.32 d16, d16[1]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %x = extractelement <4 x i32> %v, i32 3
  %1 = insertelement <2 x i32> undef, i32 %x, i32 0
  %2 = insertelement <2 x i32> %1, i32 %x, i32 1
  ret <2 x i32> %2
}

; In the i16 and i8 variants only result lanes 0 and 1 are written; the
; remaining lanes stay undef, and the expected output is still a full lane
; splat from d16[3].

define <4 x i16> @check_i16(<8 x i16> %v) nounwind {
; CHECK-LABEL: check_i16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    vdup.16 d16, d16[3]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %x = extractelement <8 x i16> %v, i32 3
  %1 = insertelement <4 x i16> undef, i16 %x, i32 0
  %2 = insertelement <4 x i16> %1, i16 %x, i32 1
  ret <4 x i16> %2
}

define <8 x i8> @check_i8(<16 x i8> %v) nounwind {
; CHECK-LABEL: check_i8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    vdup.8 d16, d16[3]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %x = extractelement <16 x i8> %v, i32 3
  %1 = insertelement <8 x i8> undef, i8 %x, i32 0
  %2 = insertelement <8 x i8> %1, i8 %x, i32 1
  ret <8 x i8> %2
}

; Check that an SPR splat produces a vdup.
; The splatted value is the result of sitofp on the i16 argument %q. The
; expected output converts it in s0 (vcvt.f32.s32) and splats it with the lane
; form vdup.32 from d0[0], without moving it back to a core register.

define <2 x float> @check_spr_splat2(<2 x float> %p, i16 %q) {
; CHECK-LABEL: check_spr_splat2:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    lsl r2, r2, #16
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    asr r2, r2, #16
; CHECK-NEXT:    vmov s0, r2
; CHECK-NEXT:    vcvt.f32.s32 s0, s0
; CHECK-NEXT:    vdup.32 d17, d0[0]
; CHECK-NEXT:    vsub.f32 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %conv = sitofp i16 %q to float
  %splat.splatinsert = insertelement <2 x float> undef, float %conv, i32 0
  %splat.splat = shufflevector <2 x float> %splat.splatinsert, <2 x float> undef, <2 x i32> zeroinitializer
  %sub = fsub <2 x float> %splat.splat, %p
  ret <2 x float> %sub
}

; 128-bit variant: in the expected output %q is read from the stack with
; ldrsh, converted in s0, and splatted to q9 from d0[0].
define <4 x float> @check_spr_splat4(<4 x float> %p, i16 %q) {
; CHECK-LABEL: check_spr_splat4:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    ldrsh r12, [sp]
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    vmov s0, r12
; CHECK-NEXT:    vcvt.f32.s32 s0, s0
; CHECK-NEXT:    vdup.32 q9, d0[0]
; CHECK-NEXT:    vsub.f32 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %conv = sitofp i16 %q to float
  %splat.splatinsert = insertelement <4 x float> undef, float %conv, i32 0
  %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
  %sub = fsub <4 x float> %splat.splat, %p
  ret <4 x float> %sub
}
; Same codegen as above test; the converted scalar is splatted from d0[0] with
; vdup.32, so the shuffle index is irrelevant.
define <4 x float> @check_spr_splat4_lane1(<4 x float> %p, i16 %q) {
; CHECK-LABEL: check_spr_splat4_lane1:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    ldrsh r12, [sp]
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    vmov s0, r12
; CHECK-NEXT:    vcvt.f32.s32 s0, s0
; CHECK-NEXT:    vdup.32 q9, d0[0]
; CHECK-NEXT:    vsub.f32 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %conv = sitofp i16 %q to float
  ; The scalar is inserted into lane 1 and the shuffle splats lane 1; the
  ; expected output is identical to the lane-0 variant, check_spr_splat4.
  %splat.splatinsert = insertelement <4 x float> undef, float %conv, i32 1
  %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %sub = fsub <4 x float> %splat.splat, %p
  ret <4 x float> %sub
}

; Also make sure we don't barf on variable-index extractelts, where we almost
; could have generated a vdup.
; With a run-time lane index there is no lane-form vdup in the expected
; output: the vector is stored to a 16-byte-aligned stack slot, the address is
; advanced by the index masked to 0-15, and the selected byte is loaded into
; every lane with vld1.8 {d16[]}.

define <8 x i8> @check_i8_varidx(<16 x i8> %v, i32 %idx) {
; CHECK-LABEL: check_i8_varidx:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r11}
; CHECK-NEXT:    push {r11}
; CHECK-NEXT:    .setfp r11, sp
; CHECK-NEXT:    mov r11, sp
; CHECK-NEXT:    .pad #28
; CHECK-NEXT:    sub sp, sp, #28
; CHECK-NEXT:    bic sp, sp, #15
; CHECK-NEXT:    ldr r12, [r11, #4]
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    mov r1, sp
; CHECK-NEXT:    and r0, r12, #15
; CHECK-NEXT:    vst1.64 {d16, d17}, [r1:128], r0
; CHECK-NEXT:    vld1.8 {d16[]}, [r1]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov sp, r11
; CHECK-NEXT:    pop {r11}
; CHECK-NEXT:    mov pc, lr
  %x = extractelement <16 x i8> %v, i32 %idx
  %1 = insertelement <8 x i8> undef, i8 %x, i32 0
  %2 = insertelement <8 x i8> %1, i8 %x, i32 1
  ret <8 x i8> %2
}