; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon -verify-machineinstrs %s -o - \
; RUN:     | FileCheck %s

; Scalar-to-vector splats written as insertelement chains that place the same
; scalar argument in every lane. Each function expects a single NEON vdup of
; the matching element width.

; 64-bit result, eight i8 lanes all set to %A.
define <8 x i8> @v_dup8(i8 %A) nounwind {
;CHECK-LABEL: v_dup8:
;CHECK: vdup.8
  %tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
  %tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
  %tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
  %tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3
  %tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4
  %tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5
  %tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6
  %tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7
  ret <8 x i8> %tmp8
}

; 64-bit result, four i16 lanes all set to %A.
define <4 x i16> @v_dup16(i16 %A) nounwind {
;CHECK-LABEL: v_dup16:
;CHECK: vdup.16
  %tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
  %tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
  %tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
  %tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3
  ret <4 x i16> %tmp4
}

; 64-bit result, two i32 lanes both set to %A.
define <2 x i32> @v_dup32(i32 %A) nounwind {
;CHECK-LABEL: v_dup32:
;CHECK: vdup.32
  %tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
  %tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
  ret <2 x i32> %tmp2
}

; 64-bit result, two float lanes both set to %A.
define <2 x float> @v_dupfloat(float %A) nounwind {
;CHECK-LABEL: v_dupfloat:
;CHECK: vdup.32
  %tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
  %tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
  ret <2 x float> %tmp2
}

; 128-bit result, sixteen i8 lanes all set to %A.
define <16 x i8> @v_dupQ8(i8 %A) nounwind {
;CHECK-LABEL: v_dupQ8:
;CHECK: vdup.8
  %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
  %tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
  %tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
  %tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3
  %tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4
  %tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5
  %tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6
  %tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7
  %tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8
  %tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9
  %tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10
  %tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11
  %tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12
  %tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13
  %tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14
  %tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15
  ret <16 x i8> %tmp16
}

; 128-bit result, eight i16 lanes all set to %A.
define <8 x i16> @v_dupQ16(i16 %A) nounwind {
;CHECK-LABEL: v_dupQ16:
;CHECK: vdup.16
  %tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
  %tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
  %tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
  %tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3
  %tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4
  %tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5
  %tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6
  %tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7
  ret <8 x i16> %tmp8
}

; 128-bit result, four i32 lanes all set to %A.
define <4 x i32> @v_dupQ32(i32 %A) nounwind {
;CHECK-LABEL: v_dupQ32:
;CHECK: vdup.32
  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
  %tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
  %tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
  %tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3
  ret <4 x i32> %tmp4
}

; 128-bit result, four float lanes all set to %A.
define <4 x float> @v_dupQfloat(float %A) nounwind {
;CHECK-LABEL: v_dupQfloat:
;CHECK: vdup.32
  %tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
  %tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
  %tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
  %tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3
  ret <4 x float> %tmp4
}

; Check to make sure it works with shuffles, too.
; Splats written as "insert the scalar into lane 0, then shufflevector with an
; all-zero mask". Each function expects a vdup of the matching element width.

; 64-bit result, i8 lanes.
define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
;CHECK-LABEL: v_shuffledup8:
;CHECK: vdup.8
  %tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
  ret <8 x i8> %tmp2
}

; 64-bit result, i16 lanes.
define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
;CHECK-LABEL: v_shuffledup16:
;CHECK: vdup.16
  %tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %tmp2
}

; 64-bit result, i32 lanes.
define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
;CHECK-LABEL: v_shuffledup32:
;CHECK: vdup.32
  %tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0
  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
  ret <2 x i32> %tmp2
}

; 64-bit result, float lanes.
define <2 x float> @v_shuffledupfloat(float %A) nounwind {
;CHECK-LABEL: v_shuffledupfloat:
;CHECK: vdup.32
  %tmp1 = insertelement <2 x float> undef, float %A, i32 0
  %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
  ret <2 x float> %tmp2
}

; 128-bit result, i8 lanes.
define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
;CHECK-LABEL: v_shuffledupQ8:
;CHECK: vdup.8
  %tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %tmp2
}

; 128-bit result, i16 lanes.
define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
;CHECK-LABEL: v_shuffledupQ16:
;CHECK: vdup.16
  %tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %tmp2
}

; 128-bit result, i32 lanes.
define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
;CHECK-LABEL: v_shuffledupQ32:
;CHECK: vdup.32
  %tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0
  %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %tmp2
}

; Insert-into-lane-0 plus all-zero shuffle mask, 128-bit float result.
define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
;CHECK-LABEL: v_shuffledupQfloat:
;CHECK: vdup.32
  %tmp1 = insertelement <4 x float> undef, float %A, i32 0
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %tmp2
}

; Lane splats: load a 64-bit vector and broadcast its lane 1 with a
; shufflevector whose mask is all ones. Results are 64 bits wide.

define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vduplane8:
;CHECK: vdup.8
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
  ret <8 x i8> %tmp2
}

define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: vduplane16:
;CHECK: vdup.16
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
  ret <4 x i16> %tmp2
}

define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind {
;CHECK-LABEL: vduplane32:
;CHECK: vdup.32
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
  ret <2 x i32> %tmp2
}

define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind {
;CHECK-LABEL: vduplanefloat:
;CHECK: vdup.32
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
  ret <2 x float> %tmp2
}

; Same lane-1 splats, but the shuffle widens the 64-bit source into a 128-bit
; result (the mask has twice as many elements as the loaded vector).

define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vduplaneQ8:
;CHECK: vdup.8
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
  ret <16 x i8> %tmp2
}

define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: vduplaneQ16:
;CHECK: vdup.16
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
  ret <8 x i16> %tmp2
}

define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind {
;CHECK-LABEL: vduplaneQ32:
;CHECK: vdup.32
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
  ret <4 x i32> %tmp2
}

define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind {
;CHECK-LABEL: vduplaneQfloat:
;CHECK: vdup.32
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
  ret <4 x float> %tmp2
}

; 64-bit-element splats of lane 1 / lane 0 for <2 x i64> and <2 x double>.
; No FileCheck directives are attached to foo, bar, baz or qux.
; NOTE(review): presumably these only need to compile cleanly under
; -verify-machineinstrs - confirm the intent before adding checks.

define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
entry:
  %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x i64> %0
}

define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
entry:
  %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x i64> %0
}

define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
entry:
  %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x double> %0
}

define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
entry:
  %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %0
}

; Radar 7373643
; Splat of the constant -128: the directives below expect a vmov.i8 followed
; by a vstr, with no vdup.8 in between.
;CHECK-LABEL: redundantVdup:
;CHECK: vmov.i8
;CHECK-NOT: vdup.8
;CHECK: vstr
define void @redundantVdup(<8 x i8>* %ptr) nounwind {
  %1 = insertelement <8 x i8> undef, i8 -128, i32 0
  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
  store <8 x i8> %2, <8 x i8>* %ptr, align 8
  ret void
}

; Lanes 0-2 hold %x and lane 3 holds %y; a vdup.32 is still expected.
define <4 x i32> @tdupi(i32 %x, i32 %y) {
;CHECK-LABEL: tdupi:
;CHECK: vdup.32
  %1 = insertelement <4 x i32> undef, i32 %x, i32 0
  %2 = insertelement <4 x i32> %1, i32 %x, i32 1
  %3 = insertelement <4 x i32> %2, i32 %x, i32 2
  %4 = insertelement <4 x i32> %3, i32 %y, i32 3
  ret <4 x i32> %4
}

; Float variant of tdupi: lanes 0-2 hold %x and lane 3 holds %y.
define <4 x float> @tdupf(float %x, float %y) {
;CHECK-LABEL: tdupf:
;CHECK: vdup.32
  %1 = insertelement <4 x float> undef, float %x, i32 0
  %2 = insertelement <4 x float> %1, float %x, i32 1
  %3 = insertelement <4 x float> %2, float %x, i32 2
  %4 = insertelement <4 x float> %3, float %y, i32 3
  ret <4 x float> %4
}

; This test checks that when splatting an element from a vector into another,
; the value isn't moved out to GPRs first.
define <4 x i32> @tduplane(<4 x i32> %invec) {
;CHECK-LABEL: tduplane:
;CHECK-NOT: vmov {{.*}}, d16[1]
;CHECK: vdup.32 {{.*}}, d16[1]
  %in = extractelement <4 x i32> %invec, i32 1
  %1 = insertelement <4 x i32> undef, i32 %in, i32 0
  %2 = insertelement <4 x i32> %1, i32 %in, i32 1
  %3 = insertelement <4 x i32> %2, i32 %in, i32 2
  %4 = insertelement <4 x i32> %3, i32 255, i32 3
  ret <4 x i32> %4
}

; The check_* functions extract one lane of a 128-bit vector and insert it
; into lanes of a 64-bit result; a lane-indexed vdup from a D register is
; expected.

; Extracts lane 3 of <4 x float>; the expected D-register lane index is 1.
define <2 x float> @check_f32(<4 x float> %v) nounwind {
;CHECK-LABEL: check_f32:
;CHECK: vdup.32 {{.*}}, d{{..}}[1]
  %x = extractelement <4 x float> %v, i32 3
  %1 = insertelement <2 x float> undef, float %x, i32 0
  %2 = insertelement <2 x float> %1, float %x, i32 1
  ret <2 x float> %2
}

; Extracts lane 3 of <4 x i32>; the expected D-register lane index is 1.
define <2 x i32> @check_i32(<4 x i32> %v) nounwind {
;CHECK-LABEL: check_i32:
;CHECK: vdup.32 {{.*}}, d{{..}}[1]
  %x = extractelement <4 x i32> %v, i32 3
  %1 = insertelement <2 x i32> undef, i32 %x, i32 0
  %2 = insertelement <2 x i32> %1, i32 %x, i32 1
  ret <2 x i32> %2
}

; Extracts lane 3 of <8 x i16>. Only result lanes 0 and 1 are written; lanes
; 2 and 3 stay undef.
define <4 x i16> @check_i16(<8 x i16> %v) nounwind {
;CHECK-LABEL: check_i16:
;CHECK: vdup.16 {{.*}}, d{{..}}[3]
  %x = extractelement <8 x i16> %v, i32 3
  %1 = insertelement <4 x i16> undef, i16 %x, i32 0
  %2 = insertelement <4 x i16> %1, i16 %x, i32 1
  ret <4 x i16> %2
}

; Extracts lane 3 of <16 x i8>. Only result lanes 0 and 1 are written; lanes
; 2 through 7 stay undef.
define <8 x i8> @check_i8(<16 x i8> %v) nounwind {
;CHECK-LABEL: check_i8:
;CHECK: vdup.8 {{.*}}, d{{..}}[3]
  %x = extractelement <16 x i8> %v, i32 3
  %1 = insertelement <8 x i8> undef, i8 %x, i32 0
  %2 = insertelement <8 x i8> %1, i8 %x, i32 1
  ret <8 x i8> %2
}

; Check that an SPR splat produces a vdup.

define <2 x float> @check_spr_splat2(<2 x float> %p, i16 %q) {
;CHECK-LABEL: check_spr_splat2:
;CHECK: vdup.32 d
  %conv = sitofp i16 %q to float
  %splat.splatinsert = insertelement <2 x float> undef, float %conv, i32 0
  %splat.splat = shufflevector <2 x float> %splat.splatinsert, <2 x float> undef, <2 x i32> zeroinitializer
  %sub = fsub <2 x float> %splat.splat, %p
  ret <2 x float> %sub
}

; 128-bit variant of check_spr_splat2; here a vld1.16 is expected instead.
define <4 x float> @check_spr_splat4(<4 x float> %p, i16 %q) {
;CHECK-LABEL: check_spr_splat4:
;CHECK: vld1.16
  %conv = sitofp i16 %q to float
  %splat.splatinsert = insertelement <4 x float> undef, float %conv, i32 0
  %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
  %sub = fsub <4 x float> %splat.splat, %p
  ret <4 x float> %sub
}
; Same codegen as above test; scalar is splatted using vld1, so shuffle index is irrelevant.
define <4 x float> @check_spr_splat4_lane1(<4 x float> %p, i16 %q) {
;CHECK-LABEL: check_spr_splat4_lane1:
;CHECK: vld1.16
  %conv = sitofp i16 %q to float
  %splat.splatinsert = insertelement <4 x float> undef, float %conv, i32 1
  %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %sub = fsub <4 x float> %splat.splat, %p
  ret <4 x float> %sub
}

; Also make sure we don't barf on variable-index extractelts, where we almost
; could have generated a vdup.
; The extract index %idx is a runtime value. The directives expect the index
; to be loaded from the stack, masked with #15, the vector stored with
; vst1.64, and the selected byte reloaded into all lanes with vld1.8.
define <8 x i8> @check_i8_varidx(<16 x i8> %v, i32 %idx) {
; CHECK-LABEL: check_i8_varidx:
; CHECK: mov r[[FP:[0-9]+]], sp
; CHECK: ldr r[[IDX:[0-9]+]], [r[[FP]], #4]
; CHECK: mov r[[SPCOPY:[0-9]+]], sp
; CHECK: and r[[MASKED_IDX:[0-9]+]], r[[IDX]], #15
; CHECK: vst1.64 {d{{.*}}, d{{.*}}}, [r[[SPCOPY]]:128], r[[MASKED_IDX]]
; CHECK: vld1.8 {d{{.*}}[]}, [r[[SPCOPY]]]
  %x = extractelement <16 x i8> %v, i32 %idx
  %1 = insertelement <8 x i8> undef, i8 %x, i32 0
  %2 = insertelement <8 x i8> %1, i8 %x, i32 1
  ret <8 x i8> %2
}