; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X64
; NOTE(review): Instruction-selection test for SSE4.1 lowering. The X32
; prefix covers the i686-darwin run line, the X64 prefix the x86_64-darwin
; run line. The CHECK lines pin exact llc output; if lowering changes,
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.

; External i16 global, referenced by pmovzxbq_1 below.
@g16 = external global i16

; Inserting a GPR argument into lane 1 should select pinsrd $1 (from the
; stack slot on 32-bit, from %edi on 64-bit).
define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind {
; X32-LABEL: pinsrd_1:
; X32:       ## BB#0:
; X32-NEXT:    pinsrd $1, {{[0-9]+}}(%esp), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: pinsrd_1:
; X64:       ## BB#0:
; X64-NEXT:    pinsrd $1, %edi, %xmm0
; X64-NEXT:    retq
  %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1
  ret <4 x i32> %tmp1
}

; Same as above for a byte element: expect pinsrb $1.
define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind {
; X32-LABEL: pinsrb_1:
; X32:       ## BB#0:
; X32-NEXT:    pinsrb $1, {{[0-9]+}}(%esp), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: pinsrb_1:
; X64:       ## BB#0:
; X64-NEXT:    pinsrb $1, %edi, %xmm0
; X64-NEXT:    retq
  %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1
  ret <16 x i8> %tmp1
}

; A scalar i32 load built into a zero-padded vector and fed to the pmovsxbd
; intrinsic should fold the load into pmovsxbd (mem).
define <2 x i64> @pmovsxbd_1(i32* %p) nounwind {
; X32-LABEL: pmovsxbd_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    pmovsxbd (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: pmovsxbd_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    pmovsxbd (%rdi), %xmm0
; X64-NEXT:    retq
entry:
  %0 = load i32, i32* %p, align 4
  %1 = insertelement <4 x i32> undef, i32 %0, i32 0
  %2 = insertelement <4 x i32> %1, i32 0, i32 1
  %3 = insertelement <4 x i32> %2, i32 0, i32 2
  %4 = insertelement <4 x i32> %3, i32 0, i32 3
  %5 = bitcast <4 x i32> %4 to <16 x i8>
  %6 = tail call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %5) nounwind readnone
  %7 = bitcast <4 x i32> %6 to <2 x i64>
  ret <2 x i64> %7
}

; Likewise an i64 load inserted into lane 0 and sign-extended word-to-dword
; should become a folded pmovsxwd (mem).
define <2 x i64> @pmovsxwd_1(i64* %p) nounwind readonly {
; X32-LABEL: pmovsxwd_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    pmovsxwd (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: pmovsxwd_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    pmovsxwd (%rdi), %xmm0
; X64-NEXT:    retq
entry:
  %0 = load i64, i64* %p ; <i64> [#uses=1]
  %tmp2 = insertelement <2 x i64> zeroinitializer, i64 %0, i32 0 ; <<2 x i64>> [#uses=1]
  %1 = bitcast <2 x i64> %tmp2 to <8 x i16> ; <<8 x i16>> [#uses=1]
  %2 = tail call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) nounwind readnone ; <<4 x i32>> [#uses=1]
  %3 = bitcast <4 x i32> %2 to <2 x i64> ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %3
}

; Load of @g16 (via the darwin non-lazy pointer on 32-bit) feeding pmovzxbq;
; the load should fold into pmovzxbq with the zero-extension shuffle comment.
define <2 x i64> @pmovzxbq_1() nounwind {
; X32-LABEL: pmovzxbq_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movl L_g16$non_lazy_ptr, %eax
; X32-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: pmovzxbq_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    movq _g16@{{.*}}(%rip), %rax
; X64-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    retq
entry:
  %0 = load i16, i16* @g16, align 2 ; <i16> [#uses=1]
  %1 = insertelement <8 x i16> undef, i16 %0, i32 0 ; <<8 x i16>> [#uses=1]
  %2 = bitcast <8 x i16> %1 to <16 x i8> ; <<16 x i8>> [#uses=1]
  %3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %3
}

declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone

; Extract of float lane 3 bitcast to i32 should use extractps $3 into a GPR.
define i32 @extractps_1(<4 x float> %v) nounwind {
; X32-LABEL: extractps_1:
; X32:       ## BB#0:
; X32-NEXT:    extractps $3, %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: extractps_1:
; X64:       ## BB#0:
; X64-NEXT:    extractps $3, %xmm0, %eax
; X64-NEXT:    retq
  %s = extractelement <4 x float> %v, i32 3
  %i = bitcast float %s to i32
  ret i32 %i
}
; Same result when the bitcast happens on the vector before the extract.
define i32 @extractps_2(<4 x float> %v) nounwind {
; X32-LABEL: extractps_2:
; X32:       ## BB#0:
; X32-NEXT:    extractps $3, %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: extractps_2:
; X64:       ## BB#0:
; X64-NEXT:    extractps $3, %xmm0, %eax
; X64-NEXT:    retq
  %t = bitcast <4 x float> %v to <4 x i32>
  %s = extractelement <4 x i32> %t, i32 3
  ret i32 %s
}


; The non-store form of extractps puts its result into a GPR.
; This makes it suitable for an extract from a <4 x float> that
; is bitcasted to i32, but unsuitable for much of anything else.

; Extract + fadd of lane 3 kept as a float: expect shufps, not extractps.
define float @ext_1(<4 x float> %v) nounwind {
; X32-LABEL: ext_1:
; X32:       ## BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X32-NEXT:    addss LCPI7_0, %xmm0
; X32-NEXT:    movss %xmm0, (%esp)
; X32-NEXT:    flds (%esp)
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: ext_1:
; X64:       ## BB#0:
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X64-NEXT:    addss {{.*}}(%rip), %xmm0
; X64-NEXT:    retq
  %s = extractelement <4 x float> %v, i32 3
  %t = fadd float %s, 1.0
  ret float %t
}
; Plain float extract of lane 3: again shufps (returned via x87 on 32-bit).
define float @ext_2(<4 x float> %v) nounwind {
; X32-LABEL: ext_2:
; X32:       ## BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X32-NEXT:    movss %xmm0, (%esp)
; X32-NEXT:    flds (%esp)
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: ext_2:
; X64:       ## BB#0:
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X64-NEXT:    retq
  %s = extractelement <4 x float> %v, i32 3
  ret float %s
}
; Integer extract of lane 3 goes straight to a GPR via pextrd $3.
define i32 @ext_3(<4 x i32> %v) nounwind {
; X32-LABEL: ext_3:
; X32:       ## BB#0:
; X32-NEXT:    pextrd $3, %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: ext_3:
; X64:       ## BB#0:
; X64-NEXT:    pextrd $3, %xmm0, %eax
; X64-NEXT:    retq
  %i = extractelement <4 x i32> %v, i32 3
  ret i32 %i
}

; Direct insertps intrinsic with imm 1; per the checks this zeroes lane 0.
define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
; X32-LABEL: insertps_1:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_1:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3]
; X64-NEXT:    retq
  %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 1) nounwind readnone
  ret <4 x float> %tmp1
}

declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone

; When optimizing for speed, prefer blendps over insertps even if it means we have to
; generate a separate movss to load the scalar operand.
define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind {
; X32-LABEL: blendps_not_insertps_1:
; X32:       ## BB#0:
; X32-NEXT:    movss {{.*#+}} xmm1
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: blendps_not_insertps_1:
; X64:       ## BB#0:
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
  ret <4 x float> %tmp1
}

; When optimizing for size, generate an insertps if there's a load fold opportunity.
; The difference between i386 and x86-64 ABIs for the float operand means we should
; generate an insertps for X32 but not for X64!
define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize nounwind {
; X32-LABEL: insertps_or_blendps:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_or_blendps:
; X64:       ## BB#0:
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
  ret <4 x float> %tmp1
}

; An insert into the low 32-bits of a vector from the low 32-bits of another vector
; is always just a blendps because blendps is never more expensive than insertps.
; Low-lane-to-low-lane insert between two vectors: expect blendps on both
; targets (see the comment just above in the file).
define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
; X32-LABEL: blendps_not_insertps_2:
; X32:       ## BB#0:
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: blendps_not_insertps_2:
; X64:       ## BB#0:
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %tmp2 = extractelement <4 x float> %t2, i32 0
  %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
  ret <4 x float> %tmp1
}

; ptestz intrinsic: ptest + sete/movzbl materializes the ZF result.
define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_1:
; X32:       ## BB#0:
; X32-NEXT:    ptest %xmm1, %xmm0
; X32-NEXT:    sete %al
; X32-NEXT:    movzbl %al, %eax
; X32-NEXT:    retl
;
; X64-LABEL: ptestz_1:
; X64:       ## BB#0:
; X64-NEXT:    ptest %xmm1, %xmm0
; X64-NEXT:    sete %al
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
  %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}

; ptestc intrinsic: the CF result is materialized via sbbl + andl $1.
define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_2:
; X32:       ## BB#0:
; X32-NEXT:    ptest %xmm1, %xmm0
; X32-NEXT:    sbbl %eax, %eax
; X32-NEXT:    andl $1, %eax
; X32-NEXT:    retl
;
; X64-LABEL: ptestz_2:
; X64:       ## BB#0:
; X64-NEXT:    ptest %xmm1, %xmm0
; X64-NEXT:    sbbl %eax, %eax
; X64-NEXT:    andl $1, %eax
; X64-NEXT:    retq
  %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}

; ptestnzc intrinsic: "not ZF and not CF" is materialized via seta.
define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_3:
; X32:       ## BB#0:
; X32-NEXT:    ptest %xmm1, %xmm0
; X32-NEXT:    seta %al
; X32-NEXT:    movzbl %al, %eax
; X32-NEXT:    retl
;
; X64-LABEL: ptestz_3:
; X64:       ## BB#0:
; X64-NEXT:    ptest %xmm1, %xmm0
; X64-NEXT:    seta %al
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
  %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}


declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone

; This used to compile to insertps $0 + insertps $16. insertps $0 is always
; pointless.
define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind {
; X32-LABEL: buildvector:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X32-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; X32-NEXT:    addss %xmm1, %xmm0
; X32-NEXT:    addss %xmm2, %xmm3
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: buildvector:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X64-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; X64-NEXT:    addss %xmm1, %xmm0
; X64-NEXT:    addss %xmm2, %xmm3
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
; X64-NEXT:    retq
entry:
  %tmp7 = extractelement <2 x float> %A, i32 0
  %tmp5 = extractelement <2 x float> %A, i32 1
  %tmp3 = extractelement <2 x float> %B, i32 0
  %tmp1 = extractelement <2 x float> %B, i32 1
  %add.r = fadd float %tmp7, %tmp3
  %add.i = fadd float %tmp5, %tmp1
  %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0
  %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1
  ret <2 x float> %tmp9
}

; Shuffle taking lane 0 of a loaded vector into lane 3: the load folds into
; insertps mem[0].
define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X32-LABEL: insertps_from_shufflevector_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_shufflevector_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT:    retq
entry:
  %0 = load <4 x float>, <4 x float>* %pb, align 16
  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit6
}

; Register-to-register variant: lane 1 of %b into lane 2 of %a via insertps.
define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
; X32-LABEL: insertps_from_shufflevector_2:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_shufflevector_2:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X64-NEXT:    retq
entry:
  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
  ret <4 x float> %vecinit6
}

; For loading an i32 from memory into an xmm register we use pinsrd
; instead of insertps
define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
; X32-LABEL: pinsrd_from_shufflevector_i32:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    pshufd {{.*#+}} xmm1 = mem[0,1,2,0]
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: pinsrd_from_shufflevector_i32:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    pshufd {{.*#+}} xmm1 = mem[0,1,2,0]
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X64-NEXT:    retq
entry:
  %0 = load <4 x i32>, <4 x i32>* %pb, align 16
  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit6
}

; Integer lane insert from a register operand: lowered as pshufd + pblendw.
define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
; X32-LABEL: insertps_from_shufflevector_i32_2:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_shufflevector_i32_2:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X64-NEXT:    retq
entry:
  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
  ret <4 x i32> %vecinit6
}

; Scalar float load inserted through an undef vector then shuffled in:
; folds to a single insertps from memory.
define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
; X32-LABEL: insertps_from_load_ins_elt_undef:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_load_ins_elt_undef:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X64-NEXT:    retq
  %1 = load float, float* %b, align 4
  %2 = insertelement <4 x float> undef, float %1, i32 0
  %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
  ret <4 x float> %result
}

; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
; X32-LABEL: insertps_from_load_ins_elt_undef_i32:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_load_ins_elt_undef_i32:
; X64:       ## BB#0:
; X64-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; X64-NEXT:    retq
  %1 = load i32, i32* %b, align 4
  %2 = insertelement <4 x i32> undef, i32 %1, i32 0
  %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
  ret <4 x i32> %result
}

;;;;;; Shuffles optimizable with a single insertps or blend instruction
; Naming convention for the shuf_* tests: each letter names the source lane
; of the result element (X/Y/Z/W = lanes 0-3 of %x, A/B/C = lanes of %a),
; and 0 means the element must be zero.
define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XYZ0:
; X32:       ## BB#0:
; X32-NEXT:    xorps %xmm1, %xmm1
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; X32-NEXT:    retl
;
; X64-LABEL: shuf_XYZ0:
; X64:       ## BB#0:
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecext3 = extractelement <4 x float> %x, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
  ret <4 x float> %vecinit5
}

; Low two lanes kept, upper half zeroed: a single movq suffices.
define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XY00:
; X32:       ## BB#0:
; X32-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; X32-NEXT:    retl
;
; X64-LABEL: shuf_XY00:
; X64:       ## BB#0:
; X64-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
  ret <4 x float> %vecinit4
}

define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XYY0:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
; X32-NEXT:    retl
;
; X64-LABEL: shuf_XYY0:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext1, i32 2
  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
  ret <4 x float> %vecinit5
}

define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XYW0:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
; X32-NEXT:    retl
;
; X64-LABEL: shuf_XYW0:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecext2 = extractelement <4 x float> %x, i32 3
  %vecinit3 = insertelement <4 x float> %vecinit2, float %vecext2, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
  ret <4 x float> %vecinit4
}

define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_W00W:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
; X32-NEXT:    retl
;
; X64-LABEL: shuf_W00W:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 3
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float %vecext, i32 3
  ret <4 x float> %vecinit4
}

define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_X00A:
; X32:       ## BB#0:
; X32-NEXT:    xorps %xmm2, %xmm2
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X32-NEXT:    retl
;
; X64-LABEL: shuf_X00A:
; X64:       ## BB#0:
; X64-NEXT:    xorps %xmm2, %xmm2
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
  %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit4
}

define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_X00X:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; X32-NEXT:    retl
;
; X64-LABEL: shuf_X00X:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
  %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit4
}

define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_X0YC:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
; X32-NEXT:    retl
;
; X64-LABEL: shuf_X0YC:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
  ret <4 x float> %vecinit5
}

; Integer versions of the same shuffle patterns; these lower to the integer
; domain (pxor/pshufd/pblendw) instead of the float instructions above.
define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XYZ0:
; X32:       ## BB#0:
; X32-NEXT:    pxor %xmm1, %xmm1
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_XYZ0:
; X64:       ## BB#0:
; X64-NEXT:    pxor %xmm1, %xmm1
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecext3 = extractelement <4 x i32> %x, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2
  %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
  ret <4 x i32> %vecinit5
}

define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XY00:
; X32:       ## BB#0:
; X32-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_XY00:
; X64:       ## BB#0:
; X64-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XYY0:
; X32:       ## BB#0:
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
; X32-NEXT:    pxor %xmm0, %xmm0
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_XYY0:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
; X64-NEXT:    pxor %xmm0, %xmm0
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext1, i32 2
  %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
  ret <4 x i32> %vecinit5
}

define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XYW0:
; X32:       ## BB#0:
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,3,3]
; X32-NEXT:    pxor %xmm0, %xmm0
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_XYW0:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,3,3]
; X64-NEXT:    pxor %xmm0, %xmm0
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecext2 = extractelement <4 x i32> %x, i32 3
  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %vecext2, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_W00W:
; X32:       ## BB#0:
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; X32-NEXT:    pxor %xmm0, %xmm0
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_W00W:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; X64-NEXT:    pxor %xmm0, %xmm0
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 3
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 %vecext, i32 3
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_X00A:
; X32:       ## BB#0:
; X32-NEXT:    pxor %xmm2, %xmm2
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_X00A:
; X64:       ## BB#0:
; X64-NEXT:    pxor %xmm2, %xmm2
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
  %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_X00X:
; X32:       ## BB#0:
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
; X32-NEXT:    pxor %xmm0, %xmm0
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_X00X:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
; X64-NEXT:    pxor %xmm0, %xmm0
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
  %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_X0YC:
; X32:       ## BB#0:
; X32-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_X0YC:
; X64:       ## BB#0:
; X64-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
  ret <4 x i32> %vecinit5
}

;; Test for a bug in the first implementation of LowerBuildVectorv4x32
define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
; X32-LABEL: test_insertps_no_undef:
; X32:       ## BB#0:
; X32-NEXT:    xorps %xmm1, %xmm1
; X32-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
; X32-NEXT:    maxps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_insertps_no_undef:
; X64:       ## BB#0:
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
; X64-NEXT:    maxps %xmm1, %xmm0
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecext3 = extractelement <4 x float> %x, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
  %mask = fcmp olt <4 x float> %vecinit5, %x
  %res = select <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5
  ret <4 x float> %res
}

; Vector select on an i1 mask: the mask is sign-extended (psllw/psraw) and
; then pblendvb is used, since there is no native 16-bit blendv.
define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
; X32-LABEL: blendvb_fallback:
; X32:       ## BB#0:
; X32-NEXT:    psllw $15, %xmm0
; X32-NEXT:    psraw $15, %xmm0
; X32-NEXT:    pblendvb %xmm1, %xmm2
; X32-NEXT:    movdqa %xmm2, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: blendvb_fallback:
; X64:       ## BB#0:
; X64-NEXT:    psllw $15, %xmm0
; X64-NEXT:    psraw $15, %xmm0
; X64-NEXT:    pblendvb %xmm1, %xmm2
; X64-NEXT:    movdqa %xmm2, %xmm0
; X64-NEXT:    retq
  %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
  ret <8 x i16> %ret
}

; On X32, account for the argument's move to registers
define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X32-LABEL: insertps_from_vector_load:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_vector_load:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT:    retq
  %1 = load <4 x float>, <4 x float>* %pb, align 16
  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
  ret <4 x float> %2
}

;; Use a non-zero CountS for insertps
;; Try to match a bit more of the instr, since we need the load's offset.
define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X32-LABEL: insertps_from_vector_load_offset:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[1],xmm0[3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_vector_load_offset:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[1],xmm0[3]
; X64-NEXT:    retq
  %1 = load <4 x float>, <4 x float>* %pb, align 16
  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
  ret <4 x float> %2
}

;; Try to match a bit more of the instr, since we need the load's offset.
; The GEP's %index is scaled by the vector size (shll/shlq $4) and the
; indexed load folds into insertps; imm 192 reads source lane 3 into
; destination lane 0 (mem[3] in the checks).
define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
; X32-LABEL: insertps_from_vector_load_offset_2:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    shll $4, %ecx
; X32-NEXT:    insertps {{.*#+}} xmm0 = mem[3],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_vector_load_offset_2:
; X64:       ## BB#0:
; X64-NEXT:    shlq $4, %rsi
; X64-NEXT:    insertps {{.*#+}} xmm0 = mem[3],xmm0[1,2,3]
; X64-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
  %2 = load <4 x float>, <4 x float>* %1, align 16
  %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
  ret <4 x float> %3
}

; A scalar float load splatted to all four lanes and fed to insertps; the
; splat is built with movss + shufps and only its lane 0 is consumed.
define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
; X32-LABEL: insertps_from_broadcast_loadf32:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_broadcast_loadf32:
; X64:       ## BB#0:
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X64-NEXT:    retq
  %1 = getelementptr inbounds float, float* %fb, i64 %index
  %2 = load float, float* %1, align 4
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
  ret <4 x float> %7
}

; Same splat-into-insertps pattern, but starting from an under-aligned
; (align 4) vector load, so an unaligned movups is emitted for the load.
define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
; X32-LABEL: insertps_from_broadcast_loadv4f32:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movups (%eax), %xmm1
; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_broadcast_loadv4f32:
; X64:       ## BB#0:
; X64-NEXT:    movups (%rdi), %xmm1
; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X64-NEXT:    retq
  %1 = load <4 x float>, <4 x float>* %b, align 4
  %2 = extractelement <4 x float> %1, i32 0
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
  ret <4 x float> %7
}

;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
; The broadcast value is shared by four insertps calls (one per vector
; argument), so the splat is materialized once in xmm4 and reused; the four
; results are then summed pairwise.
define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
; X32-LABEL: insertps_from_broadcast_multiple_use:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; X32-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,0,0,0]
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; X32-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; X32-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
; X32-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; X32-NEXT:    addps %xmm1, %xmm0
; X32-NEXT:    addps %xmm2, %xmm3
; X32-NEXT:    addps %xmm3, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_broadcast_multiple_use:
; X64:       ## BB#0:
; X64-NEXT:    movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; X64-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,0,0,0]
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; X64-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; X64-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
; X64-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; X64-NEXT:    addps %xmm1, %xmm0
; X64-NEXT:    addps %xmm2, %xmm3
; X64-NEXT:    addps %xmm3, %xmm0
; X64-NEXT:    retq
  %1 = getelementptr inbounds float, float* %fb, i64 %index
  %2 = load float, float* %1, align 4
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
  %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
  %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
  %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
  %11 = fadd <4 x float> %7, %8
  %12 = fadd <4 x float> %9, %10
  %13 = fadd <4 x float> %11, %12
  ret <4 x float> %13
}

; Shuffle mask <4, undef, 0, 7>: lane 0 comes from the loaded scalar and
; lanes 1 is undef, so this is lowered as unpcklpd (plus a movapd copy back
; to xmm0) rather than insertps.
define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) {
; X32-LABEL: insertps_with_undefs:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X32-NEXT:    movapd %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: insertps_with_undefs:
; X64:       ## BB#0:
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-NEXT:    movapd %xmm1, %xmm0
; X64-NEXT:    retq
  %1 = load float, float* %b, align 4
  %2 = insertelement <4 x float> undef, float %1, i32 0
  %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 4, i32 undef, i32 0, i32 7>
  ret <4 x float> %result
}

; Test for a bug in X86ISelLowering.cpp:getINSERTPS where we were using
; the destination index to change the load, instead of the source index.
; The folded load must be offset by the SOURCE element index (mem[2]), not
; the destination index; lane 1 is zeroed via the insertps zero mask.
define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
; X32-LABEL: pr20087:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[2]
; X32-NEXT:    retl
;
; X64-LABEL: pr20087:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[2]
; X64-NEXT:    retq
  %load = load <4 x float> , <4 x float> *%ptr
  %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2>
  ret <4 x float> %ret
}

; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1>;
; it is lowered as pshufd + pblendw and stored unaligned through the i32*.
define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32* noalias nocapture %RET) #1 {
; X32-LABEL: insertps_pr20411:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X32-NEXT:    movdqu %xmm1, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: insertps_pr20411:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X64-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X64-NEXT:    movdqu %xmm1, (%rdi)
; X64-NEXT:    retq
  %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 0, i32 7, i32 undef, i32 undef>
  %ptrcast = bitcast i32* %RET to <4 x i32>*
  store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4
  ret void
}

; Build-vector <A[0], 0, B[2], 0>: both zero lanes fold into the insertps
; zero mask in a single instruction.
define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_4:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_4:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
  %vecext2 = extractelement <4 x float> %B, i32 2
  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
}

; Build-vector <A[0], B[1], 0, 0>: zero mask zeroes the two upper lanes.
define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_5:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_5:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %B, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
}

; Build-vector <0, A[1], B[2], 0>: zero lanes at both ends of the mask.
define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_6:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_6:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 1
  %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
  %vecext1 = extractelement <4 x float> %B, i32 2
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit3
}

; Build-vector <A[0], 0, B[1], 0>.
define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_7:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_7:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
  %vecext2 = extractelement <4 x float> %B, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
}

; Build-vector <A[0], B[0], 0, 0>.
define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_8:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_8:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %B, i32 0
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
}

; Build-vector <0, A[0], B[2], 0>: here insertps writes into %B's register
; (xmm1), so a movaps copies the result back to the return register.
define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_9:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
; X32-NEXT:    movaps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: insertps_9:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
; X64-NEXT:    movaps %xmm1, %xmm0
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
  %vecext1 = extractelement <4 x float> %B, i32 2
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit3
}

; Same element of %A inserted into lanes 0 and 2 of a zero vector, matched
; as a single insertps with the zero mask covering lanes 1 and 3.
define <4 x float> @insertps_10(<4 x float> %A)
; X32-LABEL: insertps_10:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_10:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero
; X64-NEXT:    retq
{
  %vecext = extractelement <4 x float> %A, i32 0
  %vecbuild1 = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %vecext, i32 0
  %vecbuild2 = insertelement <4 x float> %vecbuild1, float %vecext, i32 2
  ret <4 x float> %vecbuild2
}

; A build-vector followed by a shuffle collapses to a single blendps
; against a zeroed register.
define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) {
; X32-LABEL: build_vector_to_shuffle_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    xorps %xmm1, %xmm1
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; X32-NEXT:    retl
;
; X64-LABEL: build_vector_to_shuffle_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 1
  %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x float> %vecinit3
}

; Build-vector <0, A[1], 0, 0> (no trailing shuffle this time) also
; becomes a blendps against a zeroed register.
define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) {
; X32-LABEL: build_vector_to_shuffle_2:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    xorps %xmm1, %xmm1
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: build_vector_to_shuffle_2:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 1
  %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
  ret <4 x float> %vecinit1
}