//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend: SSE-specific stuff.
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//

SSE variable shift can be custom lowered to something like this, which uses a
small table + unaligned load + shuffle instead of going through memory.

__m128i_shift_right:
        .byte    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
        .byte   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1

...
__m128i shift_right(__m128i value, unsigned long offset) {
  return _mm_shuffle_epi8(value,
             _mm_loadu_si128((__m128i *) (__m128i_shift_right + offset)));
}

//===---------------------------------------------------------------------===//

SSE has instructions for doing operations on complex numbers; we should pattern
match them.  For example, this should turn into a horizontal add:

typedef float __attribute__((vector_size(16))) v4f32;
float f32(v4f32 A) {
  return A[0]+A[1]+A[2]+A[3];
}

Instead we get this:

_f32:                                   ## @f32
        pshufd  $1, %xmm0, %xmm1        ## xmm1 = xmm0[1,0,0,0]
        addss   %xmm0, %xmm1
        pshufd  $3, %xmm0, %xmm2        ## xmm2 = xmm0[3,0,0,0]
        movhlps %xmm0, %xmm0            ## xmm0 = xmm0[1,1]
        movaps  %xmm0, %xmm3
        addss   %xmm1, %xmm3
        movdqa  %xmm2, %xmm0
        addss   %xmm3, %xmm0
        ret

Also, there are cases where some simple local SLP would improve codegen a bit.
Compiling this:

_Complex float f32(_Complex float A, _Complex float B) {
  return A+B;
}

into:

_f32:                                   ## @f32
        movdqa    %xmm0, %xmm2
        addss     %xmm1, %xmm2
        pshufd    $1, %xmm1, %xmm1      ## xmm1 = xmm1[1,0,0,0]
        pshufd    $1, %xmm0, %xmm3      ## xmm3 = xmm0[1,0,0,0]
        addss     %xmm1, %xmm3
        movaps    %xmm2, %xmm0
        unpcklps  %xmm3, %xmm0          ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
        ret

seems silly when it could just be one addps.

//===---------------------------------------------------------------------===//

Expand libm rounding functions inline: significant speedups are possible.
http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html

//===---------------------------------------------------------------------===//

When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
other fast SSE modes.

//===---------------------------------------------------------------------===//

Think about doing i64 math in SSE regs on x86-32.

//===---------------------------------------------------------------------===//

This testcase should have no SSE instructions in it, and only one load from
a constant pool:

double %test3(bool %B) {
        %C = select bool %B, double 123.412, double 523.01123123
        ret double %C
}

Currently, the select is being lowered, which prevents the dag combiner from
turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'.

The pattern isel got this one right.
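A source-level sketch of the desired result (hypothetical C, just for
illustration): select the constant's address, then do a single load, instead
of loading both constants and selecting between the values.

double test3(int B) {
  /* One constant-pool table; the select only picks an address, and a single
     load follows.  No SSE select and no branch are needed. */
  static const double CP[2] = { 523.01123123, 123.412 };
  return CP[B != 0];
}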
93 94//===---------------------------------------------------------------------===// 95 96SSE should implement 'select_cc' using 'emulated conditional moves' that use 97pcmp/pand/pandn/por to do a selection instead of a conditional branch: 98 99double %X(double %Y, double %Z, double %A, double %B) { 100 %C = setlt double %A, %B 101 %z = fadd double %Z, 0.0 ;; select operand is not a load 102 %D = select bool %C, double %Y, double %z 103 ret double %D 104} 105 106We currently emit: 107 108_X: 109 subl $12, %esp 110 xorpd %xmm0, %xmm0 111 addsd 24(%esp), %xmm0 112 movsd 32(%esp), %xmm1 113 movsd 16(%esp), %xmm2 114 ucomisd 40(%esp), %xmm1 115 jb LBB_X_2 116LBB_X_1: 117 movsd %xmm0, %xmm2 118LBB_X_2: 119 movsd %xmm2, (%esp) 120 fldl (%esp) 121 addl $12, %esp 122 ret 123 124//===---------------------------------------------------------------------===// 125 126Lower memcpy / memset to a series of SSE 128 bit move instructions when it's 127feasible. 128 129//===---------------------------------------------------------------------===// 130 131Codegen: 132 if (copysign(1.0, x) == copysign(1.0, y)) 133into: 134 if (x^y & mask) 135when using SSE. 136 137//===---------------------------------------------------------------------===// 138 139Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half 140of a v4sf value. 141 142//===---------------------------------------------------------------------===// 143 144Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}. 145Perhaps use pxor / xorp* to clear a XMM register first? 146 147//===---------------------------------------------------------------------===// 148 149External test Nurbs exposed some problems. Look for 150__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc 151emits: 152 153 movaps (%edx), %xmm2 #59.21 154 movaps (%edx), %xmm5 #60.21 155 movaps (%edx), %xmm4 #61.21 156 movaps (%edx), %xmm3 #62.21 157 movl 40(%ecx), %ebp #69.49 158 shufps $0, %xmm2, %xmm5 #60.21 159 movl 100(%esp), %ebx #69.20 160 movl (%ebx), %edi #69.20 161 imull %ebp, %edi #69.49 162 addl (%eax), %edi #70.33 163 shufps $85, %xmm2, %xmm4 #61.21 164 shufps $170, %xmm2, %xmm3 #62.21 165 shufps $255, %xmm2, %xmm2 #63.21 166 lea (%ebp,%ebp,2), %ebx #69.49 167 negl %ebx #69.49 168 lea -3(%edi,%ebx), %ebx #70.33 169 shll $4, %ebx #68.37 170 addl 32(%ecx), %ebx #68.37 171 testb $15, %bl #91.13 172 jne L_B1.24 # Prob 5% #91.13 173 174This is the llvm code after instruction scheduling: 175 176cond_next140 (0xa910740, LLVM BB @0xa90beb0): 177 %reg1078 = MOV32ri -3 178 %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0 179 %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40 180 %reg1080 = IMUL32rr %reg1079, %reg1037 181 %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0 182 %reg1038 = LEA32r %reg1081, 1, %reg1080, -3 183 %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32 184 %reg1082 = SHL32ri %reg1038, 4 185 %reg1039 = ADD32rr %reg1036, %reg1082 186 %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0 187 %reg1034 = SHUFPSrr %reg1083, %reg1083, 170 188 %reg1032 = SHUFPSrr %reg1083, %reg1083, 0 189 %reg1035 = SHUFPSrr %reg1083, %reg1083, 255 190 %reg1033 = SHUFPSrr %reg1083, %reg1083, 85 191 %reg1040 = MOV32rr %reg1039 192 %reg1084 = AND32ri8 %reg1039, 15 193 CMP32ri8 %reg1084, 0 194 JE mbb<cond_next204,0xa914d30> 195 196Still ok. 
After register allocation:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %EAX = MOV32ri -3
        %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
        ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
        %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
        %EDX = MOV32rm %EDX, 1, %NOREG, 40
        IMUL32rr %EAX<def&use>, %EDX
        %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
        %ESI = MOV32rm %ESI, 1, %NOREG, 0
        MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
        %EAX = LEA32r %ESI, 1, %EAX, -3
        %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
        %ESI = MOV32rm %ESI, 1, %NOREG, 32
        %EDI = MOV32rr %EAX
        SHL32ri %EDI<def&use>, 4
        ADD32rr %EDI<def&use>, %ESI
        %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
        %XMM1 = MOVAPSrr %XMM0
        SHUFPSrr %XMM1<def&use>, %XMM1, 170
        %XMM2 = MOVAPSrr %XMM0
        SHUFPSrr %XMM2<def&use>, %XMM2, 0
        %XMM3 = MOVAPSrr %XMM0
        SHUFPSrr %XMM3<def&use>, %XMM3, 255
        SHUFPSrr %XMM0<def&use>, %XMM0, 85
        %EBX = MOV32rr %EDI
        AND32ri8 %EBX<def&use>, 15
        CMP32ri8 %EBX, 0
        JE mbb<cond_next204,0xa914d30>

This looks really bad.  The problem is that shufps is a destructive opcode:
because the same value appears as the second operand of more than one shufps,
a number of copies result.  Note that icc suffers from the same problem.
Either the instruction selector should select pshufd, or the register
allocator should perform the two-address to three-address transformation.

It also exposes some other problems.  See MOV32ri -3 and the spills.

//===---------------------------------------------------------------------===//

Consider:

__m128 test(float a) {
  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
}

This compiles into:

movss 4(%esp), %xmm1
mulss %xmm1, %xmm1
xorps %xmm0, %xmm0
movss %xmm1, %xmm0
ret

Because movss from memory zeros the top three elements and mulss doesn't
modify them, the top elements of xmm1 are already zero'd.  We could compile
this to:

movss 4(%esp), %xmm0
mulss %xmm0, %xmm0
ret

//===---------------------------------------------------------------------===//

Here's a sick and twisted idea.  Consider code like this:

__m128 test(__m128 a) {
  float b = *(float*)&a;
  ...
  return _mm_set_ps(0.0, 0.0, 0.0, b);
}

This might compile to this code:

movaps c(%esp), %xmm1
xorps %xmm0, %xmm0
movss %xmm1, %xmm0
ret

Now consider if the ... code caused xmm1 to get spilled.  This might produce
this code:

movaps c(%esp), %xmm1
movaps %xmm1, c2(%esp)
...

xorps %xmm0, %xmm0
movaps c2(%esp), %xmm1
movss %xmm1, %xmm0
ret

However, since the reload is only used by these instructions, we could
"fold" it into the uses, producing something like this:

movaps c(%esp), %xmm1
movaps %xmm1, c2(%esp)
...

movss c2(%esp), %xmm0
ret

... saving two instructions.

The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in three zeros and the one element instead of all four
elements.  This can be used to simplify a variety of shuffle operations where
the other elements are known zeros.
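For reference, a minimal C-level sketch of the property being exploited here
(hypothetical helper, not part of the testcase): a 4-byte load via movss
brings in one element and three zeros.

#include <xmmintrin.h>

/* Sketch: _mm_load_ss compiles to a single movss from memory and yields
   [ *p, 0, 0, 0 ], which is why a one-element reload can also stand in for
   shuffles whose other lanes are known zeros. */
__m128 load_low_and_zero_rest(const float *p) {
  return _mm_load_ss(p);
}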
303 304//===---------------------------------------------------------------------===// 305 306This code generates ugly code, probably due to costs being off or something: 307 308define void @test(float* %P, <4 x float>* %P2 ) { 309 %xFloat0.688 = load float* %P 310 %tmp = load <4 x float>* %P2 311 %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3 312 store <4 x float> %inFloat3.713, <4 x float>* %P2 313 ret void 314} 315 316Generates: 317 318_test: 319 movl 8(%esp), %eax 320 movaps (%eax), %xmm0 321 pxor %xmm1, %xmm1 322 movaps %xmm0, %xmm2 323 shufps $50, %xmm1, %xmm2 324 shufps $132, %xmm2, %xmm0 325 movaps %xmm0, (%eax) 326 ret 327 328Would it be better to generate: 329 330_test: 331 movl 8(%esp), %ecx 332 movaps (%ecx), %xmm0 333 xor %eax, %eax 334 pinsrw $6, %eax, %xmm0 335 pinsrw $7, %eax, %xmm0 336 movaps %xmm0, (%ecx) 337 ret 338 339? 340 341//===---------------------------------------------------------------------===// 342 343Some useful information in the Apple Altivec / SSE Migration Guide: 344 345http://developer.apple.com/documentation/Performance/Conceptual/ 346Accelerate_sse_migration/index.html 347 348e.g. SSE select using and, andnot, or. Various SSE compare translations. 349 350//===---------------------------------------------------------------------===// 351 352Add hooks to commute some CMPP operations. 353 354//===---------------------------------------------------------------------===// 355 356Apply the same transformation that merged four float into a single 128-bit load 357to loads from constant pool. 358 359//===---------------------------------------------------------------------===// 360 361Floating point max / min are commutable when -enable-unsafe-fp-path is 362specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other 363nodes which are selected to max / min instructions that are marked commutable. 364 365//===---------------------------------------------------------------------===// 366 367We should materialize vector constants like "all ones" and "signbit" with 368code like: 369 370 cmpeqps xmm1, xmm1 ; xmm1 = all-ones 371 372and: 373 cmpeqps xmm1, xmm1 ; xmm1 = all-ones 374 psrlq xmm1, 31 ; xmm1 = all 100000000000... 375 376instead of using a load from the constant pool. The later is important for 377ABS/NEG/copysign etc. 378 379//===---------------------------------------------------------------------===// 380 381These functions: 382 383#include <xmmintrin.h> 384__m128i a; 385void x(unsigned short n) { 386 a = _mm_slli_epi32 (a, n); 387} 388void y(unsigned n) { 389 a = _mm_slli_epi32 (a, n); 390} 391 392compile to ( -O3 -static -fomit-frame-pointer): 393_x: 394 movzwl 4(%esp), %eax 395 movd %eax, %xmm0 396 movaps _a, %xmm1 397 pslld %xmm0, %xmm1 398 movaps %xmm1, _a 399 ret 400_y: 401 movd 4(%esp), %xmm0 402 movaps _a, %xmm1 403 pslld %xmm0, %xmm1 404 movaps %xmm1, _a 405 ret 406 407"y" looks good, but "x" does silly movzwl stuff around into a GPR. It seems 408like movd would be sufficient in both cases as the value is already zero 409extended in the 32-bit stack slot IIRC. For signed short, it should also be 410save, as a really-signed value would be undefined for pslld. 
//===---------------------------------------------------------------------===//

#include <math.h>
int t1(double d) { return signbit(d); }

This currently compiles to:
        subl    $12, %esp
        movsd   16(%esp), %xmm0
        movsd   %xmm0, (%esp)
        movl    4(%esp), %eax
        shrl    $31, %eax
        addl    $12, %esp
        ret

We should use movmskp{s|d} instead.

//===---------------------------------------------------------------------===//

CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
(aligned) vector load.  This functionality has a couple of problems.

1. The code to infer alignment from loads of globals is in the X86 backend,
   not the dag combiner.  This is because dagcombine2 needs to be able to see
   through the X86ISD::Wrapper node, which DAGCombine can't really do.
2. The code for turning 4 x load into a single vector load is target
   independent and should be moved to the dag combiner.
3. The code for turning 4 x load into a vector load can only handle a direct
   load from a global or a direct load from the stack.  It should be
   generalized to handle any load from P, P+4, P+8, P+12, where P can be
   anything.
4. The alignment inference code cannot handle loads from globals in non-static
   mode because it doesn't look through the extra dyld stub load.  If you try
   vec_align.ll without -relocation-model=static, you'll see what I mean.

//===---------------------------------------------------------------------===//

We should lower store(fneg(load p), q) into an integer load+xor+store, which
eliminates a constant pool load.  For example, consider:

define i64 @ccosf(float %z.0, float %z.1) nounwind readonly {
entry:
  %tmp6 = fsub float -0.000000e+00, %z.1          ; <float> [#uses=1]
  %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
  ret i64 %tmp20
}
declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly

This currently compiles to:

LCPI1_0:                                #  <4 x float>
        .long   2147483648      # float -0
        .long   2147483648      # float -0
        .long   2147483648      # float -0
        .long   2147483648      # float -0
_ccosf:
        subl    $12, %esp
        movss   16(%esp), %xmm0
        movss   %xmm0, 4(%esp)
        movss   20(%esp), %xmm0
        xorps   LCPI1_0, %xmm0
        movss   %xmm0, (%esp)
        call    L_ccoshf$stub
        addl    $12, %esp
        ret

Note the load into xmm0, then xor (to negate), then store.  In PIC mode,
this code computes the pic base and does two loads to do the constant pool
load, so the improvement is much bigger.

The tricky part about this xform is that the argument load/store isn't exposed
until post-legalize, and at that point, the fneg has been custom expanded into
an X86 fxor.  This means that we need to handle this case in the x86 backend
instead of in target independent code.

//===---------------------------------------------------------------------===//

Non-SSE4 insert into 16 x i8 is atrociously bad.

//===---------------------------------------------------------------------===//

<2 x i64> extract is substantially worse than <2 x f64>, even if the
destination is memory.

//===---------------------------------------------------------------------===//

SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
sitting between the truncate and the extract.
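A hedged sketch of the kind of source that hits this (hypothetical function,
assuming SSE4.1 intrinsics): the truncating store should become a single
pextrb to memory, but the AssertZext between the extract and the truncate
currently defeats the pattern.

#include <smmintrin.h>

/* Sketch: ideally one "pextrb $3, %xmm0, (mem)", but the extract currently
   goes through a GPR. */
void store_lane3(unsigned char *p, __m128i v) {
  *p = (unsigned char) _mm_extract_epi8(v, 3);
}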
499 500//===---------------------------------------------------------------------===// 501 502INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert 503any number of 0.0 simultaneously. Currently we only use it for simple 504insertions. 505 506See comments in LowerINSERT_VECTOR_ELT_SSE4. 507 508//===---------------------------------------------------------------------===// 509 510On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not 511Custom. All combinations of insert/extract reg-reg, reg-mem, and mem-reg are 512legal, it'll just take a few extra patterns written in the .td file. 513 514Note: this is not a code quality issue; the custom lowered code happens to be 515right, but we shouldn't have to custom lower anything. This is probably related 516to <2 x i64> ops being so bad. 517 518//===---------------------------------------------------------------------===// 519 520'select' on vectors and scalars could be a whole lot better. We currently 521lower them to conditional branches. On x86-64 for example, we compile this: 522 523double test(double a, double b, double c, double d) { return a<b ? c : d; } 524 525to: 526 527_test: 528 ucomisd %xmm0, %xmm1 529 ja LBB1_2 # entry 530LBB1_1: # entry 531 movapd %xmm3, %xmm2 532LBB1_2: # entry 533 movapd %xmm2, %xmm0 534 ret 535 536instead of: 537 538_test: 539 cmpltsd %xmm1, %xmm0 540 andpd %xmm0, %xmm2 541 andnpd %xmm3, %xmm0 542 orpd %xmm2, %xmm0 543 ret 544 545For unpredictable branches, the later is much more efficient. This should 546just be a matter of having scalar sse map to SELECT_CC and custom expanding 547or iseling it. 548 549//===---------------------------------------------------------------------===// 550 551LLVM currently generates stack realignment code, when it is not necessary 552needed. The problem is that we need to know about stack alignment too early, 553before RA runs. 554 555At that point we don't know, whether there will be vector spill, or not. 556Stack realignment logic is overly conservative here, but otherwise we can 557produce unaligned loads/stores. 558 559Fixing this will require some huge RA changes. 560 561Testcase: 562#include <emmintrin.h> 563 564typedef short vSInt16 __attribute__ ((__vector_size__ (16))); 565 566static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873, 567- 22725, - 12873};; 568 569vSInt16 madd(vSInt16 b) 570{ 571 return _mm_madd_epi16(a, b); 572} 573 574Generated code (x86-32, linux): 575madd: 576 pushl %ebp 577 movl %esp, %ebp 578 andl $-16, %esp 579 movaps .LCPI1_0, %xmm1 580 pmaddwd %xmm1, %xmm0 581 movl %ebp, %esp 582 popl %ebp 583 ret 584 585//===---------------------------------------------------------------------===// 586 587Consider: 588#include <emmintrin.h> 589__m128 foo2 (float x) { 590 return _mm_set_ps (0, 0, x, 0); 591} 592 593In x86-32 mode, we generate this spiffy code: 594 595_foo2: 596 movss 4(%esp), %xmm0 597 pshufd $81, %xmm0, %xmm0 598 ret 599 600in x86-64 mode, we generate this code, which could be better: 601 602_foo2: 603 xorps %xmm1, %xmm1 604 movss %xmm0, %xmm1 605 pshufd $81, %xmm1, %xmm0 606 ret 607 608In sse4 mode, we could use insertps to make both better. 
Here's another testcase that could use insertps [mem]:

#include <xmmintrin.h>
extern float x2, x3;
__m128 foo1 (float x1, float x4) {
  return _mm_set_ps (x2, x1, x3, x4);
}

gcc mainline compiles it to:

foo1:
        insertps        $0x10, x2(%rip), %xmm0
        insertps        $0x10, x3(%rip), %xmm1
        movaps  %xmm1, %xmm2
        movlhps %xmm0, %xmm2
        movaps  %xmm2, %xmm0
        ret

//===---------------------------------------------------------------------===//

We compile vector multiply-by-constant into poor code:

define <4 x i32> @f(<4 x i32> %i) nounwind {
        %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
        ret <4 x i32> %A
}

On targets without SSE4.1, this compiles into:

LCPI1_0:                                ##  <4 x i32>
        .long   10
        .long   10
        .long   10
        .long   10
        .text
        .align  4,0x90
        .globl  _f
_f:
        pshufd  $3, %xmm0, %xmm1
        movd    %xmm1, %eax
        imull   LCPI1_0+12, %eax
        movd    %eax, %xmm1
        pshufd  $1, %xmm0, %xmm2
        movd    %xmm2, %eax
        imull   LCPI1_0+4, %eax
        movd    %eax, %xmm2
        punpckldq       %xmm1, %xmm2
        movd    %xmm0, %eax
        imull   LCPI1_0, %eax
        movd    %eax, %xmm1
        movhlps %xmm0, %xmm0
        movd    %xmm0, %eax
        imull   LCPI1_0+8, %eax
        movd    %eax, %xmm0
        punpckldq       %xmm0, %xmm1
        movaps  %xmm1, %xmm0
        punpckldq       %xmm2, %xmm0
        ret

It would be better to synthesize integer vector multiplication by constants
using shifts and adds, pslld and paddd here.  And even on targets with SSE4.1,
simple cases such as multiplication by powers of two would be better as
vector shifts than as multiplications.

//===---------------------------------------------------------------------===//

We compile this:

__m128i
foo2 (char x)
{
  return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
}

into:
        movl    $1, %eax
        xorps   %xmm0, %xmm0
        pinsrw  $2, %eax, %xmm0
        movzbl  4(%esp), %eax
        pinsrw  $3, %eax, %xmm0
        movl    $256, %eax
        pinsrw  $7, %eax, %xmm0
        ret

gcc-4.2:
        subl    $12, %esp
        movzbl  16(%esp), %eax
        movdqa  LC0, %xmm0
        pinsrw  $3, %eax, %xmm0
        addl    $12, %esp
        ret
        .const
        .align 4
LC0:
        .word   0
        .word   0
        .word   1
        .word   0
        .word   0
        .word   0
        .word   0
        .word   256

With SSE4, it should be:
        movdqa  .LC0(%rip), %xmm0
        pinsrb  $6, %edi, %xmm0

//===---------------------------------------------------------------------===//

We should transform a shuffle of two vectors of constants into a single vector
of constants.  Also, insertelement of a constant into a vector of constants
should also result in a vector of constants.  e.g. 2008-06-25-VecISelBug.ll.
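A hedged C-level sketch of the first folding opportunity (hypothetical
function, not the testcase itself): both shuffle operands are constant, so the
whole expression should fold to a single constant-pool load.

#include <xmmintrin.h>

/* Sketch: a shuffle of two constant vectors should constant-fold into one
   <4 x float> constant rather than being shuffled at run time. */
__m128 shuffled_constants(void) {
  __m128 a = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
  __m128 b = _mm_set_ps(7.0f, 6.0f, 5.0f, 4.0f);
  return _mm_shuffle_ps(a, b, 0x1B);
}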
The 2008-06-25-VecISelBug.ll testcase itself currently compiles to something
horrible:

        .align  4
LCPI1_1:                                ##  float
        .long   1065353216      ## float 1
        .const

        .align  4
LCPI1_0:                                ##  <4 x float>
        .space  4
        .long   1065353216      ## float 1
        .space  4
        .long   1065353216      ## float 1
        .text
        .align  4,0x90
        .globl  _t
_t:
        xorps   %xmm0, %xmm0
        movhps  LCPI1_0, %xmm0
        movss   LCPI1_1, %xmm1
        movaps  %xmm0, %xmm2
        shufps  $2, %xmm1, %xmm2
        shufps  $132, %xmm2, %xmm0
        movaps  %xmm0, 0

//===---------------------------------------------------------------------===//
rdar://5907648

This function:

float foo(unsigned char x) {
  return x;
}

compiles to this IR (x86-32):

define float @foo(i8 zeroext %x) nounwind {
        %tmp12 = uitofp i8 %x to float          ; <float> [#uses=1]
        ret float %tmp12
}

which compiles to:

_foo:
        subl    $4, %esp
        movzbl  8(%esp), %eax
        cvtsi2ss        %eax, %xmm0
        movss   %xmm0, (%esp)
        flds    (%esp)
        addl    $4, %esp
        ret

We should be able to use:
        cvtsi2ss 8(%esp), %xmm0
since we know the stack slot is already zext'd.

//===---------------------------------------------------------------------===//

Consider using movlps instead of movsd to implement
(scalar_to_vector (loadf64)) when code size is critical.  movlps is slower
than movsd on core2, but it's one byte shorter.

//===---------------------------------------------------------------------===//

We should use a dynamic programming based approach to tell when using FPStack
operations is cheaper than SSE.  SciMark montecarlo contains code like this
for example:

double MonteCarlo_num_flops(int Num_samples) {
    return ((double) Num_samples) * 4.0;
}

In fpstack mode, this compiles into:

LCPI1_0:
        .long   1082130432      ## float 4.000000e+00
_MonteCarlo_num_flops:
        subl    $4, %esp
        movl    8(%esp), %eax
        movl    %eax, (%esp)
        fildl   (%esp)
        fmuls   LCPI1_0
        addl    $4, %esp
        ret

In SSE mode, it compiles into significantly slower code:

_MonteCarlo_num_flops:
        subl    $12, %esp
        cvtsi2sd        16(%esp), %xmm0
        mulsd   LCPI1_0, %xmm0
        movsd   %xmm0, (%esp)
        fldl    (%esp)
        addl    $12, %esp
        ret

There are also other cases in SciMark where using fpstack is better: it is
cheaper to do fld1 than to load from a constant pool, for example, so
"load, add 1.0, store" is better done on the fp stack, etc.

//===---------------------------------------------------------------------===//

The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
"cmpsd".  For example, this code:

double d1(double x) { return x == x ? x : x + x; }

Compiles into:

_d1:
        ucomisd %xmm0, %xmm0
        jnp     LBB1_2
        addsd   %xmm0, %xmm0
        ret
LBB1_2:
        ret

Also, the 'ret's should be shared.  This is PR6032.

//===---------------------------------------------------------------------===//

These should compile into the same code (PR6214); perhaps instcombine should
canonicalize the former into the latter?
define float @foo(float %x) nounwind {
  %t = bitcast float %x to i32
  %s = and i32 %t, 2147483647
  %d = bitcast i32 %s to float
  ret float %d
}

declare float @fabsf(float %n)
define float @bar(float %x) nounwind {
  %d = call float @fabsf(float %x)
  ret float %d
}

//===---------------------------------------------------------------------===//

This IR (from PR6194):

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-darwin10.0.0"

%0 = type { double, double }
%struct.float3 = type { float, float, float }

define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
entry:
  %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
  %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
  %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
  %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
  %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
  %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
  %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
  store float %tmp12, float* %tmp5
  ret void
}

Compiles to:

_test:                                  ## @test
        movd    %xmm0, %rax
        shrq    $32, %rax
        movl    %eax, 4(%rdi)
        ret

This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
doing a shuffle from v[1] to v[0] then a float store.

//===---------------------------------------------------------------------===//

On SSE4 machines, we compile this code:

define <2 x float> @test2(<2 x float> %Q, <2 x float> %R,
                          <2 x float> *%P) nounwind {
  %Z = fadd <2 x float> %Q, %R

  store <2 x float> %Z, <2 x float> *%P
  ret <2 x float> %Z
}

into:

_test2:                                 ## @test2
## BB#0:
        insertps        $0, %xmm2, %xmm2
        insertps        $16, %xmm3, %xmm2
        insertps        $0, %xmm0, %xmm3
        insertps        $16, %xmm1, %xmm3
        addps   %xmm2, %xmm3
        movq    %xmm3, (%rdi)
        movaps  %xmm3, %xmm0
        pshufd  $1, %xmm3, %xmm1
        ## kill: XMM1<def> XMM1<kill>
        ret

The insertps's of $0 are pointless complex copies.

//===---------------------------------------------------------------------===//

If SSE4.1 is available we should inline rounding functions instead of emitting
a libcall.

floor: roundsd $0x01, %xmm, %xmm
ceil:  roundsd $0x02, %xmm, %xmm

and likewise for the single precision versions.

Currently, SelectionDAGBuilder doesn't turn calls to these functions into the
corresponding nodes and some targets (including X86) aren't ready for them.

//===---------------------------------------------------------------------===//
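As a source-level reference for the preceding note, a hedged sketch of the
same expansion written with the SSE4.1 intrinsics (hypothetical helpers,
assuming <smmintrin.h> is available):

#include <smmintrin.h>

/* Sketch: the inlined expansion the note asks for, expressed with
   intrinsics; each helper should boil down to a single roundsd. */
double my_floor(double x) {
  __m128d v = _mm_set_sd(x);
  return _mm_cvtsd_f64(_mm_floor_sd(v, v));   /* roundsd $0x01 */
}
double my_ceil(double x) {
  __m128d v = _mm_set_sd(x);
  return _mm_cvtsd_f64(_mm_ceil_sd(v, v));    /* roundsd $0x02 */
}

//===---------------------------------------------------------------------===//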