//===- README.txt - Notes for improving PowerPC-specific code gen ---------===//

TODO:
* gpr0 allocation
* implement do-loop -> bdnz transform
* lmw/stmw pass a la arm load store optimizer for prolog/epilog

===-------------------------------------------------------------------------===

On PPC64, this:

long f2 (long x) { return 0xfffffff000000000UL; }
long f3 (long x) { return 0x1ffffffffUL; }

could compile into:

_f2:
    li r3,-1
    rldicr r3,r3,0,27
    blr
_f3:
    li r3,-1
    rldicl r3,r3,0,31
    blr

we produce:

_f2:
    lis r2, 4095
    ori r2, r2, 65535
    sldi r3, r2, 36
    blr
_f3:
    li r2, 1
    sldi r2, r2, 32
    oris r2, r2, 65535
    ori r3, r2, 65535
    blr
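
Both constants are a single contiguous run of ones, which is why -1 plus one
rotate-and-clear (rldicr/rldicl) is enough.  A quick host-side sanity check of
that observation (just a sketch, not backend code):

#include <assert.h>

int main(void) {
  /* rldicr r3,r3,0,27 on r3 = -1: keep the 28 most significant bits. */
  assert(0xfffffff000000000ULL == (~0ULL << 36));
  /* rldicl r3,r3,0,31 on r3 = -1: keep the 33 least significant bits. */
  assert(0x1ffffffffULL == (~0ULL >> 31));
  return 0;
}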

===-------------------------------------------------------------------------===

This code:

unsigned add32carry(unsigned sum, unsigned x) {
  unsigned z = sum + x;
  if (sum + x < x)
    z++;
  return z;
}

Should compile to something like:

    addc r3,r3,r4
    addze r3,r3

instead we get:

    add r3, r4, r3
    cmplw cr7, r3, r4
    mfcr r4 ; 1
    rlwinm r4, r4, 29, 31, 31
    add r3, r3, r4

Ick.

===-------------------------------------------------------------------------===

Support 'update' load/store instructions.  These are cracked on the G5, but are
still a codesize win.

With preinc enabled, this:

long *%test4(long *%X, long *%dest) {
    %Y = getelementptr long* %X, int 4
    %A = load long* %Y
    store long %A, long* %dest
    ret long* %Y
}

compiles to:

_test4:
    mr r2, r3
    lwzu r5, 32(r2)
    lwz r3, 36(r3)
    stw r5, 0(r4)
    stw r3, 4(r4)
    mr r3, r2
    blr

with -sched=list-burr, I get:

_test4:
    lwz r2, 36(r3)
    lwzu r5, 32(r3)
    stw r2, 4(r4)
    stw r5, 0(r4)
    blr

===-------------------------------------------------------------------------===

We compile the hottest inner loop of viterbi to:

    li r6, 0
    b LBB1_84        ;bb432.i
LBB1_83:             ;bb420.i
    lbzx r8, r5, r7
    addi r6, r7, 1
    stbx r8, r4, r7
LBB1_84:             ;bb432.i
    mr r7, r6
    cmplwi cr0, r7, 143
    bne cr0, LBB1_83 ;bb420.i

The CBE manages to produce:

    li r0, 143
    mtctr r0
loop:
    lbzx r2, r2, r11
    stbx r0, r2, r9
    addi r2, r2, 1
    bdz later
    b loop

This could be much better (bdnz instead of bdz) but it still beats us.  If we
produced this with bdnz, the loop would be a single dispatch group.

===-------------------------------------------------------------------------===

Compile:

void foo(int *P) {
  if (P)  *P = 0;
}

into:

_foo:
    cmpwi cr0,r3,0
    beqlr cr0
    li r0,0
    stw r0,0(r3)
    blr

This is effectively a simple form of predication.

===-------------------------------------------------------------------------===

Lump the constant pool for each function into ONE pic object, and reference
pieces of it as offsets from the start.  For functions like this (contrived
to have lots of constants obviously):

double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }

We generate:

_X:
    lis r2, ha16(.CPI_X_0)
    lfd f0, lo16(.CPI_X_0)(r2)
    lis r2, ha16(.CPI_X_1)
    lfd f2, lo16(.CPI_X_1)(r2)
    fmadd f0, f1, f0, f2
    lis r2, ha16(.CPI_X_2)
    lfd f1, lo16(.CPI_X_2)(r2)
    lis r2, ha16(.CPI_X_3)
    lfd f2, lo16(.CPI_X_3)(r2)
    fmadd f1, f0, f1, f2
    blr

It would be better to materialize .CPI_X into a register, then use immediates
off of the register to avoid the lis's.  This is even more important in PIC
mode.

Note that this (and the static variable version) is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html

Here's another example (the sgn function):
double testf(double a) {
  return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
}

it produces a BB like this:
LBB1_1: ; cond_true
    lis r2, ha16(LCPI1_0)
    lfs f0, lo16(LCPI1_0)(r2)
    lis r2, ha16(LCPI1_1)
    lis r3, ha16(LCPI1_2)
    lfs f2, lo16(LCPI1_2)(r3)
    lfs f3, lo16(LCPI1_1)(r2)
    fsub f0, f0, f1
    fsel f1, f0, f2, f3
    blr

===-------------------------------------------------------------------------===

PIC Code Gen IPO optimization:

Squish small scalar globals together into a single global struct, allowing the
address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
of the GOT on targets with one).

Note that this is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html

===-------------------------------------------------------------------------===

Implement the Newton-Raphson method for improving the estimate instructions to
the correct accuracy, and implement divide as multiply by reciprocal when it
has more than one use.  Itanium would want this too.
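
Roughly, each refinement step doubles the number of correct bits in the
estimate, so a couple of steps on top of fres/frsqrte reach the accuracy needed
to replace fdiv/fsqrt.  A host-side sketch of the iteration (the function names
are made up; the initial estimates stand in for the hardware instructions):

#include <stdio.h>

static double refine_recip(double d, double x) {
  return x * (2.0 - d * x);             /* x1 = x0 * (2 - d*x0)       */
}

static double refine_rsqrt(double d, double x) {
  return x * (1.5 - 0.5 * d * x * x);   /* x1 = x0 * (1.5 - d/2*x0^2) */
}

int main(void) {
  double d = 3.0;
  double r = 0.33;                      /* pretend this came from fres    */
  double s = 0.57;                      /* pretend this came from frsqrte */
  int i;
  for (i = 0; i < 2; ++i) {
    r = refine_recip(d, r);
    s = refine_rsqrt(d, s);
  }
  printf("1/3 ~= %.17g, 1/sqrt(3) ~= %.17g\n", r, s);
  return 0;
}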

===-------------------------------------------------------------------------===

Compile offsets from allocas:

int *%test() {
    %X = alloca { int, int }
    %Y = getelementptr {int,int}* %X, int 0, uint 1
    ret int* %Y
}

into a single add, not two:

_test:
    addi r2, r1, -8
    addi r3, r2, 4
    blr

--> important for C++.

===-------------------------------------------------------------------------===

No loads or stores of the constants should be needed:

struct foo { double X, Y; };
void xxx(struct foo F);
void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }

===-------------------------------------------------------------------------===

Darwin Stub removal:

We still generate calls to foo$stub, and stubs, on Darwin.  This is not
necessary when building with the Leopard (10.5) or later linker, as stubs are
generated by ld when necessary.  Parameterizing this based on the deployment
target (-mmacosx-version-min) is probably enough.  x86-32 does this right, see
its logic.

===-------------------------------------------------------------------------===

Darwin Stub LICM optimization:

Loops like this:

  for (...)  bar();

have to go through an indirect stub if bar is external or linkonce.  It would
be better to compile it as:

  fp = &bar;
  for (...)  fp();

which only computes the address of bar once (instead of each time through the
stub).  This is Darwin specific and would have to be done in the code generator.
Probably not a win on x86.

===-------------------------------------------------------------------------===

Simple IPO for argument passing, change:
  void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)

The Darwin ABI specifies that any integer arguments in the first 32 bytes worth
of arguments get assigned to r3 through r10.  That is, if you have a function
foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
argument bytes for r4 and r5.  The trick then would be to shuffle the argument
order for functions we can internalize so that the maximum number of
integers/pointers get passed in regs before you see any of the fp arguments.

Instead of implementing this, it would actually probably be easier to just
implement a PPC fastcc, where we could do whatever we wanted to the CC,
including having this work sanely.

===-------------------------------------------------------------------------===

Fix Darwin FP-In-Integer Registers ABI

Darwin passes doubles in structures in integer registers, which is very very
bad.  Add something like a BITCAST to LLVM, then do an i-p transformation that
percolates these things out of functions.

Check out how horrible this is:
http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html

This is an extension of "interprocedural CC unmunging" that can't be done with
just fastcc.

===-------------------------------------------------------------------------===

Compile this:

int foo(int a) {
  int b = (a < 8);
  if (b) {
    return b * 3;     // ignore the fact that this is always 3.
  } else {
    return 2;
  }
}

into something not this:

_foo:
1)  cmpwi cr7, r3, 8
    mfcr r2, 1
    rlwinm r2, r2, 29, 31, 31
1)  cmpwi cr0, r3, 7
    bgt cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; then
    rlwinm r2, r2, 0, 31, 31
    mulli r3, r2, 3
    blr
LBB1_2: ; UnifiedReturnBlock
    li r3, 2
    blr

In particular, the two compares (marked 1) could be shared by reversing one.
This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
same operands (but backwards) exists.  In this case, this wouldn't save us
anything though, because the compares still wouldn't be shared.

===-------------------------------------------------------------------------===

We should custom expand setcc instead of pretending that we have it.  That
would allow us to expose the access of the crbit after the mfcr, allowing
that access to be trivially folded into other ops.  A simple example:

int foo(int a, int b) { return (a < b) << 4; }

compiles into:

_foo:
    cmpw cr7, r3, r4
    mfcr r2, 1
    rlwinm r2, r2, 29, 31, 31
    slwi r3, r2, 4
    blr

===-------------------------------------------------------------------------===

Fold add and sub with constant into non-extern, non-weak addresses so that this:

static int a;
void bar(int b) { a = b; }
void foo(unsigned char *c) {
  *c = a;
}

which currently compiles to:

_foo:
    lis r2, ha16(_a)
    la r2, lo16(_a)(r2)
    lbz r2, 3(r2)
    stb r2, 0(r3)
    blr

becomes:

_foo:
    lis r2, ha16(_a+3)
    lbz r2, lo16(_a+3)(r2)
    stb r2, 0(r3)
    blr

===-------------------------------------------------------------------------===

We generate really bad code for this:

int f(signed char *a, _Bool b, _Bool c) {
  signed char t = 0;
  if (b)  t = *a;
  if (c)  *a = t;
}

===-------------------------------------------------------------------------===

This:
int test(unsigned *P) { return *P >> 24; }

Should compile to:

_test:
    lbz r3,0(r3)
    blr

not:

_test:
    lwz r2, 0(r3)
    srwi r3, r2, 24
    blr
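
The narrowed load is legal because PPC is big-endian: the most significant byte
of the word is the byte at offset 0.  A source-level sketch of the desired
access (hypothetical function, just to show the equivalence):

int test_byte(unsigned *P) {
  /* On a big-endian target, *P >> 24 is the byte at offset 0,
     i.e. a single lbz. */
  return ((unsigned char *)P)[0];
}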

===-------------------------------------------------------------------------===

On the G5, logical CR operations are more expensive in their three
address form: ops that read/write the same register are half as expensive as
those that read from two registers that are different from their destination.

We should model this with two separate instructions.  The isel should generate
the "two address" form of the instructions.  When the register allocator
detects that it needs to insert a copy due to the two-address nature of the CR
logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
we can convert to the "three address" instruction, to save code space.

This only matters when we start generating cr logical ops.

===-------------------------------------------------------------------------===

We should compile these two functions to the same thing:

#include <stdlib.h>
void f(int a, int b, int *P) {
  *P = (a-b)>=0?(a-b):(b-a);
}
void g(int a, int b, int *P) {
  *P = abs(a-b);
}

Further, they should compile to something better than:

_g:
    subf r2, r4, r3
    subfic r3, r2, 0
    cmpwi cr0, r2, -1
    bgt cr0, LBB2_2 ; entry
LBB2_1: ; entry
    mr r2, r3
LBB2_2: ; entry
    stw r2, 0(r5)
    blr

GCC produces:

_g:
    subf r4,r4,r3
    srawi r2,r4,31
    xor r0,r2,r4
    subf r0,r2,r0
    stw r0,0(r5)
    blr

... which is much nicer.

This theoretically may help improve twolf slightly (used in dimbox.c:142?).
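
The GCC sequence is the standard branchless abs: compute a sign mask, xor, then
subtract.  A C sketch of the identity it relies on (assuming the usual
arithmetic right shift of negative ints, which is what srawi provides):

int abs_branchless(int x) {
  int m = x >> 31;      /* srawi: 0 if x >= 0, all ones if x < 0         */
  return (x ^ m) - m;   /* xor + subf: x if m == 0, (~x) + 1 == -x if not */
}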

===-------------------------------------------------------------------------===

PR5945: This:
define i32 @clamp0g(i32 %a) {
entry:
    %cmp = icmp slt i32 %a, 0
    %sel = select i1 %cmp, i32 0, i32 %a
    ret i32 %sel
}

is compiled to this with the PowerPC (32-bit) backend:

_clamp0g:
    cmpwi cr0, r3, 0
    li r2, 0
    blt cr0, LBB1_2
; BB#1:        ; %entry
    mr r2, r3
LBB1_2:        ; %entry
    mr r3, r2
    blr

This could be reduced to the much simpler:

_clamp0g:
    srawi r2, r3, 31
    andc r3, r3, r2
    blr
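
The shorter sequence is the same sign-mask trick as the abs example above; a C
sketch (again assuming arithmetic right shift of negative ints):

int clamp0_branchless(int a) {
  int m = a >> 31;      /* srawi: all ones only when a is negative    */
  return a & ~m;        /* andc: clears the result to 0 in that case  */
}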

===-------------------------------------------------------------------------===

int foo(int N, int ***W, int **TK, int X) {
  int t, i;

  for (t = 0; t < N; ++t)
    for (i = 0; i < 4; ++i)
      W[t / X][i][t % X] = TK[i][t];

  return 5;
}

We generate relatively atrocious code for this loop compared to gcc.

We could also strength reduce the rem and the div:
http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
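
Since t advances by one each iteration, t / X and t % X can be strength reduced
to a pair of counters that are updated rather than recomputed.  A source-level
sketch of that rewrite (hypothetical, assuming X > 0 as the original code
already does):

int foo_reduced(int N, int ***W, int **TK, int X) {
  int t, i;
  int q = 0, r = 0;               /* invariants: q == t / X, r == t % X    */

  for (t = 0; t < N; ++t) {
    for (i = 0; i < 4; ++i)
      W[q][i][r] = TK[i][t];
    if (++r == X) {               /* carry the remainder into the quotient */
      r = 0;
      ++q;
    }
  }
  return 5;
}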

===-------------------------------------------------------------------------===

float foo(float X) { return (int)(X); }

Currently produces:

_foo:
    fctiwz f0, f1
    stfd f0, -8(r1)
    lwz r2, -4(r1)
    extsw r2, r2
    std r2, -16(r1)
    lfd f0, -16(r1)
    fcfid f0, f0
    frsp f1, f0
    blr

We could use a target dag combine to turn the lwz/extsw into an lwa when the
lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
win only.

===-------------------------------------------------------------------------===

We generate ugly code for this:

void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
  unsigned code = 0;
  if(dx < -dw) code |= 1;
  if(dx > dw) code |= 2;
  if(dy < -dw) code |= 4;
  if(dy > dw) code |= 8;
  if(dz < -dw) code |= 16;
  if(dz > dw) code |= 32;
  *ret = code;
}

===-------------------------------------------------------------------------===

Complete the signed i32 to FP conversion code using 64-bit registers
transformation, good for PI.  See PPCISelLowering.cpp, this comment:

  // FIXME: disable this lowered code.  This generates 64-bit register values,
  // and we don't model the fact that the top part is clobbered by calls.  We
  // need to flag these together so that the value isn't live across a call.
  //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);

Also, if the registers are spilled to the stack, we have to ensure that all
64 bits of them are saved/restored, otherwise we will miscompile the code.  It
sounds like we need to get the 64-bit register classes going.

===-------------------------------------------------------------------------===

%struct.B = type { i8, [3 x i8] }

define void @bar(%struct.B* %b) {
entry:
    %tmp = bitcast %struct.B* %b to i32*          ; <uint*> [#uses=1]
    %tmp = load i32* %tmp                         ; <uint> [#uses=1]
    %tmp3 = bitcast %struct.B* %b to i32*         ; <uint*> [#uses=1]
    %tmp4 = load i32* %tmp3                       ; <uint> [#uses=1]
    %tmp8 = bitcast %struct.B* %b to i32*         ; <uint*> [#uses=2]
    %tmp9 = load i32* %tmp8                       ; <uint> [#uses=1]
    %tmp4.mask17 = shl i32 %tmp4, i8 1            ; <uint> [#uses=1]
    %tmp1415 = and i32 %tmp4.mask17, 2147483648   ; <uint> [#uses=1]
    %tmp.masked = and i32 %tmp, 2147483648        ; <uint> [#uses=1]
    %tmp11 = or i32 %tmp1415, %tmp.masked         ; <uint> [#uses=1]
    %tmp12 = and i32 %tmp9, 2147483647            ; <uint> [#uses=1]
    %tmp13 = or i32 %tmp12, %tmp11                ; <uint> [#uses=1]
    store i32 %tmp13, i32* %tmp8
    ret void
}

We emit:

_foo:
    lwz r2, 0(r3)
    slwi r4, r2, 1
    or r4, r4, r2
    rlwimi r2, r4, 0, 0, 0
    stw r2, 0(r3)
    blr

We could collapse a bunch of those ORs and ANDs and generate the following
equivalent code:

_foo:
    lwz r2, 0(r3)
    rlwinm r4, r2, 1, 0, 0
    or r2, r2, r4
    stw r2, 0(r3)
    blr

===-------------------------------------------------------------------------===

We compile:

unsigned test6(unsigned x) {
  return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
}

into:

_test6:
    lis r2, 255
    rlwinm r3, r3, 16, 0, 31
    ori r2, r2, 255
    and r3, r3, r2
    blr

GCC gets it down to:

_test6:
    rlwinm r0,r3,16,8,15
    rlwinm r3,r3,16,24,31
    or r3,r3,r0
    blr

===-------------------------------------------------------------------------===

Consider a function like this:

float foo(float X) { return X + 1234.4123f; }

The FP constant ends up in the constant pool, so we need a PIC base register to
address it, which is obtained through LR.  This ends up producing code like
this:

_foo:
.LBB_foo_0:     ; entry
    mflr r11
*** stw r11, 8(r1)
    bl "L00000$pb"
"L00000$pb":
    mflr r2
    addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
    lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
    fadds f1, f1, f0
*** lwz r11, 8(r1)
    mtlr r11
    blr

This is functional, but there is no reason to spill the LR register all the way
to the stack (the two marked instrs): spilling it to a GPR is quite enough.

Implementing this will require some codegen improvements.  Nate writes:

"So basically what we need to support the "no stack frame save and restore" is a
generalization of the LR optimization to "callee-save regs".

Currently, we have LR marked as a callee-save reg.  The register allocator sees
that it's callee save, and spills it directly to the stack.

Ideally, something like this would happen:

LR would be in a separate register class from the GPRs.  The class of LR would
be marked "unspillable".  When the register allocator came across an unspillable
reg, it would ask "what is the best class to copy this into that I *can* spill"
If it gets a class back, which it will in this case (the gprs), it grabs a free
register of that class.  If it is then later necessary to spill that reg, so be
it.

===-------------------------------------------------------------------------===

We compile this:
int test(_Bool X) {
  return X ? 524288 : 0;
}

to:
_test:
    cmplwi cr0, r3, 0
    lis r2, 8
    li r3, 0
    beq cr0, LBB1_2 ;entry
LBB1_1: ;entry
    mr r3, r2
LBB1_2: ;entry
    blr

instead of:
_test:
    addic r2,r3,-1
    subfe r0,r2,r3
    slwi r3,r0,19
    blr

This sort of thing occurs a lot due to globalopt.

===-------------------------------------------------------------------------===

We compile:

define i32 @bar(i32 %x) nounwind readnone ssp {
entry:
  %0 = icmp eq i32 %x, 0           ; <i1> [#uses=1]
  %neg = sext i1 %0 to i32         ; <i32> [#uses=1]
  ret i32 %neg
}

to:

_bar:
    cntlzw r2, r3
    slwi r2, r2, 26
    srawi r3, r2, 31
    blr

it would be better to produce:

_bar:
    addic r3,r3,-1
    subfe r3,r3,r3
    blr
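
The addic/subfe pair works because the carry out of x + 0xffffffff is set
exactly when x is nonzero.  A host-side C sketch of that reasoning (hypothetical
function, not backend code; assumes the usual two's complement conversion):

#include <assert.h>
#include <stdint.h>

static int32_t sext_is_zero(uint32_t x) {
  uint64_t sum = (uint64_t)x + 0xffffffffu;  /* addic x, -1             */
  uint32_t ca  = (uint32_t)(sum >> 32);      /* carry out: 1 iff x != 0 */
  return (int32_t)(~x + x + ca);             /* subfe: always -1 + ca   */
}

int main(void) {
  assert(sext_is_zero(0) == -1);
  assert(sext_is_zero(42) == 0);
  return 0;
}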

===-------------------------------------------------------------------------===

We currently compile 32-bit bswap:

declare i32 @llvm.bswap.i32(i32 %A)
define i32 @test(i32 %A) {
    %B = call i32 @llvm.bswap.i32(i32 %A)
    ret i32 %B
}

to:

_test:
    rlwinm r2, r3, 24, 16, 23
    slwi r4, r3, 24
    rlwimi r2, r3, 8, 24, 31
    rlwimi r4, r3, 8, 8, 15
    rlwimi r4, r2, 0, 16, 31
    mr r3, r4
    blr

it would be more efficient to produce:

_foo:   mr r0,r3
    rlwinm r3,r3,8,0xffffffff
    rlwimi r3,r0,24,0,7
    rlwimi r3,r0,24,16,23
    blr

===-------------------------------------------------------------------------===

test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:

__ZNK4llvm5APInt17countLeadingZerosEv:
    ld r2, 0(r3)
    cntlzd r2, r2
    or r2, r2, r2     <<-- silly.
    addi r3, r2, -64
    blr

The dead or is a 'truncate' from 64- to 32-bits.

===-------------------------------------------------------------------------===

We generate horrible ppc code for this:

#define N  2000000
double   a[N],c[N];
void simpleloop() {
  int j;
  for (j=0; j<N; j++)
    c[j] = a[j];
}

LBB1_1: ;bb
    lfdx f0, r3, r4
    addi r5, r5, 1      ;; Extra IV for the exit value compare.
    stfdx f0, r2, r4
    addi r4, r4, 8

    xoris r6, r5, 30    ;; This is due to a large immediate.
    cmplwi cr0, r6, 33920
    bne cr0, LBB1_1

//===---------------------------------------------------------------------===//

This:
    #include <algorithm>
    inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
    { return std::make_pair(a + b, a + b < a); }
    bool no_overflow(unsigned a, unsigned b)
    { return !full_add(a, b).second; }

Should compile to:

__Z11no_overflowjj:
    add r4,r3,r4
    subfc r3,r3,r4
    li r3,0
    adde r3,r3,r3
    blr

(or better) not:

__Z11no_overflowjj:
    add r2, r4, r3
    cmplw cr7, r2, r3
    mfcr r2
    rlwinm r2, r2, 29, 31, 31
    xori r3, r2, 1
    blr

//===---------------------------------------------------------------------===//

We compile some FP comparisons into an mfcr with two rlwinms and an or.  For
example:
#include <math.h>
int test(double x, double y) { return islessequal(x, y);}
int test2(double x, double y) { return islessgreater(x, y);}
int test3(double x, double y) { return !islessequal(x, y);}

Compiles into (all three are similar, but the bits differ):

_test:
    fcmpu cr7, f1, f2
    mfcr r2
    rlwinm r3, r2, 29, 31, 31
    rlwinm r2, r2, 31, 31, 31
    or r3, r2, r3
    blr

GCC compiles this into:

_test:
    fcmpu cr7,f1,f2
    cror 30,28,30
    mfcr r3
    rlwinm r3,r3,31,1
    blr

which is more efficient and can use mfocrf.  See PR642 for some more context.

//===---------------------------------------------------------------------===//

void foo(float *data, float d) {
   long i;
   for (i = 0; i < 8000; i++)
      data[i] = d;
}
void foo2(float *data, float d) {
   long i;
   data--;
   for (i = 0; i < 8000; i++) {
      data[1] = d;
      data++;
   }
}

These compile to:

_foo:
    li r2, 0
LBB1_1: ; bb
    addi r4, r2, 4
    stfsx f1, r3, r2
    cmplwi cr0, r4, 32000
    mr r2, r4
    bne cr0, LBB1_1 ; bb
    blr
_foo2:
    li r2, 0
LBB2_1: ; bb
    addi r4, r2, 4
    stfsx f1, r3, r2
    cmplwi cr0, r4, 32000
    mr r2, r4
    bne cr0, LBB2_1 ; bb
    blr

The 'mr' could be eliminated by folding the add into the cmp better.

//===---------------------------------------------------------------------===//
Codegen for the following (low-probability) case deteriorated considerably
when the correctness fixes for unordered comparisons went in (PR 642, 58871).
It should be possible to recover the code quality described in the comments.

; RUN: llvm-as < %s | llc -march=ppc32 | grep or | count 3
; This should produce one 'or' or 'cror' instruction per function.

; RUN: llvm-as < %s | llc -march=ppc32 | grep mfcr | count 3
; PR2964

define i32 @test(double %x, double %y) nounwind {
entry:
    %tmp3 = fcmp ole double %x, %y        ; <i1> [#uses=1]
    %tmp345 = zext i1 %tmp3 to i32        ; <i32> [#uses=1]
    ret i32 %tmp345
}

define i32 @test2(double %x, double %y) nounwind {
entry:
    %tmp3 = fcmp one double %x, %y        ; <i1> [#uses=1]
    %tmp345 = zext i1 %tmp3 to i32        ; <i32> [#uses=1]
    ret i32 %tmp345
}

define i32 @test3(double %x, double %y) nounwind {
entry:
    %tmp3 = fcmp ugt double %x, %y        ; <i1> [#uses=1]
    %tmp34 = zext i1 %tmp3 to i32         ; <i32> [#uses=1]
    ret i32 %tmp34
}
//===----------------------------------------------------------------------===//
; RUN: llvm-as < %s | llc -march=ppc32 | not grep fneg

; This could generate FSEL with appropriate flags (FSEL is not IEEE-safe, and
; should not be generated except with -enable-finite-only-fp-math or the like).
; With the correctness fixes for PR642 (58871) LowerSELECT_CC would need to
; recognize a more elaborate tree than a simple SETxx.

define double @test_FNEG_sel(double %A, double %B, double %C) {
    %D = fsub double -0.000000e+00, %A            ; <double> [#uses=1]
    %Cond = fcmp ugt double %D, -0.000000e+00     ; <i1> [#uses=1]
    %E = select i1 %Cond, double %B, double %C    ; <double> [#uses=1]
    ret double %E
}

//===----------------------------------------------------------------------===//
The save/restore sequence for CR in prolog/epilog is terrible:
- Each CR subreg is saved individually, rather than doing one save as a unit.
- On Darwin, the save is done after the decrement of SP, which means the offset
from SP of the save slot can be too big for a store instruction, which means we
need an additional register (currently hacked in 96015+96020; the solution there
is correct, but poor).
- On SVR4 the same thing can happen, and I don't think saving before the SP
decrement is safe on that target, as there is no red zone.  This is currently
broken AFAIK, although it's not a target I can exercise.
The following demonstrates the problem:
extern void bar(char *p);
void foo() {
  char x[100000];
  bar(x);
  __asm__("" ::: "cr2");
}