/*
Copyright (c) 2011, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
 * Syntax: GNU as, AT&T operand order (op src, dst), preprocessed with cpp.
 * Target: 32-bit x86 with SSE2; arguments are read from the stack.
 *
 * Every helper macro below is guarded with #ifndef so that a build
 * environment which already provides it keeps its own definition.
 */

/* L(label): file-local label (".L" prefix keeps it out of the symbol table). */
#ifndef L
# define L(label)	.L##label
#endif

/* Thin wrappers around the assembler's DWARF call-frame directives. */
#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

/* ENTRY(name): open a global, 16-byte aligned function with unwind info. */
#ifndef ENTRY
# define ENTRY(name)	\
	.type name, @function;	\
	.globl name;	\
	.p2align 4;	\
name:	\
	cfi_startproc
#endif

/* END(name): close the unwind region and record the symbol size. */
#ifndef END
# define END(name)	\
	cfi_endproc;	\
	.size name, .-name
#endif

/* Unwind bookkeeping for one 4-byte register save (no code emitted). */
#define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

/* Unwind bookkeeping for the matching 4-byte restore (no code emitted). */
#define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

/* pushl/popl paired with the unwind annotations above. */
#define PUSH(REG) pushl REG; CFI_PUSH (REG)
#define POP(REG) popl REG; CFI_POP (REG)

/* Argument offsets from %esp at entry (return address sits at 0). */
#define PARMS 4
#define STR1 PARMS
#define STR2 STR1+4
#define LEN STR2+4

/*
 * memrchr -- locate the last occurrence of a byte in a memory area.
 *
 * Presumed C prototype (the usual libc one; confirm against the header
 * this build exports):
 *	void *memrchr(const void *s, int c, size_t n);
 *
 * In:	STR1(%esp) = s
 *	STR2(%esp) = c   (only the low byte takes part in the comparison)
 *	LEN(%esp)  = n
 * Out:	%eax = address of the highest-addressed byte in [s, s + n) that
 *	equals the low byte of c, or 0 if there is none (n == 0 included).
 * Clobbers: %eax, %ecx, %edx, %xmm0-%xmm4, flags.
 *	%edi is used only on the short-buffer path and is saved with
 *	PUSH and restored with POP there.
 *
 * Register roles on the main (n > 16) path:
 *	%xmm1	needle byte replicated into all 16 byte lanes
 *	%ecx	base of the 16-byte chunk / 64-byte window being examined
 *	%edx	byte budget: how many buffer bytes remain at or below the
 *		current window (see the per-label notes for the exact bias)
 *	%eax	pmovmskb result: bit i set <=> byte i of the chunk matched
 *
 * Strategy:
 *   1. n == 0 returns 0; n <= 16 is handled by L(length_less16) with one
 *      or two aligned 16-byte compares whose masks are trimmed to the
 *      buffer.
 *   2. Otherwise the last 16 bytes are checked with an unaligned load,
 *      then the buffer is walked downwards in 64-byte windows: two
 *      unrolled windows on a 16-byte aligned base, then a loop on a
 *      64-byte aligned base.
 *   3. L(exit_loop) handles the final window, which may extend below s;
 *      matches in chunks that can straddle s leave through the "_1"
 *      exits, which reject addresses below s.
 */
	.text
ENTRY (memrchr)
	mov	STR1(%esp), %ecx		/* %ecx = s */
	movd	STR2(%esp), %xmm1		/* low dword of %xmm1 = c */
	mov	LEN(%esp), %edx			/* %edx = n */

	test	%edx, %edx
	jz	L(return_null)			/* empty buffer */
	sub	$16, %edx			/* %edx = n - 16 */
	jbe	L(length_less16)		/* n <= 16 */

	/* Broadcast the needle byte (interleaved with the pointer math):
	   two punpcklbw fill the low dword with it, pshufd copies that
	   dword to the other three.  */
	punpcklbw %xmm1, %xmm1
	add	%edx, %ecx			/* %ecx = s + n - 16 */
	punpcklbw %xmm1, %xmm1

	/* Last 16 bytes of the buffer, arbitrary alignment.  */
	movdqu	(%ecx), %xmm0
	pshufd	$0, %xmm1, %xmm1
	pcmpeqb	%xmm1, %xmm0

	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(exit_dispatch)

	/* Step down to the next 64-byte window.  Here %edx counts the
	   buffer bytes below %ecx + 64.  */
	sub	$64, %ecx
	mov	%ecx, %eax
	and	$15, %eax			/* misalignment of the window */
	jz	L(loop_prolog)

	/* Round the window base up to a 16-byte boundary.  The base moves
	   up by 16 - %eax, so the budget grows by the same amount; the
	   bytes this re-covers were already checked above.  */
	add	$16, %ecx
	add	$16, %edx
	and	$-16, %ecx
	sub	%eax, %edx

	.p2align 4
/* Loop start on aligned string.  %ecx is 16-byte aligned from here on,
   which is what allows movdqa.  */
L(loop_prolog):
	sub	$64, %edx			/* budget left below this window */
	jbe	L(exit_loop)			/* window is the last one */

	/* Full window: scan its four chunks from the top down so the first
	   hit is the last occurrence.  */
	movdqa	48(%ecx), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%ecx), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%ecx), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	(%ecx), %xmm4
	pcmpeqb	%xmm1, %xmm4
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(exit_dispatch)

	/* Second unrolled window, same shape as the first.  */
	sub	$64, %ecx
	sub	$64, %edx
	jbe	L(exit_loop)

	movdqa	48(%ecx), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%ecx), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%ecx), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	(%ecx), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(exit_dispatch)

	/* Move the window base onto a 64-byte boundary for the main loop.  */
	mov	%ecx, %eax
	and	$63, %eax			/* misalignment within 64 */
	test	%eax, %eax
	jz	L(align64_loop)

	/* Round up to the next 64-byte boundary; the loop's leading
	   "sub $64" then lands on the aligned window that overlaps the one
	   just scanned.  The budget follows the base: +64 - %eax.  */
	add	$64, %ecx
	add	$64, %edx
	and	$-64, %ecx
	sub	%eax, %edx

	.p2align 4
/* Main loop: one 64-byte aligned window per iteration, walking down.  */
L(align64_loop):
	sub	$64, %ecx
	sub	$64, %edx
	jbe	L(exit_loop)			/* last (possibly partial) window */

	movdqa	(%ecx), %xmm0
	movdqa	16(%ecx), %xmm2
	movdqa	32(%ecx), %xmm3
	movdqa	48(%ecx), %xmm4

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm1, %xmm4

	/* Compare results are 0x00/0xff per byte, so unsigned max acts as
	   OR: one mask says whether any of the 64 bytes matched.  */
	pmaxub	%xmm3, %xmm0
	pmaxub	%xmm4, %xmm2
	pmaxub	%xmm0, %xmm2
	pmovmskb %xmm2, %eax

	test	%eax, %eax
	jz	L(align64_loop)

	/* A match exists in this window; find the highest chunk with one.
	   %xmm4 and %xmm3 still hold the compares for chunks 48 and 32.  */
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches48)

	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	/* pmaxub overwrote the compares for chunks 16 and 0, so redo them.
	   The second pcmpeqb destroys the needle in %xmm1, which is fine
	   because every path from here returns.  */
	movdqa	16(%ecx), %xmm2

	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	(%ecx), %xmm1

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	/* Match is in chunk 0: same ladder as L(exit_dispatch), inlined.  */
	pmovmskb %xmm1, %eax
	test	%ah, %ah
	jnz	L(exit_dispatch_high)
	mov	%al, %dl
	and	$15 << 4, %dl
	jnz	L(exit_dispatch_8)
	test	$0x08, %al
	jnz	L(exit_4)
	test	$0x04, %al
	jnz	L(exit_3)
	test	$0x02, %al
	jnz	L(exit_2)
	mov	%ecx, %eax
	ret

	.p2align 4
/* Final window.  After the add, %edx is the number of buffer bytes in
   the window [%ecx, %ecx + 64); they occupy its top end, so chunks are
   only scanned while they can still contain buffer bytes.  */
L(exit_loop):
	add	$64, %edx
	cmp	$32, %edx
	jbe	L(exit_loop_32)

	/* More than 32 bytes left: chunks 48 and 32 lie wholly inside the
	   buffer, so the unbounded exits are safe for them.  */
	movdqa	48(%ecx), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%ecx), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	/* Chunk 16 may start below s: use the bounded exit.  */
	movdqa	16(%ecx), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16_1)
	cmp	$48, %edx
	jbe	L(return_null)			/* chunk 0 holds no buffer bytes */

	pcmpeqb	(%ecx), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches0_1)
	xor	%eax, %eax
	ret

	.p2align 4
/* Final window with at most 32 buffer bytes: only chunks 48 and 32 can
   contain any, and either may start below s.  */
L(exit_loop_32):
	movdqa	48(%ecx), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48_1)
	cmp	$16, %edx
	jbe	L(return_null)			/* chunk 32 holds no buffer bytes */

	pcmpeqb	32(%ecx), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches32_1)
	xor	%eax, %eax
	ret

/*
 * Unbounded exits.  On entry %eax holds a non-zero 16-bit match mask and
 * %ecx (after the lea, where present) the address of the chunk it
 * belongs to.  The ladder finds the highest set bit: high byte first,
 * then the upper nibble of the selected byte, then single bits; bit 0
 * is the fall-through case.  %dl / %dh serve as scratch.
 */
	.p2align 4
L(matches16):
	lea	16(%ecx), %ecx
	test	%ah, %ah
	jnz	L(exit_dispatch_high)
	mov	%al, %dl
	and	$15 << 4, %dl
	jnz	L(exit_dispatch_8)
	test	$0x08, %al
	jnz	L(exit_4)
	test	$0x04, %al
	jnz	L(exit_3)
	test	$0x02, %al
	jnz	L(exit_2)
	mov	%ecx, %eax
	ret

	.p2align 4
L(matches32):
	lea	32(%ecx), %ecx
	test	%ah, %ah
	jnz	L(exit_dispatch_high)
	mov	%al, %dl
	and	$15 << 4, %dl
	jnz	L(exit_dispatch_8)
	test	$0x08, %al
	jnz	L(exit_4)
	test	$0x04, %al
	jnz	L(exit_3)
	test	$0x02, %al
	jnz	L(exit_2)
	mov	%ecx, %eax
	ret

	.p2align 4
L(matches48):
	lea	48(%ecx), %ecx
	/* Falls through into L(exit_dispatch).  */

	.p2align 4
L(exit_dispatch):
	test	%ah, %ah			/* any match in bytes 8..15? */
	jnz	L(exit_dispatch_high)
	mov	%al, %dl
	and	$15 << 4, %dl			/* any match in bytes 4..7? */
	jnz	L(exit_dispatch_8)
	test	$0x08, %al
	jnz	L(exit_4)
	test	$0x04, %al
	jnz	L(exit_3)
	test	$0x02, %al
	jnz	L(exit_2)
	mov	%ecx, %eax			/* only bit 0 left: byte 0 */
	ret

	.p2align 4
/* Highest match is in bytes 4..7.  */
L(exit_dispatch_8):
	test	$0x80, %al
	jnz	L(exit_8)
	test	$0x40, %al
	jnz	L(exit_7)
	test	$0x20, %al
	jnz	L(exit_6)
	lea	4(%ecx), %eax
	ret

	.p2align 4
/* Highest match is in bytes 8..15.  */
L(exit_dispatch_high):
	mov	%ah, %dh
	and	$15 << 4, %dh			/* any match in bytes 12..15? */
	jnz	L(exit_dispatch_high_8)
	test	$0x08, %ah
	jnz	L(exit_12)
	test	$0x04, %ah
	jnz	L(exit_11)
	test	$0x02, %ah
	jnz	L(exit_10)
	lea	8(%ecx), %eax
	ret

	.p2align 4
/* Highest match is in bytes 12..15.  */
L(exit_dispatch_high_8):
	test	$0x80, %ah
	jnz	L(exit_16)
	test	$0x40, %ah
	jnz	L(exit_15)
	test	$0x20, %ah
	jnz	L(exit_14)
	lea	12(%ecx), %eax
	ret

/* L(exit_N): return the address of the N-th byte (1-based) of the chunk
   at %ecx, i.e. %ecx + N - 1.  */
	.p2align 4
L(exit_2):
	lea	1(%ecx), %eax
	ret

	.p2align 4
L(exit_3):
	lea	2(%ecx), %eax
	ret

	.p2align 4
L(exit_4):
	lea	3(%ecx), %eax
	ret

	.p2align 4
L(exit_6):
	lea	5(%ecx), %eax
	ret

	.p2align 4
L(exit_7):
	lea	6(%ecx), %eax
	ret

	.p2align 4
L(exit_8):
	lea	7(%ecx), %eax
	ret

	.p2align 4
L(exit_10):
	lea	9(%ecx), %eax
	ret

	.p2align 4
L(exit_11):
	lea	10(%ecx), %eax
	ret

	.p2align 4
L(exit_12):
	lea	11(%ecx), %eax
	ret

	.p2align 4
L(exit_14):
	lea	13(%ecx), %eax
	ret

	.p2align 4
L(exit_15):
	lea	14(%ecx), %eax
	ret

	.p2align 4
L(exit_16):
	lea	15(%ecx), %eax
	ret

/*
 * Bounded exits, used only from L(exit_loop) / L(exit_loop_32) for
 * chunks that may begin below s.  Each entry biases %edx (the number of
 * buffer bytes in the final window) by the chunk position so that, for
 * a match at byte index k of the chunk, %edx + k < 0 exactly when that
 * byte lies below s.  The ladder is the same as above, but the scratch
 * copies go to %ah / %al because %edx must survive.  Only the highest
 * match needs checking: if it is below s, so is every other one.
 */
	.p2align 4
L(matches0_1):
	lea	-64(%edx), %edx

	test	%ah, %ah
	jnz	L(exit_dispatch_1_high)
	mov	%al, %ah
	and	$15 << 4, %ah
	jnz	L(exit_dispatch_1_8)
	test	$0x08, %al
	jnz	L(exit_1_4)
	test	$0x04, %al
	jnz	L(exit_1_3)
	test	$0x02, %al
	jnz	L(exit_1_2)

	add	$0, %edx			/* k = 0: just set the flags */
	jl	L(return_null)
	mov	%ecx, %eax
	ret

	.p2align 4
L(matches16_1):
	lea	-48(%edx), %edx
	lea	16(%ecx), %ecx

	test	%ah, %ah
	jnz	L(exit_dispatch_1_high)
	mov	%al, %ah
	and	$15 << 4, %ah
	jnz	L(exit_dispatch_1_8)
	test	$0x08, %al
	jnz	L(exit_1_4)
	test	$0x04, %al
	jnz	L(exit_1_3)
	test	$0x02, %al
	jnz	L(exit_1_2)

	add	$0, %edx
	jl	L(return_null)
	mov	%ecx, %eax
	ret

	.p2align 4
L(matches32_1):
	lea	-32(%edx), %edx
	lea	32(%ecx), %ecx

	test	%ah, %ah
	jnz	L(exit_dispatch_1_high)
	mov	%al, %ah
	and	$15 << 4, %ah
	jnz	L(exit_dispatch_1_8)
	test	$0x08, %al
	jnz	L(exit_1_4)
	test	$0x04, %al
	jnz	L(exit_1_3)
	test	$0x02, %al
	jnz	L(exit_1_2)

	add	$0, %edx
	jl	L(return_null)
	mov	%ecx, %eax
	ret

	.p2align 4
L(matches48_1):
	lea	-16(%edx), %edx
	lea	48(%ecx), %ecx
	/* Falls through into L(exit_dispatch_1).  */

	.p2align 4
L(exit_dispatch_1):
	test	%ah, %ah
	jnz	L(exit_dispatch_1_high)
	mov	%al, %ah
	and	$15 << 4, %ah
	jnz	L(exit_dispatch_1_8)
	test	$0x08, %al
	jnz	L(exit_1_4)
	test	$0x04, %al
	jnz	L(exit_1_3)
	test	$0x02, %al
	jnz	L(exit_1_2)

	add	$0, %edx
	jl	L(return_null)
	mov	%ecx, %eax
	ret

	.p2align 4
/* Bounded: highest match is in bytes 4..7.  */
L(exit_dispatch_1_8):
	test	$0x80, %al
	jnz	L(exit_1_8)
	test	$0x40, %al
	jnz	L(exit_1_7)
	test	$0x20, %al
	jnz	L(exit_1_6)

	add	$4, %edx
	jl	L(return_null)
	lea	4(%ecx), %eax
	ret

	.p2align 4
/* Bounded: highest match is in bytes 8..15.  */
L(exit_dispatch_1_high):
	mov	%ah, %al
	and	$15 << 4, %al
	jnz	L(exit_dispatch_1_high_8)
	test	$0x08, %ah
	jnz	L(exit_1_12)
	test	$0x04, %ah
	jnz	L(exit_1_11)
	test	$0x02, %ah
	jnz	L(exit_1_10)

	add	$8, %edx
	jl	L(return_null)
	lea	8(%ecx), %eax
	ret

	.p2align 4
/* Bounded: highest match is in bytes 12..15.  */
L(exit_dispatch_1_high_8):
	test	$0x80, %ah
	jnz	L(exit_1_16)
	test	$0x40, %ah
	jnz	L(exit_1_15)
	test	$0x20, %ah
	jnz	L(exit_1_14)

	add	$12, %edx
	jl	L(return_null)
	lea	12(%ecx), %eax
	ret

/* L(exit_1_N): bounded form of L(exit_N).  Adds the byte index N - 1 to
   the biased count and returns 0 if the result is negative (match below
   s), else %ecx + N - 1.  */
	.p2align 4
L(exit_1_2):
	add	$1, %edx
	jl	L(return_null)
	lea	1(%ecx), %eax
	ret

	.p2align 4
L(exit_1_3):
	add	$2, %edx
	jl	L(return_null)
	lea	2(%ecx), %eax
	ret

	.p2align 4
L(exit_1_4):
	add	$3, %edx
	jl	L(return_null)
	lea	3(%ecx), %eax
	ret

	.p2align 4
L(exit_1_6):
	add	$5, %edx
	jl	L(return_null)
	lea	5(%ecx), %eax
	ret

	.p2align 4
L(exit_1_7):
	add	$6, %edx
	jl	L(return_null)
	lea	6(%ecx), %eax
	ret

	.p2align 4
L(exit_1_8):
	add	$7, %edx
	jl	L(return_null)
	lea	7(%ecx), %eax
	ret

	.p2align 4
L(exit_1_10):
	add	$9, %edx
	jl	L(return_null)
	lea	9(%ecx), %eax
	ret

	.p2align 4
L(exit_1_11):
	add	$10, %edx
	jl	L(return_null)
	lea	10(%ecx), %eax
	ret

	.p2align 4
L(exit_1_12):
	add	$11, %edx
	jl	L(return_null)
	lea	11(%ecx), %eax
	ret

	.p2align 4
L(exit_1_14):
	add	$13, %edx
	jl	L(return_null)
	lea	13(%ecx), %eax
	ret

	.p2align 4
L(exit_1_15):
	add	$14, %edx
	jl	L(return_null)
	lea	14(%ecx), %eax
	ret

	.p2align 4
L(exit_1_16):
	add	$15, %edx
	jl	L(return_null)
	lea	15(%ecx), %eax
	ret

	.p2align 4
/* No match; nothing is pushed on the paths that reach this label.  */
L(return_null):
	xor	%eax, %eax
	ret

	.p2align 4
/* Short buffer (n <= 16) whose start is 16-byte aligned.
   In: %eax = s, %edx = n, %xmm1 = broadcast needle.  */
L(length_less16_offset0):
	mov	%dl, %cl			/* shift count = n */
	pcmpeqb	(%eax), %xmm1			/* aligned compare at s */

	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx			/* %edx = (1 << n) - 1 */

	mov	%eax, %ecx			/* chunk base for the exit ladder */
	pmovmskb %xmm1, %eax

	and	%edx, %eax			/* drop matches at or beyond s + n */
	test	%eax, %eax
	jnz	L(exit_dispatch)

	xor	%eax, %eax
	ret

	.p2align 4
/* Short buffer (1 <= n <= 16).  Entered with %ecx = s, %edx = n - 16 and
   the needle not yet broadcast.  Only aligned 16-byte chunks are read;
   bits for bytes outside [s, s + n) are shifted or masked away.  */
L(length_less16):
	punpcklbw %xmm1, %xmm1
	add	$16, %edx			/* %edx = n again */
	punpcklbw %xmm1, %xmm1

	mov	%ecx, %eax			/* %eax = s */
	pshufd	$0, %xmm1, %xmm1		/* needle in all 16 lanes */

	and	$15, %ecx			/* %ecx = offset of s within its chunk */
	jz	L(length_less16_offset0)

	PUSH	(%edi)				/* %edi is used as mask scratch below */

	mov	%cl, %dh
	add	%dl, %dh			/* %dh = offset + n */
	and	$-16, %eax			/* %eax = aligned chunk containing s */

	sub	$16, %dh			/* %dh = bytes spilling into the next chunk */
	ja	L(length_less16_part2)		/* buffer spans two chunks */

	/* Whole buffer sits inside one aligned chunk.  */
	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edi

	sar	%cl, %edi			/* bit 0 now corresponds to s[0] */
	add	%ecx, %eax			/* %eax = s */
	mov	%dl, %cl			/* shift count = n */

	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx			/* %edx = (1 << n) - 1 */

	and	%edx, %edi			/* keep only s[0 .. n-1] */
	test	%edi, %edi
	jz	L(ret_null)

	bsr	%edi, %edi			/* index of the last match */
	add	%edi, %eax
	POP	(%edi)
	ret

	/* Unwind info only, no code: the label below is reached with %edi
	   still pushed, so re-assert the state the POP above undid.  */
	CFI_PUSH (%edi)

	.p2align 4
/* Buffer spans two aligned chunks.  In: %eax = lower chunk, %cl = offset
   of s in it, %dh = number of buffer bytes in the upper chunk,
   %edi pushed.  */
L(length_less16_part2):
	movdqa	16(%eax), %xmm2			/* upper chunk first (higher addresses) */
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %edi

	mov	%cl, %ch			/* park the offset in %ch */

	mov	%dh, %cl
	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx			/* mask for the buffer bytes in the upper chunk */

	and	%edx, %edi

	test	%edi, %edi
	jnz	L(length_less16_part2_return)

	/* Nothing in the upper part: examine the lower chunk from s on.  */
	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edi

	mov	%ch, %cl			/* shift count = offset */
	sar	%cl, %edi			/* discard bytes below s */
	test	%edi, %edi
	jz	L(ret_null)

	bsr	%edi, %edi			/* index of the last match, relative to s */
	add	%edi, %eax
	xor	%ch, %ch			/* %ecx = offset only */
	add	%ecx, %eax			/* chunk + offset + index */
	POP	(%edi)
	ret

	/* Unwind info only: %edi is still pushed at the next label.  */
	CFI_PUSH (%edi)

	.p2align 4
/* Match in the upper chunk: result = chunk + 16 + index of last match.  */
L(length_less16_part2_return):
	bsr	%edi, %edi
	lea	16(%eax, %edi), %eax
	POP	(%edi)
	ret

	/* Unwind info only: %edi is still pushed at the next label.  */
	CFI_PUSH (%edi)

	.p2align 4
/* No match on a path that pushed %edi.  */
L(ret_null):
	xor	%eax, %eax
	POP	(%edi)
	ret

END (memrchr)