/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "asm_support_x86.S"

/* Symbol name under which the routine below is emitted. */
#define MEMCMP  __memcmp16

/* int32_t memcmp16_compare(const uint16_t* s0, const uint16_t* s1, size_t count); */

/* L(name) produces an assembler-local label (.Lname). */
#ifndef L
# define L(label)    .L##label
#endif

/* Unwind annotations matching a 4-byte push / pop of REG. */
#define CFI_PUSH(REG) \
    CFI_ADJUST_CFA_OFFSET(4); \
    CFI_REL_OFFSET(REG, 0)

#define CFI_POP(REG) \
    CFI_ADJUST_CFA_OFFSET(-4); \
    CFI_RESTORE(REG)

/* Push / pop a register and keep the unwind information in sync. */
#define PUSH(REG)    pushl REG; CFI_PUSH (REG)
#define POP(REG)     popl REG; CFI_POP (REG)

/* Offsets of the stack arguments relative to %esp at function entry. */
#define PARMS        4
#define BLK1         PARMS
#define BLK2         BLK1+4
#define LEN          BLK2+4

/* Epilogue for paths that still have %ebx, %esi and %edi pushed. */
#define RETURN_END   POP (%edi); POP (%esi); POP (%ebx); ret

/*
 * RETURN: emit RETURN_END, then re-establish the remembered "three registers
 * pushed" CFI state (CFA = esp + 16) for the code that follows the ret.
 */
MACRO0(RETURN)
    RETURN_END
    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
    CFI_REMEMBER_STATE
END_MACRO

/*
 * __memcmp16 - compare two arrays of 16-bit units (x86-32, AT&T syntax).
 *
 * In:    BLK1(%esp) = s0, BLK2(%esp) = s1, LEN(%esp) = count in 16-bit units.
 * Out:   %eax = 0 if count == 0 or all units are equal, otherwise
 *        zero-extended s0[i] minus zero-extended s1[i] for the first
 *        differing unit i.
 * Saved: %ebx, %esi, %edi are pushed and popped on every path that uses them.
 * Uses:  %eax, %ecx, %edx, %xmm0-%xmm3, flags.
 * ISA:   uses palignr, i.e. needs SSSE3.
 *
 * Outline:
 *   - count is doubled into a byte length in %ecx.
 *   - Fewer than 48 bytes: both pointers are moved to the END of their
 *     buffers and a fall-through chain compares one 16-bit unit at a time
 *     at negative offsets (L(46bytes) ... L(2bytes)).
 *   - 48 bytes or more: the first 16 bytes are compared unaligned, s0 (%edi)
 *     is then rounded down to 16 bytes, and s1 (%esi) is read with aligned
 *     loads that are re-shifted by palignr (L(shr_N), N = s1 misalignment
 *     relative to s0). 32 bytes are compared per step; the remaining
 *     (< 48) bytes go through the same unit-by-unit tail chain.
 *
 * NOTE(review): the shift dispatch only tests even values 2..12 and sends
 * everything else to L(shr_14), so it presumably relies on s0 and s1 both
 * being 2-byte aligned - confirm against callers.
 */
DEFINE_FUNCTION MEMCMP
    movl       LEN(%esp), %ecx

    shl        $1, %ecx                 /* units -> bytes; ZF set when length is 0 */
    jz         L(zero)

    movl       BLK1(%esp), %eax
    cmp        $48, %ecx
    movl       BLK2(%esp), %edx         /* mov leaves the cmp flags intact */
    jae        L(48bytesormore)

    /* Short input: point %eax / %edx one past the end and use the tail chain. */
    PUSH       (%ebx)
    add        %ecx, %edx
    add        %ecx, %eax
    jmp        L(less48bytes)

    CFI_POP    (%ebx)

    .p2align 4
L(zero):
    xor        %eax, %eax
    ret

    .p2align 4
L(48bytesormore):
    PUSH       (%ebx)
    PUSH       (%esi)
    PUSH       (%edi)
    CFI_REMEMBER_STATE
    /* Compare the first 16 bytes with unaligned loads. */
    movdqu     (%eax), %xmm3
    movdqu     (%edx), %xmm0
    movl       %eax, %edi               /* %edi = s0 cursor */
    movl       %edx, %esi               /* %esi = s1 cursor */
    pcmpeqb    %xmm0, %xmm3
    pmovmskb   %xmm3, %edx              /* bit i set <=> byte i equal */
    lea        16(%edi), %edi

    sub        $0xffff, %edx            /* zero <=> all 16 bytes equal */
    lea        16(%esi), %esi           /* lea keeps the flags of the sub */
    jnz        L(less16bytes)

    /*
     * Round %edi down to a 16-byte boundary, move %esi back by the same
     * amount and add that amount to %ecx, so that after the "-48" applied
     * in L(shr_N) and the 32 bytes compared there %ecx is the number of
     * bytes still to compare.
     */
    mov        %edi, %edx
    and        $0xf, %edx
    xor        %edx, %edi
    sub        %edx, %esi
    add        %edx, %ecx
    /* %edx = misalignment of s1 relative to the aligned s0 cursor. */
    mov        %esi, %edx
    and        $0xf, %edx
    jz         L(shr_0)
    xor        %edx, %esi               /* round %esi down as well */

    /*
     * %edx is non-zero here (the zero case branched to L(shr_0) above and
     * the xor does not modify %edx), so no separate test for 0 is needed.
     */
    cmp        $2, %edx
    je         L(shr_2)
    cmp        $4, %edx
    je         L(shr_4)
    cmp        $6, %edx
    je         L(shr_6)
    cmp        $8, %edx
    je         L(shr_8)
    cmp        $10, %edx
    je         L(shr_10)
    cmp        $12, %edx
    je         L(shr_12)
    jmp        L(shr_14)

    /*
     * L(shr_0): both cursors are 16-byte aligned. %eax = 0 is the shift that
     * L(exit) adds back to %esi.
     */
    .p2align 4
L(shr_0):
    cmp        $80, %ecx
    jae        L(shr_0_gobble)
    lea        -48(%ecx), %ecx
    xor        %eax, %eax
    movaps     (%esi), %xmm1
    pcmpeqb    (%edi), %xmm1
    movaps     16(%esi), %xmm2
    pcmpeqb    16(%edi), %xmm2
    pand       %xmm1, %xmm2             /* combined equality of both halves */
    pmovmskb   %xmm2, %edx
    add        $32, %edi
    add        $32, %esi
    sub        $0xffff, %edx
    jnz        L(exit)

    /* 32 bytes equal: hand the remaining %ecx bytes to the tail chain. */
    lea        (%ecx, %edi,1), %eax     /* end of s0 */
    lea        (%ecx, %esi,1), %edx     /* end of s1 */
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_0_gobble):
    lea        -48(%ecx), %ecx
    movdqa     (%esi), %xmm0
    xor        %eax, %eax
    pcmpeqb    (%edi), %xmm0
    sub        $32, %ecx
    movdqa     16(%esi), %xmm2
    pcmpeqb    16(%edi), %xmm2
    /*
     * 32 bytes per iteration. "sub $32, %ecx" sets CF on borrow and the
     * following sbb folds it into %edx, so ZF after the sbb means "all 32
     * bytes equal and the length did not run out". The SSE and lea
     * instructions between sub, sbb and jz do not modify the flags.
     */
L(shr_0_gobble_loop):
    pand       %xmm0, %xmm2
    sub        $32, %ecx
    pmovmskb   %xmm2, %edx
    movdqa     %xmm0, %xmm1             /* keep first half for L(exit) */
    movdqa     32(%esi), %xmm0
    movdqa     48(%esi), %xmm2
    sbb        $0xffff, %edx
    pcmpeqb    32(%edi), %xmm0
    pcmpeqb    48(%edi), %xmm2
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    jz         L(shr_0_gobble_loop)

    pand       %xmm0, %xmm2
    cmp        $0, %ecx
    jge        L(shr_0_gobble_loop_next)
    inc        %edx                     /* undo the borrow taken by sbb */
    add        $32, %ecx
L(shr_0_gobble_loop_next):
    test       %edx, %edx
    jnz        L(exit)

    /* Check the 32 bytes that were already loaded for the next round. */
    pmovmskb   %xmm2, %edx
    movdqa     %xmm0, %xmm1
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)
    lea        (%ecx, %edi,1), %eax
    lea        (%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    /*
     * L(shr_2): s1 is 2 bytes past a 16-byte boundary relative to s0.
     * %esi is aligned; each 16-byte chunk of s1 is rebuilt from two aligned
     * loads with palignr $2. %eax keeps the shift so that L(exit) and the
     * tail hand-off can add it back to %esi.
     * L(shr_4) ... L(shr_14) below are the same code with a different shift.
     */
    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_2):
    cmp        $80, %ecx
    lea        -48(%ecx), %ecx          /* lea keeps the cmp flags for jae */
    mov        %edx, %eax
    jae        L(shr_2_gobble)

    movdqa     16(%esi), %xmm1
    movdqa     %xmm1, %xmm2
    palignr    $2, (%esi), %xmm1
    pcmpeqb    (%edi), %xmm1

    movdqa     32(%esi), %xmm3
    palignr    $2, %xmm2, %xmm3
    pcmpeqb    16(%edi), %xmm3

    pand       %xmm1, %xmm3
    pmovmskb   %xmm3, %edx
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)
    lea        (%ecx, %edi,1), %eax
    lea        2(%ecx, %esi,1), %edx    /* +2 restores the real s1 address */
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_2_gobble):
    sub        $32, %ecx
    movdqa     16(%esi), %xmm0
    palignr    $2, (%esi), %xmm0
    pcmpeqb    (%edi), %xmm0

    movdqa     32(%esi), %xmm3
    palignr    $2, 16(%esi), %xmm3
    pcmpeqb    16(%edi), %xmm3

    /* Same sub / sbb carry scheme as L(shr_0_gobble_loop). */
L(shr_2_gobble_loop):
    pand       %xmm0, %xmm3
    sub        $32, %ecx
    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1

    movdqa     64(%esi), %xmm3
    palignr    $2, 48(%esi), %xmm3
    sbb        $0xffff, %edx
    movdqa     48(%esi), %xmm0
    palignr    $2, 32(%esi), %xmm0
    pcmpeqb    32(%edi), %xmm0
    lea        32(%esi), %esi
    pcmpeqb    48(%edi), %xmm3

    lea        32(%edi), %edi
    jz         L(shr_2_gobble_loop)
    pand       %xmm0, %xmm3

    cmp        $0, %ecx
    jge        L(shr_2_gobble_next)
    inc        %edx
    add        $32, %ecx
L(shr_2_gobble_next):
    test       %edx, %edx
    jnz        L(exit)

    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)

    lea        (%ecx, %edi,1), %eax
    lea        2(%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    /* Shift 4: see L(shr_2). */
    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_4):
    cmp        $80, %ecx
    lea        -48(%ecx), %ecx
    mov        %edx, %eax
    jae        L(shr_4_gobble)

    movdqa     16(%esi), %xmm1
    movdqa     %xmm1, %xmm2
    palignr    $4, (%esi), %xmm1
    pcmpeqb    (%edi), %xmm1

    movdqa     32(%esi), %xmm3
    palignr    $4, %xmm2, %xmm3
    pcmpeqb    16(%edi), %xmm3

    pand       %xmm1, %xmm3
    pmovmskb   %xmm3, %edx
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)
    lea        (%ecx, %edi,1), %eax
    lea        4(%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_4_gobble):
    sub        $32, %ecx
    movdqa     16(%esi), %xmm0
    palignr    $4, (%esi), %xmm0
    pcmpeqb    (%edi), %xmm0

    movdqa     32(%esi), %xmm3
    palignr    $4, 16(%esi), %xmm3
    pcmpeqb    16(%edi), %xmm3

L(shr_4_gobble_loop):
    pand       %xmm0, %xmm3
    sub        $32, %ecx
    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1

    movdqa     64(%esi), %xmm3
    palignr    $4, 48(%esi), %xmm3
    sbb        $0xffff, %edx
    movdqa     48(%esi), %xmm0
    palignr    $4, 32(%esi), %xmm0
    pcmpeqb    32(%edi), %xmm0
    lea        32(%esi), %esi
    pcmpeqb    48(%edi), %xmm3

    lea        32(%edi), %edi
    jz         L(shr_4_gobble_loop)
    pand       %xmm0, %xmm3

    cmp        $0, %ecx
    jge        L(shr_4_gobble_next)
    inc        %edx
    add        $32, %ecx
L(shr_4_gobble_next):
    test       %edx, %edx
    jnz        L(exit)

    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)

    lea        (%ecx, %edi,1), %eax
    lea        4(%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    /* Shift 6: see L(shr_2). */
    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_6):
    cmp        $80, %ecx
    lea        -48(%ecx), %ecx
    mov        %edx, %eax
    jae        L(shr_6_gobble)

    movdqa     16(%esi), %xmm1
    movdqa     %xmm1, %xmm2
    palignr    $6, (%esi), %xmm1
    pcmpeqb    (%edi), %xmm1

    movdqa     32(%esi), %xmm3
    palignr    $6, %xmm2, %xmm3
    pcmpeqb    16(%edi), %xmm3

    pand       %xmm1, %xmm3
    pmovmskb   %xmm3, %edx
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)
    lea        (%ecx, %edi,1), %eax
    lea        6(%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_6_gobble):
    sub        $32, %ecx
    movdqa     16(%esi), %xmm0
    palignr    $6, (%esi), %xmm0
    pcmpeqb    (%edi), %xmm0

    movdqa     32(%esi), %xmm3
    palignr    $6, 16(%esi), %xmm3
    pcmpeqb    16(%edi), %xmm3

L(shr_6_gobble_loop):
    pand       %xmm0, %xmm3
    sub        $32, %ecx
    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1

    movdqa     64(%esi), %xmm3
    palignr    $6, 48(%esi), %xmm3
    sbb        $0xffff, %edx
    movdqa     48(%esi), %xmm0
    palignr    $6, 32(%esi), %xmm0
    pcmpeqb    32(%edi), %xmm0
    lea        32(%esi), %esi
    pcmpeqb    48(%edi), %xmm3

    lea        32(%edi), %edi
    jz         L(shr_6_gobble_loop)
    pand       %xmm0, %xmm3

    cmp        $0, %ecx
    jge        L(shr_6_gobble_next)
    inc        %edx
    add        $32, %ecx
L(shr_6_gobble_next):
    test       %edx, %edx
    jnz        L(exit)

    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)

    lea        (%ecx, %edi,1), %eax
    lea        6(%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    /* Shift 8: see L(shr_2). */
    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_8):
    cmp        $80, %ecx
    lea        -48(%ecx), %ecx
    mov        %edx, %eax
    jae        L(shr_8_gobble)

    movdqa     16(%esi), %xmm1
    movdqa     %xmm1, %xmm2
    palignr    $8, (%esi), %xmm1
    pcmpeqb    (%edi), %xmm1

    movdqa     32(%esi), %xmm3
    palignr    $8, %xmm2, %xmm3
    pcmpeqb    16(%edi), %xmm3

    pand       %xmm1, %xmm3
    pmovmskb   %xmm3, %edx
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)
    lea        (%ecx, %edi,1), %eax
    lea        8(%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_8_gobble):
    sub        $32, %ecx
    movdqa     16(%esi), %xmm0
    palignr    $8, (%esi), %xmm0
    pcmpeqb    (%edi), %xmm0

    movdqa     32(%esi), %xmm3
    palignr    $8, 16(%esi), %xmm3
    pcmpeqb    16(%edi), %xmm3

L(shr_8_gobble_loop):
    pand       %xmm0, %xmm3
    sub        $32, %ecx
    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1

    movdqa     64(%esi), %xmm3
    palignr    $8, 48(%esi), %xmm3
    sbb        $0xffff, %edx
    movdqa     48(%esi), %xmm0
    palignr    $8, 32(%esi), %xmm0
    pcmpeqb    32(%edi), %xmm0
    lea        32(%esi), %esi
    pcmpeqb    48(%edi), %xmm3

    lea        32(%edi), %edi
    jz         L(shr_8_gobble_loop)
    pand       %xmm0, %xmm3

    cmp        $0, %ecx
    jge        L(shr_8_gobble_next)
    inc        %edx
    add        $32, %ecx
L(shr_8_gobble_next):
    test       %edx, %edx
    jnz        L(exit)

    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)

    lea        (%ecx, %edi,1), %eax
    lea        8(%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    /* Shift 10: see L(shr_2). */
    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_10):
    cmp        $80, %ecx
    lea        -48(%ecx), %ecx
    mov        %edx, %eax
    jae        L(shr_10_gobble)

    movdqa     16(%esi), %xmm1
    movdqa     %xmm1, %xmm2
    palignr    $10, (%esi), %xmm1
    pcmpeqb    (%edi), %xmm1

    movdqa     32(%esi), %xmm3
    palignr    $10, %xmm2, %xmm3
    pcmpeqb    16(%edi), %xmm3

    pand       %xmm1, %xmm3
    pmovmskb   %xmm3, %edx
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)
    lea        (%ecx, %edi,1), %eax
    lea        10(%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_10_gobble):
    sub        $32, %ecx
    movdqa     16(%esi), %xmm0
    palignr    $10, (%esi), %xmm0
    pcmpeqb    (%edi), %xmm0

    movdqa     32(%esi), %xmm3
    palignr    $10, 16(%esi), %xmm3
    pcmpeqb    16(%edi), %xmm3

L(shr_10_gobble_loop):
    pand       %xmm0, %xmm3
    sub        $32, %ecx
    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1

    movdqa     64(%esi), %xmm3
    palignr    $10, 48(%esi), %xmm3
    sbb        $0xffff, %edx
    movdqa     48(%esi), %xmm0
    palignr    $10, 32(%esi), %xmm0
    pcmpeqb    32(%edi), %xmm0
    lea        32(%esi), %esi
    pcmpeqb    48(%edi), %xmm3

    lea        32(%edi), %edi
    jz         L(shr_10_gobble_loop)
    pand       %xmm0, %xmm3

    cmp        $0, %ecx
    jge        L(shr_10_gobble_next)
    inc        %edx
    add        $32, %ecx
L(shr_10_gobble_next):
    test       %edx, %edx
    jnz        L(exit)

    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)

    lea        (%ecx, %edi,1), %eax
    lea        10(%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    /* Shift 12: see L(shr_2). */
    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_12):
    cmp        $80, %ecx
    lea        -48(%ecx), %ecx
    mov        %edx, %eax
    jae        L(shr_12_gobble)

    movdqa     16(%esi), %xmm1
    movdqa     %xmm1, %xmm2
    palignr    $12, (%esi), %xmm1
    pcmpeqb    (%edi), %xmm1

    movdqa     32(%esi), %xmm3
    palignr    $12, %xmm2, %xmm3
    pcmpeqb    16(%edi), %xmm3

    pand       %xmm1, %xmm3
    pmovmskb   %xmm3, %edx
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)
    lea        (%ecx, %edi,1), %eax
    lea        12(%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_12_gobble):
    sub        $32, %ecx
    movdqa     16(%esi), %xmm0
    palignr    $12, (%esi), %xmm0
    pcmpeqb    (%edi), %xmm0

    movdqa     32(%esi), %xmm3
    palignr    $12, 16(%esi), %xmm3
    pcmpeqb    16(%edi), %xmm3

L(shr_12_gobble_loop):
    pand       %xmm0, %xmm3
    sub        $32, %ecx
    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1

    movdqa     64(%esi), %xmm3
    palignr    $12, 48(%esi), %xmm3
    sbb        $0xffff, %edx
    movdqa     48(%esi), %xmm0
    palignr    $12, 32(%esi), %xmm0
    pcmpeqb    32(%edi), %xmm0
    lea        32(%esi), %esi
    pcmpeqb    48(%edi), %xmm3

    lea        32(%edi), %edi
    jz         L(shr_12_gobble_loop)
    pand       %xmm0, %xmm3

    cmp        $0, %ecx
    jge        L(shr_12_gobble_next)
    inc        %edx
    add        $32, %ecx
L(shr_12_gobble_next):
    test       %edx, %edx
    jnz        L(exit)

    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)

    lea        (%ecx, %edi,1), %eax
    lea        12(%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    /* Shift 14 (also the fall-back of the dispatch): see L(shr_2). */
    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_14):
    cmp        $80, %ecx
    lea        -48(%ecx), %ecx
    mov        %edx, %eax
    jae        L(shr_14_gobble)

    movdqa     16(%esi), %xmm1
    movdqa     %xmm1, %xmm2
    palignr    $14, (%esi), %xmm1
    pcmpeqb    (%edi), %xmm1

    movdqa     32(%esi), %xmm3
    palignr    $14, %xmm2, %xmm3
    pcmpeqb    16(%edi), %xmm3

    pand       %xmm1, %xmm3
    pmovmskb   %xmm3, %edx
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)
    lea        (%ecx, %edi,1), %eax
    lea        14(%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
    CFI_REMEMBER_STATE
    .p2align 4
L(shr_14_gobble):
    sub        $32, %ecx
    movdqa     16(%esi), %xmm0
    palignr    $14, (%esi), %xmm0
    pcmpeqb    (%edi), %xmm0

    movdqa     32(%esi), %xmm3
    palignr    $14, 16(%esi), %xmm3
    pcmpeqb    16(%edi), %xmm3

L(shr_14_gobble_loop):
    pand       %xmm0, %xmm3
    sub        $32, %ecx
    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1

    movdqa     64(%esi), %xmm3
    palignr    $14, 48(%esi), %xmm3
    sbb        $0xffff, %edx
    movdqa     48(%esi), %xmm0
    palignr    $14, 32(%esi), %xmm0
    pcmpeqb    32(%edi), %xmm0
    lea        32(%esi), %esi
    pcmpeqb    48(%edi), %xmm3

    lea        32(%edi), %edi
    jz         L(shr_14_gobble_loop)
    pand       %xmm0, %xmm3

    cmp        $0, %ecx
    jge        L(shr_14_gobble_next)
    inc        %edx
    add        $32, %ecx
L(shr_14_gobble_next):
    test       %edx, %edx
    jnz        L(exit)

    pmovmskb   %xmm3, %edx
    movdqa     %xmm0, %xmm1
    lea        32(%edi), %edi
    lea        32(%esi), %esi
    sub        $0xffff, %edx
    jnz        L(exit)

    lea        (%ecx, %edi,1), %eax
    lea        14(%ecx, %esi,1), %edx
    POP        (%edi)
    POP        (%esi)
    jmp        L(less48bytes)

    /*
     * L(exit): a 32-byte step found a difference.
     *   %xmm1 = equality mask of the first 16 bytes of that step,
     *   %edx  = (combined 32-byte equality bits) - 0xffff, non-zero,
     *   %edi / %esi = cursors just past the 32 bytes, %eax = s1 shift.
     * If the first half is clean, the difference is in the second half and
     * %edx already describes it. Otherwise step both cursors back by 16 and
     * use the first-half value instead.
     */
    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
    CFI_REMEMBER_STATE
    .p2align 4
L(exit):
    pmovmskb   %xmm1, %ebx
    sub        $0xffff, %ebx
    jz         L(first16bytes)
    lea        -16(%esi), %esi
    lea        -16(%edi), %edi
    mov        %ebx, %edx

L(first16bytes):
    add        %eax, %esi               /* undo the alignment shift of s1 */
    /*
     * L(less16bytes): the differing 16 bytes are at -16(%edi) / -16(%esi).
     * %edx = mask - 0xffff is the negation of the "bytes differ" bit set, so
     * its lowest set bit is the index of the first differing byte. The bit
     * tests below pick the 16-bit unit containing that byte.
     */
L(less16bytes):
    test       %dl, %dl
    jz         L(next_four_words)
    test       $15, %dl
    jz         L(second_two_words)
    test       $3, %dl
    jz         L(second_word)
    movzwl     -16(%edi), %eax
    movzwl     -16(%esi), %ebx
    subl       %ebx, %eax
    RETURN

    .p2align 4
L(second_word):
    movzwl     -14(%edi), %eax
    movzwl     -14(%esi), %ebx
    subl       %ebx, %eax
    RETURN

    .p2align 4
L(second_two_words):
    test       $63, %dl
    jz         L(fourth_word)
    movzwl     -12(%edi), %eax
    movzwl     -12(%esi), %ebx
    subl       %ebx, %eax
    RETURN

    .p2align 4
L(fourth_word):
    movzwl     -10(%edi), %eax
    movzwl     -10(%esi), %ebx
    subl       %ebx, %eax
    RETURN

    .p2align 4
L(next_four_words):
    test       $15, %dh
    jz         L(fourth_two_words)
    test       $3, %dh
    jz         L(sixth_word)
    movzwl     -8(%edi), %eax
    movzwl     -8(%esi), %ebx
    subl       %ebx, %eax
    RETURN

    .p2align 4
L(sixth_word):
    movzwl     -6(%edi), %eax
    movzwl     -6(%esi), %ebx
    subl       %ebx, %eax
    RETURN

    .p2align 4
L(fourth_two_words):
    test       $63, %dh
    jz         L(eighth_word)
    movzwl     -4(%edi), %eax
    movzwl     -4(%esi), %ebx
    subl       %ebx, %eax
    RETURN

    .p2align 4
L(eighth_word):
    movzwl     -2(%edi), %eax
    movzwl     -2(%esi), %ebx
    subl       %ebx, %eax
    RETURN

    # Unreachable, but needed for static analysis in the check_cfi.py script,
    # since it does just single forward pass, but the code below is only
    # reachable via a backward branch.
    CFI_DEF_CFA (esp, 4)
    PUSH       (%ebx)

    /*
     * Tail dispatch. On entry to L(less48bytes):
     *   %eax / %edx = one past the end of s0 / s1,
     *   %ecx        = remaining byte count (below 48),
     *   only %ebx is still pushed.
     * The remaining count selects an entry point in the chain below.
     */
    .p2align 4
L(more8bytes):
    cmp        $16, %ecx
    jae        L(more16bytes)
    cmp        $8, %ecx
    je         L(8bytes)
    cmp        $10, %ecx
    je         L(10bytes)
    cmp        $12, %ecx
    je         L(12bytes)
    jmp        L(14bytes)

    .p2align 4
L(more16bytes):
    cmp        $24, %ecx
    jae        L(more24bytes)
    cmp        $16, %ecx
    je         L(16bytes)
    cmp        $18, %ecx
    je         L(18bytes)
    cmp        $20, %ecx
    je         L(20bytes)
    jmp        L(22bytes)

    .p2align 4
L(more24bytes):
    cmp        $32, %ecx
    jae        L(more32bytes)
    cmp        $24, %ecx
    je         L(24bytes)
    cmp        $26, %ecx
    je         L(26bytes)
    cmp        $28, %ecx
    je         L(28bytes)
    jmp        L(30bytes)

    .p2align 4
L(more32bytes):
    cmp        $40, %ecx
    jae        L(more40bytes)
    cmp        $32, %ecx
    je         L(32bytes)
    cmp        $34, %ecx
    je         L(34bytes)
    cmp        $36, %ecx
    je         L(36bytes)
    jmp        L(38bytes)

    .p2align 4
L(less48bytes):
    cmp        $8, %ecx
    jae        L(more8bytes)
    cmp        $2, %ecx
    je         L(2bytes)
    cmp        $4, %ecx
    je         L(4bytes)
    jmp        L(6bytes)

    .p2align 4
L(more40bytes):
    cmp        $40, %ecx
    je         L(40bytes)
    cmp        $42, %ecx
    je         L(42bytes)
    cmp        $44, %ecx
    je         L(44bytes)
    jmp        L(46bytes)

    /*
     * Fall-through chain: L(Nbytes) compares the 16-bit unit N bytes before
     * the end and continues with the next one. The first non-zero difference
     * (left in %ecx) leaves through L(memcmp16_exit).
     */
    .p2align 4
L(46bytes):
    movzwl     -46(%eax), %ecx
    movzwl     -46(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(44bytes):
    movzwl     -44(%eax), %ecx
    movzwl     -44(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(42bytes):
    movzwl     -42(%eax), %ecx
    movzwl     -42(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(40bytes):
    movzwl     -40(%eax), %ecx
    movzwl     -40(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(38bytes):
    movzwl     -38(%eax), %ecx
    movzwl     -38(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(36bytes):
    movzwl     -36(%eax), %ecx
    movzwl     -36(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(34bytes):
    movzwl     -34(%eax), %ecx
    movzwl     -34(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(32bytes):
    movzwl     -32(%eax), %ecx
    movzwl     -32(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(30bytes):
    movzwl     -30(%eax), %ecx
    movzwl     -30(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(28bytes):
    movzwl     -28(%eax), %ecx
    movzwl     -28(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(26bytes):
    movzwl     -26(%eax), %ecx
    movzwl     -26(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(24bytes):
    movzwl     -24(%eax), %ecx
    movzwl     -24(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(22bytes):
    movzwl     -22(%eax), %ecx
    movzwl     -22(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(20bytes):
    movzwl     -20(%eax), %ecx
    movzwl     -20(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(18bytes):
    movzwl     -18(%eax), %ecx
    movzwl     -18(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(16bytes):
    movzwl     -16(%eax), %ecx
    movzwl     -16(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(14bytes):
    movzwl     -14(%eax), %ecx
    movzwl     -14(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(12bytes):
    movzwl     -12(%eax), %ecx
    movzwl     -12(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(10bytes):
    movzwl     -10(%eax), %ecx
    movzwl     -10(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(8bytes):
    movzwl     -8(%eax), %ecx
    movzwl     -8(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(6bytes):
    movzwl     -6(%eax), %ecx
    movzwl     -6(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(4bytes):
    movzwl     -4(%eax), %ecx
    movzwl     -4(%edx), %ebx
    subl       %ebx, %ecx
    jne        L(memcmp16_exit)
L(2bytes):
    /* Last unit: its difference (possibly 0) is the return value. */
    movzwl     -2(%eax), %eax
    movzwl     -2(%edx), %ebx
    subl       %ebx, %eax
    POP        (%ebx)
    ret
    CFI_PUSH   (%ebx)

    .p2align 4
L(memcmp16_exit):
    POP        (%ebx)
    mov        %ecx, %eax               /* return the non-zero difference */
    ret
END_FUNCTION MEMCMP