/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/******************************************************************************/
//                     ALGORITHM DESCRIPTION
//                     ---------------------
//
// Description:
//  Let K = 64 (table size).
//
//  Four sub-domains:
//    1. |x| < 1/(2*K)
//         expm1(x) ~ P(x)
//    2. 1/(2*K) <= |x| <= 56*log(2)
//         e^x - 1 = 2^(x/log(2)) - 1 = 2^n * T[j] * (1 + P(y)) - 1
//    3. 56*log(2) < x < MAX_LOG
//         e^x - 1 ~ e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y))
//    4. x < -56*log(2)
//         e^x - 1 = -1 + e^x ~ -1
//    where
//       x = m*log(2)/K + y,    y in [-log(2)/K..log(2)/K]
//       m = n*K + j,           m,n,j - signed integer, j in [-K/2..K/2]
//       values of 2^(j/K) are tabulated as T[j] = T_hi[j] ( 1 + T_lo[j]).
//
//       P(y) is a minimax polynomial approximation of exp(x)-1
//       on small interval [-log(2)/K..log(2)/K] (were calculated by Maple V).
//
//    In case 3, to avoid problems with arithmetic overflow and underflow,
//    value of 2^n is safely computed as 2^n1 * 2^n2 where n1 in
//    [-BIAS/2..BIAS/2] and BIAS is a value of exponent bias.
//
//  Special cases:
//    expm1(NaN) is NaN
//    expm1(+INF) is +INF
//    expm1(-INF) is -1
//    expm1(x) is x for subnormals
//    for finite argument, only expm1(0)=0 is exact.
//    For IEEE double
//      if x > 709.782712893383973096 then expm1(x) overflow
//
/******************************************************************************/

//------------------------------------------------------------------------------
// double expm1(double x)
//
// Syntax: GNU as, AT&T operand order, run through the C preprocessor.
// In:     %xmm0 (low lane) = x
// Out:    %xmm0 (low lane) = result
// Uses:   %eax, %ecx, %edx, %r11, %xmm0-%xmm7, x87 stack and control word
//         (the control word is restored before leaving the x87 path), flags.
//
// Stack frame (56 bytes, released before ret):
//    0(%rsp)  saved x87 control word (2 bytes)
//    4(%rsp)  x87 control word with bits 8-9 set (2 bytes)
//   16(%rsp)  scratch for SSE <-> x87 transfers
//   24(%rsp)  scratch for SSE <-> x87 transfers
//   32(%rsp)  copy of the argument x (low dword at 32, high dword at 36)
//   40(%rsp)  result round-trip slot on the exceptional exits
//
// The ..___tag_value_expm1.* labels are referenced by the hand-written
// .eh_frame at the end of this file and must stay attached to the same
// instructions (entry, after the frame allocation, after the release, end).
//------------------------------------------------------------------------------

#include <private/bionic_asm.h>
# -- Begin  expm1
ENTRY(expm1)
# parameter 1: %xmm0
..B1.1:
..___tag_value_expm1.1:
        subq      $56, %rsp
..___tag_value_expm1.3:
        movsd     %xmm0, 32(%rsp)               // keep x for the sign / special-value tests
..B1.2:
        unpcklpd  %xmm0, %xmm0                  // both lanes = x
        movapd    cv(%rip), %xmm1
        movapd    Shifter(%rip), %xmm6
        movapd    16+cv(%rip), %xmm2
        movapd    32+cv(%rip), %xmm3
        // eax = top 16 bits of x with the sign bit cleared.  The main path
        // needs 16304 (0x3FB0) <= eax <= 16527 (0x408F); either difference
        // going negative sets the sign bit of the OR below.
        pextrw    $3, %xmm0, %eax
        andl      $32767, %eax
        movl      $16527, %edx
        subl      %eax, %edx
        subl      $16304, %eax
        orl       %eax, %edx
        cmpl      $-2147483648, %edx
        jae       .L_2TAG_PACKET_0.0.2          // out of range: tiny, huge, Inf or NaN

        // Argument reduction.  Adding and subtracting Shifter (1.5 * 2^52)
        // rounds x * cv[0] to an integer m; xmm7 keeps the biased sum so m
        // can be read from its low bits.  y = x - m * cv[1] - m * cv[2]
        // (cv[1], cv[2]: presumably the hi and lo parts of log(2)/64).
        mulpd     %xmm0, %xmm1
        addpd     %xmm6, %xmm1
        movapd    %xmm1, %xmm7
        subpd     %xmm6, %xmm1
        mulpd     %xmm1, %xmm2
        movapd    48+cv(%rip), %xmm4
        mulpd     %xmm1, %xmm3
        movapd    64+cv(%rip), %xmm5
        subpd     %xmm2, %xmm0
        movd      %xmm7, %eax                   // eax = m
        movl      %eax, %ecx
        andl      $63, %ecx                     // j = m & 63
        shll      $4, %ecx                      // byte offset of 16-byte table entry j
        sarl      $6, %eax                      // n = m >> 6 (arithmetic)
        movl      %eax, %edx
        subpd     %xmm3, %xmm0                  // xmm0 = y in both lanes
        lea       Tbl_addr(%rip), %r11
        movapd    (%rcx,%r11), %xmm2            // table entry j
        movq      80+cv(%rip), %xmm3
        // Polynomial in y, evaluated two lanes at a time.
        mulpd     %xmm0, %xmm4
        movapd    %xmm0, %xmm1                  // xmm1 = y
        mulpd     %xmm0, %xmm0
        mulsd     %xmm0, %xmm3
        addpd     %xmm4, %xmm5
        mulsd     %xmm0, %xmm0
        movq      %xmm2, %xmm4                  // xmm4 = low qword of the table entry
        unpckhpd  %xmm2, %xmm2                  // xmm2 = high qword of the table entry
        // Build 2^n: clear the low 6 bits of m, add 1023*64, shift so that
        // (n + 1023) lands in the exponent field.
        movdqa    mmask(%rip), %xmm6
        pand      %xmm6, %xmm7
        movdqa    bias(%rip), %xmm6
        paddq     %xmm6, %xmm7
        psllq     $46, %xmm7
        mulsd     %xmm0, %xmm3
        mulpd     %xmm5, %xmm0
        // Unsigned test: taken when n < -894 or n > 1022.
        addl      $894, %edx
        cmpl      $1916, %edx
        ja        .L_2TAG_PACKET_1.0.2

        addsd     %xmm3, %xmm0
        xorpd     %xmm3, %xmm3
        movl      $16368, %eax
        pinsrw    $3, %eax, %xmm3               // xmm3 = 1.0 (0x3FF0 in the top word)
        orpd      %xmm7, %xmm2                  // xmm2 = 2^n with the table mantissa bits
        mulsd     %xmm4, %xmm7
        movq      %xmm3, %xmm6
        addsd     %xmm1, %xmm3
        pextrw    $3, %xmm2, %edx               // top 16 bits of the scaled table value
        pshufd    $238, %xmm0, %xmm5
        psrlq     $38, %xmm3                    // truncate (1 + y) to its leading bits
        psllq     $38, %xmm3
        movq      %xmm2, %xmm4
        subsd     %xmm3, %xmm6
        addsd     %xmm5, %xmm0
        addsd     %xmm6, %xmm1
        addsd     %xmm7, %xmm4
        mulsd     %xmm3, %xmm7
        mulsd     %xmm2, %xmm3
        xorpd     %xmm5, %xmm5
        movl      $16368, %eax
        pinsrw    $3, %eax, %xmm5               // xmm5 = 1.0, the term to subtract
        addsd     %xmm1, %xmm0
        // Pick the summation order from the magnitude of the scaled table
        // value.  The three variants below differ only in where the 1.0 in
        // xmm5 is subtracted.
        movl      $17184, %ecx
        subl      %edx, %ecx
        subl      $16256, %edx
        orl       %edx, %ecx
        jl        .L_2TAG_PACKET_2.0.2
        mulsd     %xmm4, %xmm0
        subsd     %xmm5, %xmm3
        addsd     %xmm7, %xmm0
        addsd     %xmm3, %xmm0
.L_2TAG_PACKET_3.0.2:
        jmp       ..B1.5
.L_2TAG_PACKET_2.0.2:
        cmpl      $0, %edx
        jl        .L_2TAG_PACKET_4.0.2
        // Top word above 17184: fold the 1.0 into the xmm7 term.
        mulsd     %xmm4, %xmm0
        subsd     %xmm5, %xmm7
        addsd     %xmm7, %xmm0
        addsd     %xmm3, %xmm0
        jmp       ..B1.5
.L_2TAG_PACKET_4.0.2:
        // Top word below 16256: subtract the 1.0 last.
        mulsd     %xmm4, %xmm0
        addsd     %xmm7, %xmm0
        addsd     %xmm3, %xmm0
        subsd     %xmm5, %xmm0
        jmp       ..B1.5

.L_2TAG_PACKET_1.0.2:
        // n outside [-894, 1022].
        movl      36(%rsp), %ecx                // high dword of x (carries the sign)
        addsd     %xmm0, %xmm1
        unpckhpd  %xmm0, %xmm0
        addsd     %xmm1, %xmm0                  // xmm0 = summed polynomial terms
        cmpl      $0, %ecx
        jl        .L_2TAG_PACKET_5.0.2          // negative x: result is -1
        // Positive x: finish on the x87 stack with 2^n split as 2^n1 * 2^n2.
        // No 1.0 is subtracted on this path.
        fstcw     (%rsp)
        movw      (%rsp), %dx
        orw       $768, %dx                     // set control-word bits 8-9 (precision control)
        movw      %dx, 4(%rsp)
        fldcw     4(%rsp)
        movl      %eax, %edx
        sarl      $1, %eax                      // n1 = n >> 1
        subl      %eax, %edx                    // n2 = n - n1
        movdqa    emask(%rip), %xmm6
        pandn     %xmm2, %xmm6                  // table high qword without sign/exponent bits
        addl      $1023, %eax
        movd      %eax, %xmm3
        psllq     $52, %xmm3                    // xmm3 = 2^n1
        orpd      %xmm3, %xmm6                  // xmm6 = 2^n1 with the table mantissa bits
        mulsd     %xmm3, %xmm4                  // scale the table low qword by 2^n1
        movsd     %xmm0, 16(%rsp)
        fldl      16(%rsp)
        movsd     %xmm6, 24(%rsp)
        fldl      24(%rsp)
        movsd     %xmm4, 16(%rsp)
        fldl      16(%rsp)
        addl      $1023, %edx
        movd      %edx, %xmm4
        psllq     $52, %xmm4                    // xmm4 = 2^n2
        faddp     %st, %st(1)                   // T = the two scaled table parts summed
        fmul      %st, %st(1)                   // st(1) = polynomial * T
        faddp     %st, %st(1)                   // T + polynomial * T
        movsd     %xmm4, 24(%rsp)
        fldl      24(%rsp)
        fmulp     %st, %st(1)                   // * 2^n2
        fstpl     16(%rsp)
        movsd     16(%rsp), %xmm0
        fldcw     (%rsp)                        // restore the caller control word
        pextrw    $3, %xmm0, %ecx
        andl      $32752, %ecx
        cmpl      $32752, %ecx                  // exponent field all ones: result is Inf
        jae       .L_2TAG_PACKET_7.0.2
        jmp       ..B1.5

.L_2TAG_PACKET_8.0.2:
        // x >= 1024 with a positive sign; eax = high dword of x.
        cmpl      $2146435072, %eax             // 0x7FF00000
        jae       .L_2TAG_PACKET_9.0.2          // +Inf or NaN
        movsd     XMAX(%rip), %xmm0
        mulsd     %xmm0, %xmm0                  // largest finite double squared: overflows
        jmp       .L_2TAG_PACKET_7.0.2
.L_2TAG_PACKET_9.0.2:
        // Exponent field is all ones: separate Inf from NaN.
        movl      36(%rsp), %eax
        movl      32(%rsp), %edx
        movl      %eax, %ecx
        andl      $2147483647, %eax
        cmpl      $2146435072, %eax
        ja        .L_2TAG_PACKET_10.0.2         // mantissa bits set in the high dword: NaN
        cmpl      $0, %edx
        jne       .L_2TAG_PACKET_10.0.2         // mantissa bits set in the low dword: NaN
        cmpl      $0, %ecx
        jl        .L_2TAG_PACKET_5.0.2          // -Inf: result is -1
        movq      INF(%rip), %xmm0              // +Inf: result is +Inf
        jmp       ..B1.5
.L_2TAG_PACKET_10.0.2:
        movsd     32(%rsp), %xmm0
        addsd     %xmm0, %xmm0                  // NaN in, NaN out
        jmp       ..B1.5

.L_2TAG_PACKET_12.0.2:
        // Top word of |x| below 16304 (0x3FB0).
        addl      $16304, %eax                  // undo the earlier subtraction
        cmpl      $15504, %eax                  // 0x3C90
        jb        .L_2TAG_PACKET_13.0.2
        // Direct polynomial in x with the cvl coefficients; no table lookup.
        movapd    cvl(%rip), %xmm2
        pshufd    $68, %xmm0, %xmm1
        movapd    16+cvl(%rip), %xmm3
        movapd    32+cvl(%rip), %xmm4
        movq      48+cvl(%rip), %xmm5
        mulsd     %xmm1, %xmm1
        xorpd     %xmm6, %xmm6
        movl      $16352, %eax
        pinsrw    $3, %eax, %xmm6               // xmm6 = 0.5 (0x3FE0 in the top word)
        mulpd     %xmm0, %xmm2
        xorpd     %xmm7, %xmm7
        movl      $16368, %edx
        pinsrw    $3, %edx, %xmm7               // xmm7 = 1.0
        addpd     %xmm3, %xmm2
        mulsd     %xmm1, %xmm5
        pshufd    $228, %xmm1, %xmm3
        mulpd     %xmm1, %xmm1
        mulsd     %xmm0, %xmm6                  // x / 2
        mulpd     %xmm0, %xmm2
        addpd     %xmm4, %xmm2
        movq      %xmm7, %xmm4
        addsd     %xmm6, %xmm7                  // 1 + x/2
        mulpd     %xmm3, %xmm1
        psrlq     $27, %xmm7                    // truncate 1 + x/2 to its leading bits
        psllq     $27, %xmm7
        movq      HIGHMASK(%rip), %xmm3
        subsd     %xmm7, %xmm4
        mulpd     %xmm1, %xmm2
        addsd     %xmm4, %xmm6
        pshufd    $238, %xmm2, %xmm1
        addsd     %xmm2, %xmm6
        andpd     %xmm0, %xmm3                  // leading bits of x (HIGHMASK clears the low 26)
        movq      %xmm0, %xmm4
        addsd     %xmm6, %xmm1
        subsd     %xmm3, %xmm0                  // trailing bits of x
        addsd     %xmm5, %xmm1
        mulsd     %xmm7, %xmm3
        mulsd     %xmm7, %xmm0
        mulsd     %xmm1, %xmm4
        addsd     %xmm4, %xmm0
        addsd     %xmm3, %xmm0
        jmp       ..B1.5
.L_2TAG_PACKET_13.0.2:
        // Top word of |x| below 15504: the result is x itself.
        cmpl      $16, %eax
        jae       .L_2TAG_PACKET_3.0.2          // exponent field non-zero: return x
        movq      %xmm0, %xmm2
        movd      %xmm0, %eax
        psrlq     $31, %xmm2
        movd      %xmm2, %ecx
        orl       %ecx, %eax                    // zero only when every non-sign bit is zero
        je        .L_2TAG_PACKET_3.0.2          // +0 or -0: return x
        // Subnormal x: square the smallest normal double (0x0010 in the top
        // word) so that underflow is signalled, then return x unchanged.
        movl      $16, %edx
        xorpd     %xmm1, %xmm1
        pinsrw    $3, %edx, %xmm1
        mulsd     %xmm1, %xmm1
        jmp       .L_2TAG_PACKET_7.0.2

.L_2TAG_PACKET_0.0.2:
        // Out-of-range dispatch.  eax still holds (top word - 16304).
        cmpl      $0, %eax
        jl        .L_2TAG_PACKET_12.0.2         // top word below 16304: small argument
        movl      36(%rsp), %eax                // high dword of x
        cmpl      $1083179008, %eax             // 0x40900000; signed compare
        jge       .L_2TAG_PACKET_8.0.2          // positive and >= 1024 (or +Inf / NaN)
        cmpl      $-1048576, %eax               // 0xFFF00000; unsigned compare
        jae       .L_2TAG_PACKET_9.0.2          // -Inf or NaN with the sign bit set
.L_2TAG_PACKET_5.0.2:
        // Result is -1.0 (0xBFF0 in the top word).
        xorpd     %xmm0, %xmm0
        movl      $49136, %eax
        pinsrw    $3, %eax, %xmm0
        jmp       ..B1.5
.L_2TAG_PACKET_7.0.2:
        // Exceptional exits: the low 64 bits of xmm0 go through the stack
        // and come back unchanged; the reload zeroes the upper lane.
        movq      %xmm0, 40(%rsp)
..B1.3:
        movq      40(%rsp), %xmm0
..B1.5:
        addq      $56, %rsp
..___tag_value_expm1.4:
        ret
..___tag_value_expm1.5:
END(expm1)
# -- End  expm1
        .section .rodata, "a"
        .align 16
// cv: constants for the main (table-driven) path.
cv:
        .long     1697350398
        .long     1079448903
        .long     1697350398
        .long     1079448903
        .long     4277796864
        .long     1065758274
        .long     4277796864
        .long     1065758274
        .long     3164486458
        .long     1025308570
        .long     3164486458
        .long     1025308570
        .long     1963358694
        .long     1065423121
        .long     1431655765
        .long     1069897045
        .long     1431655765
        .long     1067799893
        .long     0
        .long     1071644672
        .long     381774871
        .long     1062650220
        .long     381774871
        .long     1062650220
        .type     cv,@object
        .size     cv,96
        .align 16
// Shifter: 1.5 * 2^52 in both lanes, used to round to an integer.
Shifter:
        .long     0
        .long     1127743488
        .long     0
        .long     1127743488
        .type     Shifter,@object
        .size     Shifter,16
        .align 16
// Tbl_addr: 64 entries of 16 bytes each, indexed by j = m & 63.
Tbl_addr:
        .long     0
        .long     0
        .long     0
        .long     0
        .long     1000070955
        .long     1042145304
        .long     1040187392
        .long     11418
        .long     988267849
        .long     1039500660
        .long     3539992576
        .long     22960
        .long     36755401
        .long     1042114290
        .long     402653184
        .long     34629
        .long     3634769483
        .long     1042178627
        .long     1820327936
        .long     46424
        .long     2155991225
        .long     1041560680
        .long     847249408
        .long     58348
        .long     2766913307
        .long     1039293264
        .long     3489660928
        .long     70401
        .long     3651174602
        .long     1040488175
        .long     2927624192
        .long     82586
        .long     3073892131
        .long     1042240606
        .long     1006632960
        .long     94904
        .long     1328391742
        .long     1042019037
        .long     3942645760
        .long     107355
        .long     2650893825
        .long     1041903210
        .long     822083584
        .long     119943
        .long     2397289153
        .long     1041802037
        .long     2281701376
        .long     132667
        .long     430997175
        .long     1042110606
        .long     1845493760
        .long     145530
        .long     1230936525
        .long     1041801015
        .long     1702887424
        .long     158533
        .long     740675935
        .long     1040178913
        .long     4110417920
        .long     171677
        .long     3489810261
        .long     1041825986
        .long     2793406464
        .long     184965
        .long     2532600530
        .long     1040767882
        .long     167772160
        .long     198398
        .long     3542557060
        .long     1041827263
        .long     2986344448
        .long     211976
        .long     1401563777
        .long     1041061093
        .long     922746880
        .long     225703
        .long     3129406026
        .long     1041852413
        .long     880803840
        .long     239579
        .long     900993572
        .long     1039283234
        .long     1275068416
        .long     253606
        .long     2115029358
        .long     1042140042
        .long     562036736
        .long     267786
        .long     1086643152
        .long     1041785419
        .long     1610612736
        .long     282120
        .long     82864366
        .long     1041256244
        .long     3045064704
        .long     296610
        .long     2392968152
        .long     1040913683
        .long     3573547008
        .long     311258
        .long     2905856183
        .long     1040002214
        .long     1988100096
        .long     326066
        .long     3742008261
        .long     1040011137
        .long     1451229184
        .long     341035
        .long     863393794
        .long     1040880621
        .long     914358272
        .long     356167
        .long     1446136837
        .long     1041372426
        .long     3707764736
        .long     371463
        .long     927855201
        .long     1040617636
        .long     360710144
        .long     386927
        .long     1492679939
        .long     1041050306
        .long     2952790016
        .long     402558
        .long     608827001
        .long     1041582217
        .long     2181038080
        .long     418360
        .long     606260204
        .long     1042271987
        .long     1711276032
        .long     434334
        .long     3163044019
        .long     1041843851
        .long     1006632960
        .long     450482
        .long     4148747325
        .long     1041962972
        .long     3900702720
        .long     466805
        .long     802924201
        .long     1041275378
        .long     1442840576
        .long     483307
        .long     3052749833
        .long     1041940577
        .long     1937768448
        .long     499988
        .long     2216116399
        .long     1041486744
        .long     914358272
        .long     516851
        .long     2729697836
        .long     1041445764
        .long     2566914048
        .long     533897
        .long     540608356
        .long     1041310907
        .long     2600468480
        .long     551129
        .long     2916344493
        .long     1040535661
        .long     1107296256
        .long     568549
        .long     731391814
        .long     1039497014
        .long     2566914048
        .long     586158
        .long     1024722704
        .long     1041461625
        .long     2961178624
        .long     603959
        .long     3806831748
        .long     1041732499
        .long     2675965952
        .long     621954
        .long     238953304
        .long     1040316488
        .long     2189426688
        .long     640145
        .long     749123235
        .long     1041725785
        .long     2063597568
        .long     658534
        .long     1168187977
        .long     1041175214
        .long     2986344448
        .long     677123
        .long     3506096399
        .long     1042186095
        .long     1426063360
        .long     695915
        .long     1470221620
        .long     1041675499
        .long     2566914048
        .long     714911
        .long     3182425146
        .long     1041483134
        .long     3087007744
        .long     734114
        .long     3131698208
        .long     1042208657
        .long     4068474880
        .long     753526
        .long     2300504125
        .long     1041428596
        .long     2415919104
        .long     773150
        .long     2290297931
        .long     1037388400
        .long     3716153344
        .long     792987
        .long     3532148223
        .long     1041626194
        .long     771751936
        .long     813041
        .long     1161884404
        .long     1042015258
        .long     3699376128
        .long     833312
        .long     876383176
        .long     1037968878
        .long     1241513984
        .long     853805
        .long     3379986796
        .long     1042213153
        .long     3699376128
        .long     874520
        .long     1545797737
        .long     1041681569
        .long     58720256
        .long     895462
        .long     2925146801
        .long     1042212567
        .long     855638016
        .long     916631
        .long     1316627971
        .long     1038516204
        .long     3883925504
        .long     938030
        .long     3267869137
        .long     1040337004
        .long     2726297600
        .long     959663
        .long     3720868999
        .long     1041782409
        .long     3992977408
        .long     981531
        .long     433316142
        .long     1041994064
        .long     1526726656
        .long     1003638
        .long     781232103
        .long     1040093400
        .long     2172649472
        .long     1025985
        .type     Tbl_addr,@object
        .size     Tbl_addr,1024
        .align 16
// mmask: 0xFFFFFFC0 per qword; clears the low 6 bits (j) of m.
mmask:
        .long     4294967232
        .long     0
        .long     4294967232
        .long     0
        .type     mmask,@object
        .size     mmask,16
        .align 16
// bias: 65472 = 1023 * 64 per qword; added before the shift that builds 2^n.
bias:
        .long     65472
        .long     0
        .long     65472
        .long     0
        .type     bias,@object
        .size     bias,16
        .align 16
// emask: 0xFFF0000000000000 per qword (sign and exponent bits).
emask:
        .long     0
        .long     4293918720
        .long     0
        .long     4293918720
        .type     emask,@object
        .size     emask,16
        .align 16
// cvl: polynomial coefficients for the small-argument path.
cvl:
        .long     2773927732
        .long     1053236707
        .long     381774871
        .long     1062650220
        .long     379653899
        .long     1056571845
        .long     286331153
        .long     1065423121
        .long     436314138
        .long     1059717536
        .long     1431655765
        .long     1067799893
        .long     1431655765
        .long     1069897045
        .long     0
        .long     1071644672
        .type     cvl,@object
        .size     cvl,64
        .align 8
// XMAX: 0x7FEFFFFFFFFFFFFF, the largest finite double.
XMAX:
        .long     4294967295
        .long     2146435071
        .type     XMAX,@object
        .size     XMAX,8
        .align 8
// INF: 0x7FF0000000000000, +Infinity.
INF:
        .long     0
        .long     2146435072
        .type     INF,@object
        .size     INF,8
        .align 8
// HIGHMASK: 0xFFFFFFFFFC000000, keeps everything but the low 26 bits.
HIGHMASK:
        .long     4227858432
        .long     4294967295
        .type     HIGHMASK,@object
        .size     HIGHMASK,8
        .data
        .section .note.GNU-stack, ""
// -- Begin DWARF2 SEGMENT .eh_frame
        .section .eh_frame,"a",@progbits
.eh_frame_seg:
        .align 1
        .4byte 0x00000014
        .8byte 0x00527a0100000000
        .8byte 0x08070c1b01107801
        .4byte 0x00000190
        .4byte 0x0000001c
        .4byte 0x0000001c
        .4byte ..___tag_value_expm1.1-.
        .4byte ..___tag_value_expm1.5-..___tag_value_expm1.1
        .2byte 0x0400
        .4byte ..___tag_value_expm1.3-..___tag_value_expm1.1
        .2byte 0x400e
        .byte 0x04
        .4byte ..___tag_value_expm1.4-..___tag_value_expm1.3
        .2byte 0x080e
        .byte 0x00
# End