/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

	.file "read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
/* Kevin F. Quinn 2nd July 2006
 * Replaced data segment constants with text-segment instructions.
 */
#define LOAD_MASK(mvins,m1,m2) \
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	mvins	(%esp), m1 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	mvins	(%esp), m2 ;\
	addl	$32, %esp

/* I implemented these as macros because they appear in several places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
	movl	(%ebx), %eax ; \
	addl	$4, %ebx ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
	addl	$4, %ecx

#define DO_ONE_LAST_PIXEL() \
	movl	(%ebx), %eax ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \


/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
#endif
	.type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	pushl	%ebx

#ifdef USE_INNER_EMMS
	emms
#endif
	LOAD_MASK(movq,%mm1,%mm2)

	movl	8(%esp), %ebx	/* source pointer */
	movl	16(%esp), %edx	/* number of pixels to copy */
	movl	12(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L20		/* Bail if there's nothing to do. */

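	/* The main loop below converts two pixels per iteration with a
	 * mask-and-shift swizzle.  Within each 32-bit pixel the alpha and
	 * green bytes stay put and the red and blue bytes trade places; the
	 * 64-bit shifts never mix bytes from the two pixels because of the
	 * masks.  A rough per-pixel C sketch (for reference only, not part
	 * of the build; src and dst stand for the pointers loaded above):
	 *
	 *    uint32_t p = *src++;                // 0xAARRGGBB
	 *    *dst++ = (p & 0xff00ff00)           // keep A and G
	 *           | ((p >> 16) & 0x000000ff)   // move R down to byte 0
	 *           | ((p << 16) & 0x00ff0000);  // move B up to byte 2
	 *
	 * First, if the source pointer is 4-byte but not 8-byte aligned,
	 * copy a single pixel with DO_ONE_PIXEL() so that the movq loads in
	 * the loop are naturally aligned.
	 */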
	movl	%ebx, %eax

	negl	%eax
	sarl	$2, %eax
	andl	$1, %eax
	je	.L17

	subl	%eax, %edx
	DO_ONE_PIXEL()
.L17:

	/* Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */

	movl	%edx, %eax
	shrl	%eax
	jmp	.L18
.L19:
	movq	(%ebx), %mm0
	addl	$8, %ebx

	/* These 9 instructions do what PSHUFB (if there were such an
	 * instruction) could do in 1. :(
	 */

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
	subl	$1, %eax
.L18:
	jne	.L19

#ifdef USE_INNER_EMMS
	emms
#endif

	/* At this point there are either 1 or 0 pixels remaining to be
	 * converted.  Convert the last pixel, if needed.
	 */

	testl	$1, %edx
	je	.L20

	DO_ONE_LAST_PIXEL()

.L20:
	popl	%ebx
	ret
	.size _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
#endif
	.type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
	pushl	%esi
	pushl	%ebx
	pushl	%ebp

#ifdef USE_INNER_EMMS
	emms
#endif

	LOAD_MASK(movq,%mm1,%mm2)

	movl	16(%esp), %ebx	/* source pointer */
	movl	24(%esp), %edx	/* number of pixels to copy */
	movl	20(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L35		/* Bail if there's nothing to do. */

	movl	%esp, %ebp
	subl	$16, %esp
	andl	$0xfffffff0, %esp

	movl	%ebx, %eax
	movl	%edx, %esi

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax
	cmpl	%edx, %eax
	cmovle	%eax, %esi

	subl	%esi, %edx

	testl	$1, %esi
	je	.L32

	DO_ONE_PIXEL()
.L32:

	testl	$2, %esi
	je	.L31

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L31:

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L33
.L34:
	movaps	(%ebx), %xmm0
	addl	$16, %ebx

	/* This would be so much better if we could just move directly from
	 * an SSE register to an MMX register.  Unfortunately, that
	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
	 * instruction.
	 */
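	/* Instead, bounce the 16-byte load through the 16-byte-aligned
	 * scratch area carved out of the stack above, reload it as two
	 * quadwords, and apply the same mask-and-shift swizzle as in the
	 * MMX routine to each half.
	 */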

	movaps	%xmm0, (%esp)
	movq	(%esp), %mm0
	movq	8(%esp), %mm5

	movq	%mm0, %mm3
	movq	%mm0, %mm4
	movq	%mm5, %mm6
	movq	%mm5, %mm7

	pand	%mm2, %mm3
	pand	%mm2, %mm6

	psllq	$16, %mm4
	psllq	$16, %mm7

	psrlq	$16, %mm3
	psrlq	$16, %mm6

	pand	%mm2, %mm4
	pand	%mm2, %mm7

	pand	%mm1, %mm0
	pand	%mm1, %mm5

	por	%mm4, %mm3
	por	%mm7, %mm6

	por	%mm3, %mm0
	por	%mm6, %mm5

	movq	%mm0, (%ecx)
	movq	%mm5, 8(%ecx)
	addl	$16, %ecx

	subl	$1, %eax
.L33:
	jne	.L34

#ifdef USE_INNER_EMMS
	emms
#endif
	movl	%ebp, %esp

	/* At this point there are either [0, 3] pixels remaining to be
	 * converted.
	 */

	testl	$2, %edx
	je	.L36

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L36:

	testl	$1, %edx
	je	.L35

	DO_ONE_LAST_PIXEL()
.L35:
	popl	%ebp
	popl	%ebx
	popl	%esi
	ret
	.size _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
#endif
	.type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
	pushl	%esi
	pushl	%ebx

	LOAD_MASK(movdqu,%xmm1,%xmm2)

	movl	12(%esp), %ebx	/* source pointer */
	movl	20(%esp), %edx	/* number of pixels to copy */
	movl	16(%esp), %ecx	/* destination pointer */

	movl	%ebx, %eax
	movl	%edx, %esi

	testl	%edx, %edx
	jle	.L46		/* Bail if there's nothing to do. */

	/* If the source pointer isn't a multiple of 16 we have to process
	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch instructions.
	 */

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax

	cmpl	%edx, %eax
	cmovbe	%eax, %esi
	subl	%esi, %edx

	testl	$1, %esi
	je	.L41

	DO_ONE_PIXEL()
.L41:
	testl	$2, %esi
	je	.L40

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L40:

	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movdqa instead of
	 * movdqu.
	 */

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L42
.L43:
	movdqa	(%ebx), %xmm0
	addl	$16, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movdqu	%xmm0, (%ecx)
	addl	$16, %ecx
	subl	$1, %eax
.L42:
	jne	.L43


	/* There may be up to 3 pixels remaining to be copied.  Take care
	 * of them now.  We do the 2 pixel case first because the data
	 * will be aligned.
	 */

	testl	$2, %edx
	je	.L47

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L47:

	testl	$1, %edx
	je	.L46

	DO_ONE_LAST_PIXEL()
.L46:

	popl	%ebx
	popl	%esi
	ret
	.size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



#define MASK_565_L	0x07e0f800
#define MASK_565_H	0x0000001f
/* Setting SCALE_ADJUST to 5 gives a perfect match with the
 * classic C implementation in Mesa.  Setting SCALE_ADJUST
 * to 0 is slightly faster but at a small cost to accuracy.
 */
#define SCALE_ADJUST	5
#if SCALE_ADJUST == 5
#define PRESCALE_L	0x00100001
#define PRESCALE_H	0x00000200
#define SCALE_L		0x40C620E8
#define SCALE_H		0x0000839d
#elif SCALE_ADJUST == 0
#define PRESCALE_L	0x00200001
#define PRESCALE_H	0x00000800
#define SCALE_L		0x01040108
#define SCALE_H		0x00000108
#else
#error SCALE_ADJUST must be either 5 or 0.
#endif
#define ALPHA_L		0x00000000
#define ALPHA_H		0x00ff0000

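/* A rough sketch of the arithmetic, using red with SCALE_ADJUST == 5 as an
 * example (values taken from the #defines above):
 *
 *    The 0xf800 word of the 565 mask isolates red as R5 << 11.  pmullw by
 *    the red prescale word (1) and psrlw by SCALE_ADJUST leave R5 << 6, and
 *    pmulhuw by the red scale word (0x20e8) then gives
 *
 *        R8 = (R5 * 64 * 0x20e8) >> 16  ~=  R5 * 8.23  ~=  (R5 * 255) / 31
 *
 *    so R5 = 0 maps to 0 and R5 = 31 maps to 255.  Green and blue work the
 *    same way through the 0x07e0 and 0x001f mask words and their own
 *    prescale/scale factors.
 */
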
/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */

	.text
	.globl _generic_read_RGBA_span_RGB565_MMX
#ifndef USE_DRICORE
	.hidden _generic_read_RGBA_span_RGB565_MMX
#endif
	.type _generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
	emms
#endif

	movl	4(%esp), %eax	/* source pointer */
	movl	8(%esp), %edx	/* destination pointer */
	movl	12(%esp), %ecx	/* number of pixels to copy */

	pushl	$MASK_565_H
	pushl	$MASK_565_L
	movq	(%esp), %mm5
	pushl	$PRESCALE_H
	pushl	$PRESCALE_L
	movq	(%esp), %mm6
	pushl	$SCALE_H
	pushl	$SCALE_L
	movq	(%esp), %mm7
	pushl	$ALPHA_H
	pushl	$ALPHA_L
	movq	(%esp), %mm3
	addl	$32,%esp

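	/* At this point %mm5 holds the 565 component masks (one component per
	 * 16-bit word), %mm6 the prescale factors, %mm7 the scale factors,
	 * and %mm3 the constant 0xff alpha.  The main loop converts four
	 * pixels per iteration, so the count is first turned into the number
	 * of 4-pixel groups.
	 */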
	sarl	$2, %ecx
	jl	.L01		/* Bail early if the count is negative. */
	jmp	.L02

.L03:
	/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
	 * second pixels into the four words of %mm0 and %mm2.
	 */

	movq	(%eax), %mm4
	addl	$8, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2


	/* Mask the pixels so that each word of each register contains only
	 * one color component.
	 */

	pand	%mm5, %mm0
	pand	%mm5, %mm2


	/* Adjust the component values so that they are as small as possible,
	 * but large enough so that we can multiply them by an unsigned 16-bit
	 * number and get a value as large as 0x00ff0000.
	 */

	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif

	/* Scale the input component values to be on the range
	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
	 */

	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2


	/* Always set the alpha value to 0xff.
	 */

	por	%mm3, %mm0
	por	%mm3, %mm2


	/* Pack the 16-bit values to 8-bit values and store the converted
	 * pixel data.
	 */

	packuswb	%mm2, %mm0
	movq	%mm0, (%edx)
	addl	$8, %edx

	pshufw	$0xaa, %mm4, %mm0
	pshufw	$0xff, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

	subl	$1, %ecx
.L02:
	jne	.L03


	/* At this point there can be at most 3 pixels left to process.  If
	 * there are either 2 or 3 left, process 2.
	 */

	movl	12(%esp), %ecx
	testl	$0x02, %ecx
	je	.L04

	movd	(%eax), %mm4
	addl	$4, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

.L04:
	/* At this point there can be at most 1 pixel left to process.
	 * Process it if needed.
	 */

	testl	$0x01, %ecx
	je	.L01

	movzwl	(%eax), %ecx
	movd	%ecx, %mm4

	pshufw	$0x00, %mm4, %mm0

	pand	%mm5, %mm0
	pmullw	%mm6, %mm0
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
#endif
	pmulhuw	%mm7, %mm0

	por	%mm3, %mm0

	packuswb	%mm0, %mm0

	movd	%mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
	emms
#endif
	ret
#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) */

#if defined (__ELF__) && defined (__linux__)
	.section .note.GNU-stack,"",%progbits
#endif