;
; jdsample.asm - upsampling (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
; Constant table for the "fancy" (triangle filter) upsamplers in this file.
; Every entry is four identical 16-bit words.
;
    SECTION     SEG_CONST

    alignz      16
    global      EXTN(jconst_fancy_upsample_mmx)

EXTN(jconst_fancy_upsample_mmx):

PW_ONE      times 4 dw 1                ; h2v1 fancy: bias added before >>2 (even outputs)
PW_TWO      times 4 dw 2                ; h2v1 fancy: bias added before >>2 (odd outputs)
PW_THREE    times 4 dw 3                ; pmullw weight applied to the nearer sample
PW_SEVEN    times 4 dw 7                ; h2v2 fancy: bias added before >>4 (odd outputs)
PW_EIGHT    times 4 dw 8                ; h2v2 fancy: bias added before >>4 (even outputs)

    alignz      16

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; For every input sample s[i] two output samples are produced:
;   out[2i]   = (3*s[i] + s[i-1] + 1) >> 2
;   out[2i+1] = (3*s[i] + s[i+1] + 2) >> 2
; For the first sample of a row s[i-1] is taken to be s[i]; for the last
; sample of a row s[i+1] is taken to be s[i].
;
; Register roles:
;   eax = colctr (rounded up to a multiple of SIZEOF_MMWORD inside a row)
;   ecx = rowctr
;   esi = input_data, then inptr inside a row
;   edi = output_data, then outptr inside a row
;   ebx = GOT address obtained through get_GOT
;   mm0 = zero, mm7 = left-neighbour carry, mm6 = right-neighbour carry
; ecx/edx are not saved; ebx (via pushpic), esi and edi are saved.
;
; NOTE(review): every column pass loads a whole MMWORD of input and stores two
; MMWORDs of output, and one dummy sample may be written at inptr[colctr].
; The row buffers are presumably padded accordingly -- confirm with caller.
;

%define max_v_samp(b)       (b)+8       ; int max_v_samp_factor
%define downsamp_width(b)   (b)+12      ; JDIMENSION downsampled_width
%define input_data(b)       (b)+16      ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b)+20      ; JSAMPARRAY *output_data_ptr

    align       16
    global      EXTN(jsimd_h2v1_fancy_upsample_mmx)

EXTN(jsimd_h2v1_fancy_upsample_mmx):
    push        ebp
    mov         ebp, esp
    pushpic     ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; ebx = GOT address

    ; Nothing to do when either the width or the row count is zero.
    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; eax = colctr
    test        eax, eax
    jz          near .return

    mov         ecx, INT [max_v_samp(ebp)]             ; ecx = rowctr
    test        ecx, ecx
    jz          near .return

    mov         esi, JSAMPARRAY [input_data(ebp)]      ; esi = input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]                  ; edi = output_data
    alignx      16, 7
.rowloop:
    ; Keep the per-row state; it is restored once the row is finished.
    push        eax                     ; colctr
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; esi = inptr
    mov         edi, JSAMPROW [edi]     ; edi = outptr

    ; Width not a multiple of SIZEOF_MMWORD: duplicate the last sample just
    ; past the end so the final block sees it as its right-hand neighbour.
    test        eax, SIZEOF_MMWORD-1
    jz          short .skip
    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
.skip:
    pxor        mm0, mm0                ; mm0 = all zero (for unpacking)
    pcmpeqb     mm7, mm7                ; mm7 = all ones
    psrlq       mm7, (SIZEOF_MMWORD-1)*BYTE_BIT         ; keep lowest byte only
    pand        mm7, MMWORD [esi+0*SIZEOF_MMWORD]       ; mm7 = sample 0 as its own left neighbour

    ; Round colctr up to whole MMWORDs; with a single block go straight to
    ; the last-block path that follows.
    add         eax, byte SIZEOF_MMWORD-1
    and         eax, byte -SIZEOF_MMWORD
    cmp         eax, byte SIZEOF_MMWORD
    ja          short .columnloop
    alignx      16, 7

.columnloop_last:
    ; Last block of the row: the right neighbour of sample 7 is sample 7.
    pcmpeqb     mm6, mm6
    psllq       mm6, (SIZEOF_MMWORD-1)*BYTE_BIT         ; keep highest byte only
    pand        mm6, MMWORD [esi+0*SIZEOF_MMWORD]
    jmp         short .upsample
    alignx      16, 7

.columnloop:
    ; Inner block: the right neighbour of sample 7 is sample 0 of the next
    ; MMWORD, moved into the highest byte.
    movq        mm6, MMWORD [esi+1*SIZEOF_MMWORD]
    psllq       mm6, (SIZEOF_MMWORD-1)*BYTE_BIT

.upsample:
    movq        mm1, MMWORD [esi+0*SIZEOF_MMWORD]
    movq        mm2, mm1
    movq        mm3, mm1                ; mm1=( 0 1 2 3 4 5 6 7)
    psllq       mm2, BYTE_BIT           ; mm2=( - 0 1 2 3 4 5 6)
    psrlq       mm3, BYTE_BIT           ; mm3=( 1 2 3 4 5 6 7 -)

    por         mm2, mm7                ; mm2=(-1 0 1 2 3 4 5 6)
    por         mm3, mm6                ; mm3=( 1 2 3 4 5 6 7 8)

    ; Sample 7 becomes the left neighbour of the next block.
    movq        mm7, mm1
    psrlq       mm7, (SIZEOF_MMWORD-1)*BYTE_BIT         ; mm7=( 7 - - - - - - -)

    ; Widen bytes to words.
    movq        mm4, mm1
    punpcklbw   mm1, mm0                ; mm1=( 0 1 2 3)
    punpckhbw   mm4, mm0                ; mm4=( 4 5 6 7)
    movq        mm5, mm2
    punpcklbw   mm2, mm0                ; mm2=(-1 0 1 2)
    punpckhbw   mm5, mm0                ; mm5=( 3 4 5 6)
    movq        mm6, mm3
    punpcklbw   mm3, mm0                ; mm3=( 1 2 3 4)
    punpckhbw   mm6, mm0                ; mm6=( 5 6 7 8)

    ; centre*3, left+1, right+2
    pmullw      mm1, [GOTOFF(ebx,PW_THREE)]
    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
    paddw       mm2, [GOTOFF(ebx,PW_ONE)]
    paddw       mm5, [GOTOFF(ebx,PW_ONE)]
    paddw       mm3, [GOTOFF(ebx,PW_TWO)]
    paddw       mm6, [GOTOFF(ebx,PW_TWO)]

    paddw       mm2, mm1
    paddw       mm5, mm4
    psrlw       mm2, 2                  ; mm2=OutLE=( 0  2  4  6)
    psrlw       mm5, 2                  ; mm5=OutHE=( 8 10 12 14)
    paddw       mm3, mm1
    paddw       mm6, mm4
    psrlw       mm3, 2                  ; mm3=OutLO=( 1  3  5  7)
    psrlw       mm6, 2                  ; mm6=OutHO=( 9 11 13 15)

    ; Interleave: odd results go into the high byte of each word.
    psllw       mm3, BYTE_BIT
    psllw       mm6, BYTE_BIT
    por         mm2, mm3                ; mm2=OutL=( 0  1  2  3  4  5  6  7)
    por         mm5, mm6                ; mm5=OutH=( 8  9 10 11 12 13 14 15)

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm2
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm5

    sub         eax, byte SIZEOF_MMWORD
    add         esi, byte 1*SIZEOF_MMWORD               ; inptr
    add         edi, byte 2*SIZEOF_MMWORD               ; outptr
    cmp         eax, byte SIZEOF_MMWORD
    ja          near .columnloop        ; more than one block left
    test        eax, eax
    jnz         near .columnloop_last   ; exactly one block left

    pop         esi
    pop         edi
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW               ; input_data
    add         edi, byte SIZEOF_JSAMPROW               ; output_data
    dec         ecx                                     ; rowctr
    jg          near .rowloop

    emms                                ; empty MMX state

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    poppic      ebx
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal
; and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; Two passes are applied to each column block:
;   vertical:   Int0[i] = 3*row[0][i] + row[-1][i]    (upper output row)
;               Int1[i] = 3*row[0][i] + row[+1][i]    (lower output row)
;   horizontal: out[2i]   = (3*Int[i] + Int[i-1] + 8) >> 4
;               out[2i+1] = (3*Int[i] + Int[i+1] + 7) >> 4
; The 16-bit intermediates are parked in the output rows themselves before
; being overwritten with the final bytes.
;
; Register roles inside a row:
;   eax = colctr, ecx = inptr1(above), ebx = inptr0, esi = inptr1(below),
;   edx = outptr0, edi = outptr1.
; ebx is swapped with the saved GOT address (pushpic/movpic/poppic, macros
; from jsimdext.inc) around the code that references the PW_* constants.
;
; Work area:
;   wk(0)/wk(1) = left-neighbour carry for the upper/lower row
;   wk(2)/wk(3) = right-neighbour value for the upper/lower row
;
; NOTE(review): reads input_data[-1] and input_data[+1] relative to the current
; row pointer and may store one dummy sample past each input row; presumably
; the caller provides those context rows and padding -- confirm with caller.
;

%define max_v_samp(b)       (b)+8       ; int max_v_samp_factor
%define downsamp_width(b)   (b)+12      ; JDIMENSION downsampled_width
%define input_data(b)       (b)+16      ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b)+20      ; JSAMPARRAY *output_data_ptr

%define original_ebp        ebp+0
%define wk(i)               ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define WK_NUM              4
%define gotptr              wk(0)-SIZEOF_POINTER            ; void *gotptr

    align       16
    global      EXTN(jsimd_h2v2_fancy_upsample_mmx)

EXTN(jsimd_h2v2_fancy_upsample_mmx):
    ; Build an MMWORD-aligned frame.  [ebp] keeps the unaligned frame value so
    ; the epilogue can restore esp from it.
    push        ebp
    mov         eax, esp                        ; eax = original ebp
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
    mov         [esp], eax
    mov         ebp, esp                        ; ebp = aligned ebp
    lea         esp, [wk(0)]                    ; reserve wk[WK_NUM]
    pushpic     eax                             ; make a room for GOT address
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                             ; ebx = GOT address
    movpic      POINTER [gotptr], ebx           ; save GOT address

    ; Arguments are addressed through the original (unaligned) frame value.
    mov         edx, eax                        ; edx = original ebp
    mov         eax, JDIMENSION [downsamp_width(edx)]   ; eax = colctr
    test        eax, eax
    jz          near .return

    mov         ecx, INT [max_v_samp(edx)]              ; ecx = rowctr
    test        ecx, ecx
    jz          near .return

    mov         esi, JSAMPARRAY [input_data(edx)]       ; esi = input_data
    mov         edi, POINTER [output_data_ptr(edx)]
    mov         edi, JSAMPARRAY [edi]                   ; edi = output_data
    alignx      16, 7
.rowloop:
    ; Keep the per-row state; it is restored once the row pair is finished.
    push        eax                     ; colctr
    push        ecx
    push        edi
    push        esi

    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1

    ; Width not a multiple of SIZEOF_MMWORD: duplicate the last sample of each
    ; of the three input rows just past its end.  edx is borrowed for dl.
    test        eax, SIZEOF_MMWORD-1
    jz          short .skip
    push        edx
    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
    pop         edx
.skip:
    ; -- process the first column block (vertical pass only)

    movq        mm0, MMWORD [ebx+0*SIZEOF_MMWORD]       ; mm0=row[ 0][0]
    movq        mm1, MMWORD [ecx+0*SIZEOF_MMWORD]       ; mm1=row[-1][0]
    movq        mm2, MMWORD [esi+0*SIZEOF_MMWORD]       ; mm2=row[+1][0]

    pushpic     ebx
    movpic      ebx, POINTER [gotptr]   ; load GOT address

    pxor        mm3, mm3                ; mm3 = all zero (for unpacking)
    movq        mm4, mm0
    punpcklbw   mm0, mm3                ; mm0=row[ 0][0]( 0 1 2 3)
    punpckhbw   mm4, mm3                ; mm4=row[ 0][0]( 4 5 6 7)
    movq        mm5, mm1
    punpcklbw   mm1, mm3                ; mm1=row[-1][0]( 0 1 2 3)
    punpckhbw   mm5, mm3                ; mm5=row[-1][0]( 4 5 6 7)
    movq        mm6, mm2
    punpcklbw   mm2, mm3                ; mm2=row[+1][0]( 0 1 2 3)
    punpckhbw   mm6, mm3                ; mm6=row[+1][0]( 4 5 6 7)

    pmullw      mm0, [GOTOFF(ebx,PW_THREE)]
    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]

    pcmpeqb     mm7, mm7
    psrlq       mm7, (SIZEOF_MMWORD-2)*BYTE_BIT         ; mm7 = mask for lowest word

    paddw       mm1, mm0                ; mm1=Int0L=( 0 1 2 3)
    paddw       mm5, mm4                ; mm5=Int0H=( 4 5 6 7)
    paddw       mm2, mm0                ; mm2=Int1L=( 0 1 2 3)
    paddw       mm6, mm4                ; mm6=Int1H=( 4 5 6 7)

    movq        MMWORD [edx+0*SIZEOF_MMWORD], mm1       ; temporarily save
    movq        MMWORD [edx+1*SIZEOF_MMWORD], mm5       ; the intermediate data
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm2
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm6

    ; The left neighbour of the first word is the first word itself.
    pand        mm1, mm7                ; mm1=( 0 - - -)
    pand        mm2, mm7                ; mm2=( 0 - - -)

    movq        MMWORD [wk(0)], mm1
    movq        MMWORD [wk(1)], mm2

    poppic      ebx

    ; Round colctr up to whole MMWORDs; with a single block go straight to
    ; the last-block path that follows.
    add         eax, byte SIZEOF_MMWORD-1
    and         eax, byte -SIZEOF_MMWORD
    cmp         eax, byte SIZEOF_MMWORD
    ja          short .columnloop
    alignx      16, 7

.columnloop_last:
    ; -- process the last column block
    ; The right neighbour of the last word is the last word itself.

    pushpic     ebx
    movpic      ebx, POINTER [gotptr]   ; load GOT address

    pcmpeqb     mm1, mm1
    psllq       mm1, (SIZEOF_MMWORD-2)*BYTE_BIT         ; mm1 = mask for highest word
    movq        mm2, mm1

    pand        mm1, MMWORD [edx+1*SIZEOF_MMWORD]       ; mm1=( - - - 7)
    pand        mm2, MMWORD [edi+1*SIZEOF_MMWORD]       ; mm2=( - - - 7)

    movq        MMWORD [wk(2)], mm1
    movq        MMWORD [wk(3)], mm2

    jmp         short .upsample
    alignx      16, 7

.columnloop:
    ; -- process the next column block
    ; Run the vertical pass on the following block first, so its first word
    ; is available as the right neighbour of the current block.

    movq        mm0, MMWORD [ebx+1*SIZEOF_MMWORD]       ; mm0=row[ 0][1]
    movq        mm1, MMWORD [ecx+1*SIZEOF_MMWORD]       ; mm1=row[-1][1]
    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]       ; mm2=row[+1][1]

    pushpic     ebx
    movpic      ebx, POINTER [gotptr]   ; load GOT address

    pxor        mm3, mm3                ; mm3 = all zero (for unpacking)
    movq        mm4, mm0
    punpcklbw   mm0, mm3                ; mm0=row[ 0][1]( 0 1 2 3)
    punpckhbw   mm4, mm3                ; mm4=row[ 0][1]( 4 5 6 7)
    movq        mm5, mm1
    punpcklbw   mm1, mm3                ; mm1=row[-1][1]( 0 1 2 3)
    punpckhbw   mm5, mm3                ; mm5=row[-1][1]( 4 5 6 7)
    movq        mm6, mm2
    punpcklbw   mm2, mm3                ; mm2=row[+1][1]( 0 1 2 3)
    punpckhbw   mm6, mm3                ; mm6=row[+1][1]( 4 5 6 7)

    pmullw      mm0, [GOTOFF(ebx,PW_THREE)]
    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]

    paddw       mm1, mm0                ; mm1=Int0L=( 0 1 2 3)
    paddw       mm5, mm4                ; mm5=Int0H=( 4 5 6 7)
    paddw       mm2, mm0                ; mm2=Int1L=( 0 1 2 3)
    paddw       mm6, mm4                ; mm6=Int1H=( 4 5 6 7)

    movq        MMWORD [edx+2*SIZEOF_MMWORD], mm1       ; temporarily save
    movq        MMWORD [edx+3*SIZEOF_MMWORD], mm5       ; the intermediate data
    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm6

    psllq       mm1, (SIZEOF_MMWORD-2)*BYTE_BIT         ; mm1=( - - - 0)
    psllq       mm2, (SIZEOF_MMWORD-2)*BYTE_BIT         ; mm2=( - - - 0)

    movq        MMWORD [wk(2)], mm1
    movq        MMWORD [wk(3)], mm2

.upsample:
    ; -- process the upper row (horizontal pass on Int0)

    movq        mm7, MMWORD [edx+0*SIZEOF_MMWORD]       ; mm7=Int0L=( 0 1 2 3)
    movq        mm3, MMWORD [edx+1*SIZEOF_MMWORD]       ; mm3=Int0H=( 4 5 6 7)

    movq        mm0, mm7
    movq        mm4, mm3
    psrlq       mm0, 2*BYTE_BIT                         ; mm0=( 1 2 3 -)
    psllq       mm4, (SIZEOF_MMWORD-2)*BYTE_BIT         ; mm4=( - - - 4)
    movq        mm5, mm7
    movq        mm6, mm3
    psrlq       mm5, (SIZEOF_MMWORD-2)*BYTE_BIT         ; mm5=( 3 - - -)
    psllq       mm6, 2*BYTE_BIT                         ; mm6=( - 4 5 6)

    por         mm0, mm4                                ; mm0=( 1 2 3 4)
    por         mm5, mm6                                ; mm5=( 3 4 5 6)

    movq        mm1, mm7
    movq        mm2, mm3
    psllq       mm1, 2*BYTE_BIT                         ; mm1=( - 0 1 2)
    psrlq       mm2, 2*BYTE_BIT                         ; mm2=( 5 6 7 -)
    movq        mm4, mm3
    psrlq       mm4, (SIZEOF_MMWORD-2)*BYTE_BIT         ; mm4=( 7 - - -)

    por         mm1, MMWORD [wk(0)]                     ; mm1=(-1 0 1 2)
    por         mm2, MMWORD [wk(2)]                     ; mm2=( 5 6 7 8)

    movq        MMWORD [wk(0)], mm4     ; word 7 is the next block's left neighbour

    pmullw      mm7, [GOTOFF(ebx,PW_THREE)]
    pmullw      mm3, [GOTOFF(ebx,PW_THREE)]
    paddw       mm1, [GOTOFF(ebx,PW_EIGHT)]
    paddw       mm5, [GOTOFF(ebx,PW_EIGHT)]
    paddw       mm0, [GOTOFF(ebx,PW_SEVEN)]
    paddw       mm2, [GOTOFF(ebx,PW_SEVEN)]

    paddw       mm1, mm7
    paddw       mm5, mm3
    psrlw       mm1, 4                  ; mm1=Out0LE=( 0  2  4  6)
    psrlw       mm5, 4                  ; mm5=Out0HE=( 8 10 12 14)
    paddw       mm0, mm7
    paddw       mm2, mm3
    psrlw       mm0, 4                  ; mm0=Out0LO=( 1  3  5  7)
    psrlw       mm2, 4                  ; mm2=Out0HO=( 9 11 13 15)

    ; Interleave: odd results go into the high byte of each word.
    psllw       mm0, BYTE_BIT
    psllw       mm2, BYTE_BIT
    por         mm1, mm0                ; mm1=Out0L=( 0  1  2  3  4  5  6  7)
    por         mm5, mm2                ; mm5=Out0H=( 8  9 10 11 12 13 14 15)

    movq        MMWORD [edx+0*SIZEOF_MMWORD], mm1
    movq        MMWORD [edx+1*SIZEOF_MMWORD], mm5

    ; -- process the lower row (horizontal pass on Int1)

    movq        mm6, MMWORD [edi+0*SIZEOF_MMWORD]       ; mm6=Int1L=( 0 1 2 3)
    movq        mm4, MMWORD [edi+1*SIZEOF_MMWORD]       ; mm4=Int1H=( 4 5 6 7)

    movq        mm7, mm6
    movq        mm3, mm4
    psrlq       mm7, 2*BYTE_BIT                         ; mm7=( 1 2 3 -)
    psllq       mm3, (SIZEOF_MMWORD-2)*BYTE_BIT         ; mm3=( - - - 4)
    movq        mm0, mm6
    movq        mm2, mm4
    psrlq       mm0, (SIZEOF_MMWORD-2)*BYTE_BIT         ; mm0=( 3 - - -)
    psllq       mm2, 2*BYTE_BIT                         ; mm2=( - 4 5 6)

    por         mm7, mm3                                ; mm7=( 1 2 3 4)
    por         mm0, mm2                                ; mm0=( 3 4 5 6)

    movq        mm1, mm6
    movq        mm5, mm4
    psllq       mm1, 2*BYTE_BIT                         ; mm1=( - 0 1 2)
    psrlq       mm5, 2*BYTE_BIT                         ; mm5=( 5 6 7 -)
    movq        mm3, mm4
    psrlq       mm3, (SIZEOF_MMWORD-2)*BYTE_BIT         ; mm3=( 7 - - -)

    por         mm1, MMWORD [wk(1)]                     ; mm1=(-1 0 1 2)
    por         mm5, MMWORD [wk(3)]                     ; mm5=( 5 6 7 8)

    movq        MMWORD [wk(1)], mm3     ; word 7 is the next block's left neighbour

    pmullw      mm6, [GOTOFF(ebx,PW_THREE)]
    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
    paddw       mm1, [GOTOFF(ebx,PW_EIGHT)]
    paddw       mm0, [GOTOFF(ebx,PW_EIGHT)]
    paddw       mm7, [GOTOFF(ebx,PW_SEVEN)]
    paddw       mm5, [GOTOFF(ebx,PW_SEVEN)]

    paddw       mm1, mm6
    paddw       mm0, mm4
    psrlw       mm1, 4                  ; mm1=Out1LE=( 0  2  4  6)
    psrlw       mm0, 4                  ; mm0=Out1HE=( 8 10 12 14)
    paddw       mm7, mm6
    paddw       mm5, mm4
    psrlw       mm7, 4                  ; mm7=Out1LO=( 1  3  5  7)
    psrlw       mm5, 4                  ; mm5=Out1HO=( 9 11 13 15)

    ; Interleave: odd results go into the high byte of each word.
    psllw       mm7, BYTE_BIT
    psllw       mm5, BYTE_BIT
    por         mm1, mm7                ; mm1=Out1L=( 0  1  2  3  4  5  6  7)
    por         mm0, mm5                ; mm0=Out1H=( 8  9 10 11 12 13 14 15)

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm1
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm0

    poppic      ebx

    sub         eax, byte SIZEOF_MMWORD
    add         ecx, byte 1*SIZEOF_MMWORD               ; inptr1(above)
    add         ebx, byte 1*SIZEOF_MMWORD               ; inptr0
    add         esi, byte 1*SIZEOF_MMWORD               ; inptr1(below)
    add         edx, byte 2*SIZEOF_MMWORD               ; outptr0
    add         edi, byte 2*SIZEOF_MMWORD               ; outptr1
    cmp         eax, byte SIZEOF_MMWORD
    ja          near .columnloop        ; more than one block left
    test        eax, eax
    jnz         near .columnloop_last   ; exactly one block left

    pop         esi
    pop         edi
    pop         ecx
    pop         eax

    ; One input row yields two output rows.
    add         esi, byte 1*SIZEOF_JSAMPROW             ; input_data
    add         edi, byte 2*SIZEOF_JSAMPROW             ; output_data
    sub         ecx, byte 2                             ; rowctr
    jg          near .rowloop

    emms                                ; empty MMX state

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    mov         esp, ebp                ; esp = aligned ebp
    pop         esp                     ; esp = original ebp
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
;                          JDIMENSION output_width,
;                          JSAMPARRAY input_data,
;                          JSAMPARRAY *output_data_ptr);
;
; Every input byte is written twice in a row (punpck?bw of a register with
; itself), so one input MMWORD yields two output MMWORDs.  The column loop is
; unrolled to handle two input MMWORDs per iteration.
;
; Register roles:
;   edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD
;   eax = colctr (output bytes left in the row), ecx = rowctr
;   esi = input_data / inptr, edi = output_data / outptr
;
; NOTE(review): because of the rounding, up to 2*SIZEOF_MMWORD-1 bytes beyond
; output_width are written per row; the row buffers are presumably padded
; accordingly -- confirm with caller.
;

%define max_v_samp(b)       (b)+8       ; int max_v_samp_factor
%define output_width(b)     (b)+12      ; JDIMENSION output_width
%define input_data(b)       (b)+16      ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b)+20      ; JSAMPARRAY *output_data_ptr

    align       16
    global      EXTN(jsimd_h2v1_upsample_mmx)

EXTN(jsimd_h2v1_upsample_mmx):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    ; Round the width up; the flags from the AND also catch a zero width.
    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (2*SIZEOF_MMWORD)-1
    and         edx, byte -(2*SIZEOF_MMWORD)
    jz          short .return

    mov         ecx, INT [max_v_samp(ebp)]              ; ecx = rowctr
    test        ecx, ecx
    jz          short .return

    mov         esi, JSAMPARRAY [input_data(ebp)]       ; esi = input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]                   ; edi = output_data
    alignx      16, 7
.rowloop:
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; esi = inptr
    mov         edi, JSAMPROW [edi]     ; edi = outptr
    mov         eax, edx                ; eax = colctr
    alignx      16, 7
.columnloop:

    ; First input MMWORD of this iteration.
    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]

    movq        mm1, mm0
    punpcklbw   mm0, mm0                ; low four bytes, each doubled
    punpckhbw   mm1, mm1                ; high four bytes, each doubled

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm1

    sub         eax, byte 2*SIZEOF_MMWORD
    jz          short .nextrow

    ; Second input MMWORD of this iteration.
    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]

    movq        mm3, mm2
    punpcklbw   mm2, mm2
    punpckhbw   mm3, mm3

    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm3

    sub         eax, byte 2*SIZEOF_MMWORD
    jz          short .nextrow

    add         esi, byte 2*SIZEOF_MMWORD               ; inptr
    add         edi, byte 4*SIZEOF_MMWORD               ; outptr
    jmp         short .columnloop
    alignx      16, 7

.nextrow:
    pop         esi
    pop         edi

    add         esi, byte SIZEOF_JSAMPROW               ; input_data
    add         edi, byte SIZEOF_JSAMPROW               ; output_data
    dec         ecx                                     ; rowctr
    jg          short .rowloop

    emms                                ; empty MMX state

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
;   pop         ebx                     ; unused
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
;                          JDIMENSION output_width,
;                          JSAMPARRAY input_data,
;                          JSAMPARRAY *output_data_ptr);
;
; Same byte doubling as the h2v1 routine, but each result is stored to two
; consecutive output rows.  The row counter therefore drops by two and
; output_data advances by two rows for every input row.
;
; Register roles:
;   edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD
;   eax = colctr, ecx = rowctr, esi = input_data / inptr
;   ebx = outptr0, edi = output_data / outptr1
;

%define max_v_samp(b)       (b)+8       ; int max_v_samp_factor
%define output_width(b)     (b)+12      ; JDIMENSION output_width
%define input_data(b)       (b)+16      ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b)+20      ; JSAMPARRAY *output_data_ptr

    align       16
    global      EXTN(jsimd_h2v2_upsample_mmx)

EXTN(jsimd_h2v2_upsample_mmx):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    ; Round the width up; the flags from the AND also catch a zero width.
    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (2*SIZEOF_MMWORD)-1
    and         edx, byte -(2*SIZEOF_MMWORD)
    jz          near .return

    mov         ecx, INT [max_v_samp(ebp)]              ; ecx = rowctr
    test        ecx, ecx
    jz          short .return

    mov         esi, JSAMPARRAY [input_data(ebp)]       ; esi = input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]                   ; edi = output_data
    alignx      16, 7
.rowloop:
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]                     ; esi = inptr
    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; ebx = outptr0
    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; edi = outptr1
    mov         eax, edx                                ; eax = colctr
    alignx      16, 7
.columnloop:

    ; First input MMWORD of this iteration.
    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]

    movq        mm1, mm0
    punpcklbw   mm0, mm0                ; low four bytes, each doubled
    punpckhbw   mm1, mm1                ; high four bytes, each doubled

    movq        MMWORD [ebx+0*SIZEOF_MMWORD], mm0
    movq        MMWORD [ebx+1*SIZEOF_MMWORD], mm1
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm1

    sub         eax, byte 2*SIZEOF_MMWORD
    jz          short .nextrow

    ; Second input MMWORD of this iteration.
    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]

    movq        mm3, mm2
    punpcklbw   mm2, mm2
    punpckhbw   mm3, mm3

    movq        MMWORD [ebx+2*SIZEOF_MMWORD], mm2
    movq        MMWORD [ebx+3*SIZEOF_MMWORD], mm3
    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm3

    sub         eax, byte 2*SIZEOF_MMWORD
    jz          short .nextrow

    add         esi, byte 2*SIZEOF_MMWORD               ; inptr
    add         ebx, byte 4*SIZEOF_MMWORD               ; outptr0
    add         edi, byte 4*SIZEOF_MMWORD               ; outptr1
    jmp         short .columnloop
    alignx      16, 7

.nextrow:
    pop         esi
    pop         edi

    add         esi, byte 1*SIZEOF_JSAMPROW             ; input_data
    add         edi, byte 2*SIZEOF_JSAMPROW             ; output_data
    sub         ecx, byte 2                             ; rowctr
    jg          short .rowloop

    emms                                ; empty MMX state

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       16