;
; jdsample.asm - upsampling (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
; Word constants used by the two "fancy" (triangle filter) routines.
; Each entry is one XMM register's worth of identical 16-bit values.

    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_fancy_upsample_sse2)

EXTN(jconst_fancy_upsample_sse2):

PW_ONE          times 8 dw 1            ; h2v1 rounding term, even outputs
PW_TWO          times 8 dw 2            ; h2v1 rounding term, odd outputs
PW_THREE        times 8 dw 3            ; weight of the nearer sample
PW_SEVEN        times 8 dw 7            ; h2v2 rounding term, odd outputs
PW_EIGHT        times 8 dw 8            ; h2v2 rounding term, even outputs

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Triangle-filter ("fancy") upsampling, 2:1 horizontally, 1:1 vertically.
;
; Output sample centers lie 1/4 and 3/4 of the way between input sample
; centers, so every input sample s[i] produces two outputs:
;
;     out[2*i]   = (3 * s[i] + s[i-1] + 1) >> 2
;     out[2*i+1] = (3 * s[i] + s[i+1] + 2) >> 2
;
; At the left edge s[-1] is replaced by s[0]; at the right edge the sample
; past the end is replaced by the last sample of the row.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; Things a caller has to provide (all visible in the code below):
;   * Rows are read and written with movdqa, i.e. in whole, aligned
;     XMM-sized blocks; the column count is rounded up to a whole block.
;   * When downsampled_width is not a multiple of SIZEOF_XMMWORD, one byte
;     is WRITTEN to the input row at index downsampled_width.
;   * eax, ecx, edx and xmm0-xmm7 are used and not restored.
;
; Lane comments list sample indices relative to the current block, lowest
; lane first.

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)

EXTN(jsimd_h2v1_fancy_upsample_sse2):
    push        ebp
    mov         ebp, esp
    pushpic     ebx
    ; ecx and edx are deliberately not saved.
    push        esi
    push        edi

    get_GOT     ebx                     ; ebx = GOT address, base of GOTOFF()

    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; eax = column counter
    test        eax, eax
    jz          near .done              ; zero width: nothing to do

    mov         ecx, INT [max_v_samp(ebp)]  ; ecx = row counter
    test        ecx, ecx
    jz          near .done              ; zero rows: nothing to do

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; esi -> input row table
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]   ; edi -> output row table
    alignx      16, 7
.row_loop:
    push        eax                     ; per-row state, restored after the row
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; esi = current input row
    mov         edi, JSAMPROW [edi]     ; edi = current output row

    test        eax, SIZEOF_XMMWORD-1
    jz          short .no_pad           ; width is a whole number of blocks
    ; Copy the last real sample one position past the end of the row, so
    ; that it also serves as its own right-hand neighbour.
    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl
.no_pad:
    pxor        xmm0, xmm0              ; xmm0 = 0, used to widen bytes to words
    pcmpeqb     xmm7, xmm7
    psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; mask: lowest byte only
    pand        xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; xmm7 = s[0] standing in for s[-1]

    add         eax, byte SIZEOF_XMMWORD-1
    and         eax, byte -SIZEOF_XMMWORD  ; round the column count up to whole blocks
    cmp         eax, byte SIZEOF_XMMWORD
    ja          short .col_next         ; more than one block in this row
    alignx      16, 7

.col_last:
    ; Final block of the row: there is no following block to borrow from,
    ; so the top sample of this block doubles as "sample 16".
    pcmpeqb     xmm6, xmm6
    pslldq      xmm6, (SIZEOF_XMMWORD-1)  ; mask: highest byte only
    pand        xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
    jmp         short .filter
    alignx      16, 7

.col_next:
    ; "Sample 16" is the first sample of the following block.
    movdqa      xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
    pslldq      xmm6, (SIZEOF_XMMWORD-1)  ; move it up into the highest byte

.filter:
    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, xmm1
    movdqa      xmm3, xmm1              ; xmm1 = samples  0..15
    pslldq      xmm2, 1                 ; xmm2 = samples  0..14 in lanes 1..15
    psrldq      xmm3, 1                 ; xmm3 = samples  1..15 in lanes 0..14

    por         xmm2, xmm7              ; xmm2 = samples -1..14
    por         xmm3, xmm6              ; xmm3 = samples  1..16

    movdqa      xmm7, xmm1
    psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; xmm7 = sample 15 in lane 0 ("-1" of the next block)

    movdqa      xmm4, xmm1
    punpcklbw   xmm1, xmm0              ; xmm1 = words, samples  0.. 7
    punpckhbw   xmm4, xmm0              ; xmm4 = words, samples  8..15
    movdqa      xmm5, xmm2
    punpcklbw   xmm2, xmm0              ; xmm2 = words, samples -1.. 6
    punpckhbw   xmm5, xmm0              ; xmm5 = words, samples  7..14
    movdqa      xmm6, xmm3
    punpcklbw   xmm3, xmm0              ; xmm3 = words, samples  1.. 8
    punpckhbw   xmm6, xmm0              ; xmm6 = words, samples  9..16

    pmullw      xmm1, [GOTOFF(ebx,PW_THREE)]  ; 3 * s[i]
    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]
    paddw       xmm2, [GOTOFF(ebx,PW_ONE)]    ; s[i-1] + 1
    paddw       xmm5, [GOTOFF(ebx,PW_ONE)]
    paddw       xmm3, [GOTOFF(ebx,PW_TWO)]    ; s[i+1] + 2
    paddw       xmm6, [GOTOFF(ebx,PW_TWO)]

    paddw       xmm2, xmm1
    paddw       xmm5, xmm4
    psrlw       xmm2, 2                 ; xmm2 = even outputs  0, 2, ..., 14
    psrlw       xmm5, 2                 ; xmm5 = even outputs 16, 18, ..., 30
    paddw       xmm3, xmm1
    paddw       xmm6, xmm4
    psrlw       xmm3, 2                 ; xmm3 = odd outputs   1, 3, ..., 15
    psrlw       xmm6, 2                 ; xmm6 = odd outputs  17, 19, ..., 31

    psllw       xmm3, BYTE_BIT          ; odd outputs move into the upper byte
    psllw       xmm6, BYTE_BIT
    por         xmm2, xmm3              ; xmm2 = output bytes  0..15
    por         xmm5, xmm6              ; xmm5 = output bytes 16..31

    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5

    sub         eax, byte SIZEOF_XMMWORD
    add         esi, byte 1*SIZEOF_XMMWORD  ; input advances one block
    add         edi, byte 2*SIZEOF_XMMWORD  ; output advances two blocks
    cmp         eax, byte SIZEOF_XMMWORD
    ja          near .col_next          ; two or more blocks still to do
    test        eax, eax
    jnz         near .col_last          ; exactly one block still to do

    pop         esi
    pop         edi
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW  ; next input row pointer
    add         edi, byte SIZEOF_JSAMPROW  ; next output row pointer
    dec         ecx
    jg          near .row_loop

.done:
    pop         edi
    pop         esi
    poppic      ebx
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Triangle-filter ("fancy") upsampling, 2:1 horizontally, 2:1 vertically.
;
; Every input row yields two output rows.  A vertical pass first builds two
; rows of 16-bit intermediates from the current row and its neighbours:
;
;     Int0[i] = 3 * row[0][i] + row[-1][i]      (upper output row)
;     Int1[i] = 3 * row[0][i] + row[+1][i]      (lower output row)
;
; and a horizontal pass then turns each intermediate row into output bytes:
;
;     out[2*i]   = (3 * Int[i] + Int[i-1] + 8) >> 4
;     out[2*i+1] = (3 * Int[i] + Int[i+1] + 7) >> 4
;
; The intermediates are parked in the output rows themselves and later
; overwritten in place by the final bytes.  Edge handling follows the h2v1
; routine: the first/last intermediate stands in for its missing neighbour.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; Things a caller has to provide (all visible in the code below):
;   * The row pointers at input_data[-1] and input_data[+1] are read for
;     every input row processed.
;   * Rows are accessed with movdqa in whole XMM-sized blocks.
;   * When downsampled_width is not a multiple of SIZEOF_XMMWORD, one byte
;     is WRITTEN at index downsampled_width of all three input rows.
;   * eax, ecx, edx and xmm0-xmm7 are used and not restored.
;
; Stack frame: ebp is aligned down to SIZEOF_XMMWORD and [ebp] holds the
; unaligned frame base ("original ebp").  wk(0)..wk(3) sit just below ebp:
;     wk(0), wk(1)   Int0/Int1 value standing in for index -1
;     wk(2), wk(3)   Int0/Int1 value standing in for index 16
; gotptr is the pointer-sized slot immediately below wk(0).
;
; ebx doubles as the inptr0 register, so around every use of GOTOFF() it
; is swapped with the GOT address saved in gotptr (pushpic/movpic/poppic).

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                        ; xmmword wk[WK_NUM]
%define WK_NUM        4
%define gotptr        wk(0) - SIZEOF_POINTER  ; void *gotptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)

EXTN(jsimd_h2v2_fancy_upsample_sse2):
    push        ebp
    mov         eax, esp                ; eax = unaligned frame base (original ebp)
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)  ; align down to an XMM boundary
    mov         [esp], eax              ; remember the unaligned base
    mov         ebp, esp                ; ebp = aligned frame base
    lea         esp, [wk(0)]            ; reserve the wk[] scratch area
    pushpic     eax                     ; reserve the gotptr slot
    push        ebx
    ; ecx and edx are deliberately not saved.
    push        esi
    push        edi

    get_GOT     ebx                     ; ebx = GOT address
    movpic      POINTER [gotptr], ebx   ; keep it for later; ebx gets reused

    mov         edx, eax                ; edx = original ebp, arguments hang off it
    mov         eax, JDIMENSION [downsamp_width(edx)]  ; eax = column counter
    test        eax, eax
    jz          near .done              ; zero width: nothing to do

    mov         ecx, INT [max_v_samp(edx)]  ; ecx = row counter
    test        ecx, ecx
    jz          near .done              ; zero rows: nothing to do

    mov         esi, JSAMPARRAY [input_data(edx)]  ; esi -> input row table
    mov         edi, POINTER [output_data_ptr(edx)]
    mov         edi, JSAMPARRAY [edi]   ; edi -> output row table
    alignx      16, 7
.row_loop:
    push        eax                     ; per-row state, restored after the row
    push        ecx
    push        edi
    push        esi

    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]  ; ecx = input row above
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; ebx = current input row
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; esi = input row below
    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; edx = upper output row
    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; edi = lower output row

    test        eax, SIZEOF_XMMWORD-1
    jz          short .no_pad           ; width is a whole number of blocks
    ; Duplicate the last real sample of each of the three input rows one
    ; position past the end.  dl is the scratch byte, so edx (the upper
    ; output row) is parked on the stack meanwhile.
    push        edx
    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl
    pop         edx
.no_pad:
    ; ---- vertical pass over the first block of the row

    movdqa      xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD]  ; xmm0 = current row, block 0
    movdqa      xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD]  ; xmm1 = row above,   block 0
    movdqa      xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; xmm2 = row below,   block 0

    pushpic     ebx
    movpic      ebx, POINTER [gotptr]   ; ebx = GOT address

    pxor        xmm3, xmm3              ; xmm3 = 0, used to widen bytes to words
    movdqa      xmm4, xmm0
    punpcklbw   xmm0, xmm3              ; xmm0 = current row, words 0.. 7
    punpckhbw   xmm4, xmm3              ; xmm4 = current row, words 8..15
    movdqa      xmm5, xmm1
    punpcklbw   xmm1, xmm3              ; xmm1 = row above,   words 0.. 7
    punpckhbw   xmm5, xmm3              ; xmm5 = row above,   words 8..15
    movdqa      xmm6, xmm2
    punpcklbw   xmm2, xmm3              ; xmm2 = row below,   words 0.. 7
    punpckhbw   xmm6, xmm3              ; xmm6 = row below,   words 8..15

    pmullw      xmm0, [GOTOFF(ebx,PW_THREE)]  ; 3 * current row
    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]

    pcmpeqb     xmm7, xmm7
    psrldq      xmm7, (SIZEOF_XMMWORD-2)  ; mask: lowest word only

    paddw       xmm1, xmm0              ; xmm1 = Int0, words 0.. 7
    paddw       xmm5, xmm4              ; xmm5 = Int0, words 8..15
    paddw       xmm2, xmm0              ; xmm2 = Int1, words 0.. 7
    paddw       xmm6, xmm4              ; xmm6 = Int1, words 8..15

    ; Park the intermediates in the output rows.
    movdqa      XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6

    pand        xmm1, xmm7              ; xmm1 = Int0[0] alone in the lowest word
    pand        xmm2, xmm7              ; xmm2 = Int1[0] alone in the lowest word

    movdqa      XMMWORD [wk(0)], xmm1   ; left-edge stand-ins for index -1
    movdqa      XMMWORD [wk(1)], xmm2

    poppic      ebx

    add         eax, byte SIZEOF_XMMWORD-1
    and         eax, byte -SIZEOF_XMMWORD  ; round the column count up to whole blocks
    cmp         eax, byte SIZEOF_XMMWORD
    ja          short .col_next         ; more than one block in this row
    alignx      16, 7

.col_last:
    ; ---- final block: word 15 of this block stands in for index 16

    pushpic     ebx
    movpic      ebx, POINTER [gotptr]   ; ebx = GOT address

    pcmpeqb     xmm1, xmm1
    pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; mask: highest word only
    movdqa      xmm2, xmm1

    pand        xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]  ; Int0[15] in the highest word
    pand        xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]  ; Int1[15] in the highest word

    movdqa      XMMWORD [wk(2)], xmm1
    movdqa      XMMWORD [wk(3)], xmm2

    jmp         near .filter
    alignx      16, 7

.col_next:
    ; ---- vertical pass over the following block, one block ahead of the
    ;      horizontal pass so that index 16 is available

    movdqa      xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD]  ; xmm0 = current row, block 1
    movdqa      xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD]  ; xmm1 = row above,   block 1
    movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]  ; xmm2 = row below,   block 1

    pushpic     ebx
    movpic      ebx, POINTER [gotptr]   ; ebx = GOT address

    pxor        xmm3, xmm3              ; xmm3 = 0, used to widen bytes to words
    movdqa      xmm4, xmm0
    punpcklbw   xmm0, xmm3              ; xmm0 = current row, words 0.. 7
    punpckhbw   xmm4, xmm3              ; xmm4 = current row, words 8..15
    movdqa      xmm5, xmm1
    punpcklbw   xmm1, xmm3              ; xmm1 = row above,   words 0.. 7
    punpckhbw   xmm5, xmm3              ; xmm5 = row above,   words 8..15
    movdqa      xmm6, xmm2
    punpcklbw   xmm2, xmm3              ; xmm2 = row below,   words 0.. 7
    punpckhbw   xmm6, xmm3              ; xmm6 = row below,   words 8..15

    pmullw      xmm0, [GOTOFF(ebx,PW_THREE)]  ; 3 * current row
    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]

    paddw       xmm1, xmm0              ; xmm1 = Int0, words 0.. 7
    paddw       xmm5, xmm4              ; xmm5 = Int0, words 8..15
    paddw       xmm2, xmm0              ; xmm2 = Int1, words 0.. 7
    paddw       xmm6, xmm4              ; xmm6 = Int1, words 8..15

    ; Park them one block further along in the output rows.
    movdqa      XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5
    movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6

    pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; xmm1 = its word 0 moved to the highest word
    pslldq      xmm2, (SIZEOF_XMMWORD-2)  ; xmm2 = its word 0 moved to the highest word

    movdqa      XMMWORD [wk(2)], xmm1   ; "index 16" for the block being filtered
    movdqa      XMMWORD [wk(3)], xmm2

.filter:
    ; ---- horizontal pass, upper output row (Int0)

    movdqa      xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]  ; xmm7 = Int0 words 0.. 7
    movdqa      xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]  ; xmm3 = Int0 words 8..15

    movdqa      xmm0, xmm7
    movdqa      xmm4, xmm3
    psrldq      xmm0, 2                   ; xmm0 = 1..7, highest word empty
    pslldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4 = 8 in the highest word
    movdqa      xmm5, xmm7
    movdqa      xmm6, xmm3
    psrldq      xmm5, (SIZEOF_XMMWORD-2)  ; xmm5 = 7 in the lowest word
    pslldq      xmm6, 2                   ; xmm6 = 8..14, lowest word empty

    por         xmm0, xmm4                ; xmm0 = 1.. 8
    por         xmm5, xmm6                ; xmm5 = 7..14

    movdqa      xmm1, xmm7
    movdqa      xmm2, xmm3
    pslldq      xmm1, 2                   ; xmm1 = 0..6, lowest word empty
    psrldq      xmm2, 2                   ; xmm2 = 9..15, highest word empty
    movdqa      xmm4, xmm3
    psrldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4 = 15 in the lowest word

    por         xmm1, XMMWORD [wk(0)]     ; xmm1 = -1.. 6
    por         xmm2, XMMWORD [wk(2)]     ; xmm2 =  9..16

    movdqa      XMMWORD [wk(0)], xmm4     ; becomes index -1 of the next block

    pmullw      xmm7, [GOTOFF(ebx,PW_THREE)]  ; 3 * Int0[i]
    pmullw      xmm3, [GOTOFF(ebx,PW_THREE)]
    paddw       xmm1, [GOTOFF(ebx,PW_EIGHT)]  ; Int0[i-1] + 8
    paddw       xmm5, [GOTOFF(ebx,PW_EIGHT)]
    paddw       xmm0, [GOTOFF(ebx,PW_SEVEN)]  ; Int0[i+1] + 7
    paddw       xmm2, [GOTOFF(ebx,PW_SEVEN)]

    paddw       xmm1, xmm7
    paddw       xmm5, xmm3
    psrlw       xmm1, 4                 ; xmm1 = even outputs  0, 2, ..., 14
    psrlw       xmm5, 4                 ; xmm5 = even outputs 16, 18, ..., 30
    paddw       xmm0, xmm7
    paddw       xmm2, xmm3
    psrlw       xmm0, 4                 ; xmm0 = odd outputs   1, 3, ..., 15
    psrlw       xmm2, 4                 ; xmm2 = odd outputs  17, 19, ..., 31

    psllw       xmm0, BYTE_BIT          ; odd outputs move into the upper byte
    psllw       xmm2, BYTE_BIT
    por         xmm1, xmm0              ; xmm1 = output bytes  0..15
    por         xmm5, xmm2              ; xmm5 = output bytes 16..31

    ; Final bytes replace the parked intermediates.
    movdqa      XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5

    ; ---- horizontal pass, lower output row (Int1)

    movdqa      xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]  ; xmm6 = Int1 words 0.. 7
    movdqa      xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]  ; xmm4 = Int1 words 8..15

    movdqa      xmm7, xmm6
    movdqa      xmm3, xmm4
    psrldq      xmm7, 2                   ; xmm7 = 1..7, highest word empty
    pslldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3 = 8 in the highest word
    movdqa      xmm0, xmm6
    movdqa      xmm2, xmm4
    psrldq      xmm0, (SIZEOF_XMMWORD-2)  ; xmm0 = 7 in the lowest word
    pslldq      xmm2, 2                   ; xmm2 = 8..14, lowest word empty

    por         xmm7, xmm3                ; xmm7 = 1.. 8
    por         xmm0, xmm2                ; xmm0 = 7..14

    movdqa      xmm1, xmm6
    movdqa      xmm5, xmm4
    pslldq      xmm1, 2                   ; xmm1 = 0..6, lowest word empty
    psrldq      xmm5, 2                   ; xmm5 = 9..15, highest word empty
    movdqa      xmm3, xmm4
    psrldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3 = 15 in the lowest word

    por         xmm1, XMMWORD [wk(1)]     ; xmm1 = -1.. 6
    por         xmm5, XMMWORD [wk(3)]     ; xmm5 =  9..16

    movdqa      XMMWORD [wk(1)], xmm3     ; becomes index -1 of the next block

    pmullw      xmm6, [GOTOFF(ebx,PW_THREE)]  ; 3 * Int1[i]
    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]
    paddw       xmm1, [GOTOFF(ebx,PW_EIGHT)]  ; Int1[i-1] + 8
    paddw       xmm0, [GOTOFF(ebx,PW_EIGHT)]
    paddw       xmm7, [GOTOFF(ebx,PW_SEVEN)]  ; Int1[i+1] + 7
    paddw       xmm5, [GOTOFF(ebx,PW_SEVEN)]

    paddw       xmm1, xmm6
    paddw       xmm0, xmm4
    psrlw       xmm1, 4                 ; xmm1 = even outputs  0, 2, ..., 14
    psrlw       xmm0, 4                 ; xmm0 = even outputs 16, 18, ..., 30
    paddw       xmm7, xmm6
    paddw       xmm5, xmm4
    psrlw       xmm7, 4                 ; xmm7 = odd outputs   1, 3, ..., 15
    psrlw       xmm5, 4                 ; xmm5 = odd outputs  17, 19, ..., 31

    psllw       xmm7, BYTE_BIT          ; odd outputs move into the upper byte
    psllw       xmm5, BYTE_BIT
    por         xmm1, xmm7              ; xmm1 = output bytes  0..15
    por         xmm0, xmm5              ; xmm0 = output bytes 16..31

    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0

    poppic      ebx                     ; ebx = current input row again

    sub         eax, byte SIZEOF_XMMWORD
    add         ecx, byte 1*SIZEOF_XMMWORD  ; row above advances one block
    add         ebx, byte 1*SIZEOF_XMMWORD  ; current row advances one block
    add         esi, byte 1*SIZEOF_XMMWORD  ; row below advances one block
    add         edx, byte 2*SIZEOF_XMMWORD  ; upper output advances two blocks
    add         edi, byte 2*SIZEOF_XMMWORD  ; lower output advances two blocks
    cmp         eax, byte SIZEOF_XMMWORD
    ja          near .col_next          ; two or more blocks still to do
    test        eax, eax
    jnz         near .col_last          ; exactly one block still to do

    pop         esi
    pop         edi
    pop         ecx
    pop         eax

    add         esi, byte 1*SIZEOF_JSAMPROW  ; one input row consumed
    add         edi, byte 2*SIZEOF_JSAMPROW  ; two output rows produced
    sub         ecx, byte 2                  ; row counter drops by two
    jg          near .row_loop

.done:
    pop         edi
    pop         esi
    pop         ebx
    mov         esp, ebp                ; esp = aligned frame base
    pop         esp                     ; esp = original ebp (stored at [ebp])
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Plain (box filter) upsampling, 2:1 horizontally, 1:1 vertically:
; every input sample is simply written out twice.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; output_width is rounded up to a multiple of 2*SIZEOF_XMMWORD, so whole
; XMM-sized blocks are read and written (with movdqa) even when that runs
; past output_width.  The column loop is unrolled twice, with an exit test
; after each half.  eax, ecx, edx and xmm0-xmm3 are used and not restored;
; ebx is not touched.

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define output_width(b)     (b) + 12    ; JDIMENSION output_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)

EXTN(jsimd_h2v1_upsample_sse2):
    push        ebp
    mov         ebp, esp
    ; ebx is unused here; ecx and edx are deliberately not saved.
    push        esi
    push        edi

    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (2*SIZEOF_XMMWORD)-1
    and         edx, byte -(2*SIZEOF_XMMWORD)  ; edx = padded output width
    jz          short .done             ; zero width: nothing to do

    mov         ecx, INT [max_v_samp(ebp)]  ; ecx = row counter
    test        ecx, ecx
    jz          short .done             ; zero rows: nothing to do

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; esi -> input row table
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]   ; edi -> output row table
    alignx      16, 7
.row_loop:
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; esi = current input row
    mov         edi, JSAMPROW [edi]     ; edi = current output row
    mov         eax, edx                ; eax = output bytes left in this row
    alignx      16, 7
.col_loop:

    ; first half: input block 0 -> output blocks 0 and 1
    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]

    movdqa      xmm1, xmm0
    punpcklbw   xmm0, xmm0              ; low 8 bytes, each doubled
    punpckhbw   xmm1, xmm1              ; high 8 bytes, each doubled

    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1

    sub         eax, byte 2*SIZEOF_XMMWORD
    jz          short .row_done

    ; second half: input block 1 -> output blocks 2 and 3
    movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]

    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm2              ; low 8 bytes, each doubled
    punpckhbw   xmm3, xmm3              ; high 8 bytes, each doubled

    movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3

    sub         eax, byte 2*SIZEOF_XMMWORD
    jz          short .row_done

    add         esi, byte 2*SIZEOF_XMMWORD  ; input advances two blocks
    add         edi, byte 4*SIZEOF_XMMWORD  ; output advances four blocks
    jmp         short .col_loop
    alignx      16, 7

.row_done:
    pop         esi
    pop         edi

    add         esi, byte SIZEOF_JSAMPROW  ; next input row pointer
    add         edi, byte SIZEOF_JSAMPROW  ; next output row pointer
    dec         ecx
    jg          short .row_loop

.done:
    pop         edi
    pop         esi
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Plain (box filter) upsampling, 2:1 horizontally, 2:1 vertically:
; every input sample is written twice across and the resulting row is
; stored into two consecutive output rows.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Same width rounding, movdqa block access and two-way unrolling as the
; h2v1 variant.  One input row is consumed per iteration while the row
; counter drops by two.  eax, ecx, edx and xmm0-xmm3 are used and not
; restored; ebx is saved because it serves as the first output row pointer.

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define output_width(b)     (b) + 12    ; JDIMENSION output_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)

EXTN(jsimd_h2v2_upsample_sse2):
    push        ebp
    mov         ebp, esp
    push        ebx
    ; ecx and edx are deliberately not saved.
    push        esi
    push        edi

    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (2*SIZEOF_XMMWORD)-1
    and         edx, byte -(2*SIZEOF_XMMWORD)  ; edx = padded output width
    jz          near .done              ; zero width: nothing to do

    mov         ecx, INT [max_v_samp(ebp)]  ; ecx = row counter
    test        ecx, ecx
    jz          near .done              ; zero rows: nothing to do

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; esi -> input row table
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]   ; edi -> output row table
    alignx      16, 7
.row_loop:
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; esi = current input row
    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; ebx = first output row
    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; edi = second output row
    mov         eax, edx                ; eax = output bytes left in this row
    alignx      16, 7
.col_loop:

    ; first half: input block 0 -> output blocks 0 and 1 of both rows
    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]

    movdqa      xmm1, xmm0
    punpcklbw   xmm0, xmm0              ; low 8 bytes, each doubled
    punpckhbw   xmm1, xmm1              ; high 8 bytes, each doubled

    movdqa      XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1

    sub         eax, byte 2*SIZEOF_XMMWORD
    jz          short .row_done

    ; second half: input block 1 -> output blocks 2 and 3 of both rows
    movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]

    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm2              ; low 8 bytes, each doubled
    punpckhbw   xmm3, xmm3              ; high 8 bytes, each doubled

    movdqa      XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
    movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3

    sub         eax, byte 2*SIZEOF_XMMWORD
    jz          short .row_done

    add         esi, byte 2*SIZEOF_XMMWORD  ; input advances two blocks
    add         ebx, byte 4*SIZEOF_XMMWORD  ; first output row advances four blocks
    add         edi, byte 4*SIZEOF_XMMWORD  ; second output row advances four blocks
    jmp         short .col_loop
    alignx      16, 7

.row_done:
    pop         esi
    pop         edi

    add         esi, byte 1*SIZEOF_JSAMPROW  ; one input row consumed
    add         edi, byte 2*SIZEOF_JSAMPROW  ; two output rows produced
    sub         ecx, byte 2                  ; row counter drops by two
    jg          short .row_loop

.done:
    pop         edi
    pop         esi
    pop         ebx
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32