;
; jdsample.asm - upsampling (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2009 D. R. Commander
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
        SECTION SEG_CONST

        alignz  16
        global  EXTN(jconst_fancy_upsample_sse2)

EXTN(jconst_fancy_upsample_sse2):

PW_ONE          times 8 dw 1
PW_TWO          times 8 dw 2
PW_THREE        times 8 dw 3
PW_SEVEN        times 8 dw 7
PW_EIGHT        times 8 dw 8

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    64
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter". This is a good compromise between
; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers. (The exact per-sample arithmetic
; is noted in a comment inside the column loop below.)
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
;                                 JDIMENSION downsampled_width,
;                                 JSAMPARRAY input_data,
;                                 JSAMPARRAY * output_data_ptr);
;

; r10 = int max_v_samp_factor
; r11 = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v1_fancy_upsample_sse2)

EXTN(jsimd_h2v1_fancy_upsample_sse2):
        push    rbp
        mov     rax,rsp
        mov     rbp,rsp
        collect_args

        mov     eax, r11d               ; colctr
        test    rax,rax
        jz      near .return

        mov     rcx, r10                ; rowctr
        test    rcx,rcx
        jz      near .return

        mov     rsi, r12                ; input_data
        mov     rdi, r13
        mov     rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
        push    rax                     ; colctr
        push    rdi
        push    rsi

        mov     rsi, JSAMPROW [rsi]     ; inptr
        mov     rdi, JSAMPROW [rdi]     ; outptr

        test    rax, SIZEOF_XMMWORD-1
        jz      short .skip
        mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
.skip:
        pxor    xmm0,xmm0               ; xmm0=(all 0's)
        pcmpeqb xmm7,xmm7
        psrldq  xmm7,(SIZEOF_XMMWORD-1)
        pand    xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]

        add     rax, byte SIZEOF_XMMWORD-1
        and     rax, byte -SIZEOF_XMMWORD
        cmp     rax, byte SIZEOF_XMMWORD
        ja      short .columnloop

.columnloop_last:
        pcmpeqb xmm6,xmm6
        pslldq  xmm6,(SIZEOF_XMMWORD-1)
        pand    xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        jmp     short .upsample

.columnloop:
        movdqa  xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        pslldq  xmm6,(SIZEOF_XMMWORD-1)

.upsample:
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm2,xmm1
        movdqa  xmm3,xmm1               ; xmm1=( 0  1  2 ... 13 14 15)
        pslldq  xmm2,1                  ; xmm2=(--  0  1 ... 12 13 14)
        psrldq  xmm3,1                  ; xmm3=( 1  2  3 ... 14 15 --)

        por     xmm2,xmm7               ; xmm2=(-1  0  1 ... 12 13 14)
        por     xmm3,xmm6               ; xmm3=( 1  2  3 ... 14 15 16)

        movdqa  xmm7,xmm1
        psrldq  xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
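        ; At this point xmm1 holds the 16 current samples, xmm2 each sample's
        ; left neighbor and xmm3 each sample's right neighbor (xmm7 keeps
        ; sample 15 as the left neighbor for the next column block).  The code
        ; below widens them to words and evaluates the triangle filter
        ; described above; in scalar form (illustrative sketch only):
        ;
        ;   out[2*i]   = (3*s[i] + s[i-1] + 1) >> 2
        ;   out[2*i+1] = (3*s[i] + s[i+1] + 2) >> 2
        ;
        ; which is what the PW_ONE/PW_TWO/PW_THREE constants and the final
        ; psrlw by 2 implement.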

        movdqa  xmm4,xmm1
        punpcklbw xmm1,xmm0             ; xmm1=( 0  1  2  3  4  5  6  7)
        punpckhbw xmm4,xmm0             ; xmm4=( 8  9 10 11 12 13 14 15)
        movdqa  xmm5,xmm2
        punpcklbw xmm2,xmm0             ; xmm2=(-1  0  1  2  3  4  5  6)
        punpckhbw xmm5,xmm0             ; xmm5=( 7  8  9 10 11 12 13 14)
        movdqa  xmm6,xmm3
        punpcklbw xmm3,xmm0             ; xmm3=( 1  2  3  4  5  6  7  8)
        punpckhbw xmm6,xmm0             ; xmm6=( 9 10 11 12 13 14 15 16)

        pmullw  xmm1,[rel PW_THREE]
        pmullw  xmm4,[rel PW_THREE]
        paddw   xmm2,[rel PW_ONE]
        paddw   xmm5,[rel PW_ONE]
        paddw   xmm3,[rel PW_TWO]
        paddw   xmm6,[rel PW_TWO]

        paddw   xmm2,xmm1
        paddw   xmm5,xmm4
        psrlw   xmm2,2                  ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
        psrlw   xmm5,2                  ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
        paddw   xmm3,xmm1
        paddw   xmm6,xmm4
        psrlw   xmm3,2                  ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
        psrlw   xmm6,2                  ; xmm6=OutHO=(17 19 21 23 25 27 29 31)

        psllw   xmm3,BYTE_BIT
        psllw   xmm6,BYTE_BIT
        por     xmm2,xmm3               ; xmm2=OutL=( 0  1  2 ... 13 14 15)
        por     xmm5,xmm6               ; xmm5=OutH=(16 17 18 ... 29 30 31)

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5

        sub     rax, byte SIZEOF_XMMWORD
        add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr
        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
        cmp     rax, byte SIZEOF_XMMWORD
        ja      near .columnloop
        test    eax,eax
        jnz     near .columnloop_last

        pop     rsi
        pop     rdi
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
        dec     rcx                             ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
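;
; For each pair of output rows the code below first forms an intermediate row
; in 16-bit precision, Int = 3*row[0] + row[-1] for the upper output row and
; Int = 3*row[0] + row[+1] for the lower one, and then applies the horizontal
; triangle filter to those intermediates; in scalar form (illustrative sketch
; only):
;
;   out[2*i]   = (3*Int[i] + Int[i-1] + 8) >> 4
;   out[2*i+1] = (3*Int[i] + Int[i+1] + 7) >> 4
;
; which is what the PW_SEVEN/PW_EIGHT constants and the final psrlw by 4
; implement.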
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
;                                 JDIMENSION downsampled_width,
;                                 JSAMPARRAY input_data,
;                                 JSAMPARRAY * output_data_ptr);
;

; r10 = int max_v_samp_factor
; r11 = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY * output_data_ptr

%define wk(i)   rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM  4

        align   16
        global  EXTN(jsimd_h2v2_fancy_upsample_sse2)

EXTN(jsimd_h2v2_fancy_upsample_sse2):
        push    rbp
        mov     rax,rsp                         ; rax = original rbp
        sub     rsp, byte 4
        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [rsp],rax
        mov     rbp,rsp                         ; rbp = aligned rbp
        lea     rsp, [wk(0)]
        collect_args
        push    rbx

        mov     eax, r11d               ; colctr
        test    rax,rax
        jz      near .return

        mov     rcx, r10                ; rowctr
        test    rcx,rcx
        jz      near .return

        mov     rsi, r12                ; input_data
        mov     rdi, r13
        mov     rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
        push    rax                     ; colctr
        push    rcx
        push    rdi
        push    rsi

        mov     rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
        mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
        mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1

        test    rax, SIZEOF_XMMWORD-1
        jz      short .skip
        push    rdx
        mov     dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
        mov     dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
        mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
        pop     rdx
.skip:
        ; -- process the first column block

        movdqa  xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]    ; xmm0=row[ 0][0]
        movdqa  xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]    ; xmm1=row[-1][0]
        movdqa  xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; xmm2=row[+1][0]

        pxor    xmm3,xmm3               ; xmm3=(all 0's)
        movdqa  xmm4,xmm0
        punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
        movdqa  xmm5,xmm1
        punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
        movdqa  xmm6,xmm2
        punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

        pmullw  xmm0,[rel PW_THREE]
        pmullw  xmm4,[rel PW_THREE]

        pcmpeqb xmm7,xmm7
        psrldq  xmm7,(SIZEOF_XMMWORD-2)

        paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
        paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
        paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
        paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

        movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1    ; temporarily save
        movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5    ; the intermediate data
        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6

        pand    xmm1,xmm7               ; xmm1=( 0 -- -- -- -- -- -- --)
        pand    xmm2,xmm7               ; xmm2=( 0 -- -- -- -- -- -- --)

        movdqa  XMMWORD [wk(0)], xmm1
        movdqa  XMMWORD [wk(1)], xmm2

        add     rax, byte SIZEOF_XMMWORD-1
        and     rax, byte -SIZEOF_XMMWORD
        cmp     rax, byte SIZEOF_XMMWORD
        ja      short .columnloop

.columnloop_last:
        ; -- process the last column block

        pcmpeqb xmm1,xmm1
        pslldq  xmm1,(SIZEOF_XMMWORD-2)
        movdqa  xmm2,xmm1

        pand    xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
        pand    xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]

        movdqa  XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
        movdqa  XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)

        jmp     near .upsample

.columnloop:
        ; -- process the next column block

        movdqa  xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]    ; xmm0=row[ 0][1]
        movdqa  xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]    ; xmm1=row[-1][1]
        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]    ; xmm2=row[+1][1]

        pxor    xmm3,xmm3               ; xmm3=(all 0's)
        movdqa  xmm4,xmm0
        punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
        movdqa  xmm5,xmm1
        punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
        movdqa  xmm6,xmm2
        punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

        pmullw  xmm0,[rel PW_THREE]
        pmullw  xmm4,[rel PW_THREE]

        paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
        paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
        paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
        paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

        movdqa  XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1    ; temporarily save
        movdqa  XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5    ; the intermediate data
        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6

        pslldq  xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- --  0)
        pslldq  xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- --  0)

        movdqa  XMMWORD [wk(2)], xmm1
        movdqa  XMMWORD [wk(3)], xmm2

.upsample:
        ; -- process the upper row

        movdqa  xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa  xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]

        movdqa  xmm0,xmm7               ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
        movdqa  xmm4,xmm3               ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
        psrldq  xmm0,2                  ; xmm0=( 1  2  3  4  5  6  7 --)
        pslldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- --  8)
        movdqa  xmm5,xmm7
        movdqa  xmm6,xmm3
        psrldq  xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
        pslldq  xmm6,2                  ; xmm6=(--  8  9 10 11 12 13 14)

        por     xmm0,xmm4               ; xmm0=( 1  2  3  4  5  6  7  8)
        por     xmm5,xmm6               ; xmm5=( 7  8  9 10 11 12 13 14)

        movdqa  xmm1,xmm7
        movdqa  xmm2,xmm3
        pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
        psrldq  xmm2,2                  ; xmm2=( 9 10 11 12 13 14 15 --)
        movdqa  xmm4,xmm3
        psrldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)

        por     xmm1, XMMWORD [wk(0)]   ; xmm1=(-1  0  1  2  3  4  5  6)
        por     xmm2, XMMWORD [wk(2)]   ; xmm2=( 9 10 11 12 13 14 15 16)

        movdqa  XMMWORD [wk(0)], xmm4

        pmullw  xmm7,[rel PW_THREE]
        pmullw  xmm3,[rel PW_THREE]
        paddw   xmm1,[rel PW_EIGHT]
        paddw   xmm5,[rel PW_EIGHT]
        paddw   xmm0,[rel PW_SEVEN]
        paddw   xmm2,[rel PW_SEVEN]

        paddw   xmm1,xmm7
        paddw   xmm5,xmm3
        psrlw   xmm1,4                  ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
        psrlw   xmm5,4                  ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
        paddw   xmm0,xmm7
        paddw   xmm2,xmm3
        psrlw   xmm0,4                  ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
        psrlw   xmm2,4                  ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)

        psllw   xmm0,BYTE_BIT
        psllw   xmm2,BYTE_BIT
        por     xmm1,xmm0               ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
        por     xmm5,xmm2               ; xmm5=Out0H=(16 17 18 ... 29 30 31)
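        ; xmm1/xmm5 now hold the 32 bytes of the upper output row: the
        ; psllw/por above interleaved the even- and odd-numbered results
        ; back into byte order, and no saturating pack is needed because
        ; every value already fits in 8 bits.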

        movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
        movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5

        ; -- process the lower row

        movdqa  xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
        movdqa  xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]

        movdqa  xmm7,xmm6               ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
        movdqa  xmm3,xmm4               ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
        psrldq  xmm7,2                  ; xmm7=( 1  2  3  4  5  6  7 --)
        pslldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- --  8)
        movdqa  xmm0,xmm6
        movdqa  xmm2,xmm4
        psrldq  xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
        pslldq  xmm2,2                  ; xmm2=(--  8  9 10 11 12 13 14)

        por     xmm7,xmm3               ; xmm7=( 1  2  3  4  5  6  7  8)
        por     xmm0,xmm2               ; xmm0=( 7  8  9 10 11 12 13 14)

        movdqa  xmm1,xmm6
        movdqa  xmm5,xmm4
        pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
        psrldq  xmm5,2                  ; xmm5=( 9 10 11 12 13 14 15 --)
        movdqa  xmm3,xmm4
        psrldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)

        por     xmm1, XMMWORD [wk(1)]   ; xmm1=(-1  0  1  2  3  4  5  6)
        por     xmm5, XMMWORD [wk(3)]   ; xmm5=( 9 10 11 12 13 14 15 16)

        movdqa  XMMWORD [wk(1)], xmm3

        pmullw  xmm6,[rel PW_THREE]
        pmullw  xmm4,[rel PW_THREE]
        paddw   xmm1,[rel PW_EIGHT]
        paddw   xmm0,[rel PW_EIGHT]
        paddw   xmm7,[rel PW_SEVEN]
        paddw   xmm5,[rel PW_SEVEN]

        paddw   xmm1,xmm6
        paddw   xmm0,xmm4
        psrlw   xmm1,4                  ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
        psrlw   xmm0,4                  ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
        paddw   xmm7,xmm6
        paddw   xmm5,xmm4
        psrlw   xmm7,4                  ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
        psrlw   xmm5,4                  ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)

        psllw   xmm7,BYTE_BIT
        psllw   xmm5,BYTE_BIT
        por     xmm1,xmm7               ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
        por     xmm0,xmm5               ; xmm0=Out1H=(16 17 18 ... 29 30 31)

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0

        sub     rax, byte SIZEOF_XMMWORD
        add     rcx, byte 1*SIZEOF_XMMWORD      ; inptr1(above)
        add     rbx, byte 1*SIZEOF_XMMWORD      ; inptr0
        add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr1(below)
        add     rdx, byte 2*SIZEOF_XMMWORD      ; outptr0
        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr1
        cmp     rax, byte SIZEOF_XMMWORD
        ja      near .columnloop
        test    rax,rax
        jnz     near .columnloop_last

        pop     rsi
        pop     rdi
        pop     rcx
        pop     rax

        add     rsi, byte 1*SIZEOF_JSAMPROW     ; input_data
        add     rdi, byte 2*SIZEOF_JSAMPROW     ; output_data
        sub     rcx, byte 2                     ; rowctr
        jg      near .rowloop

.return:
        pop     rbx
        uncollect_args
        mov     rsp,rbp                 ; rsp <- aligned rbp
        pop     rsp                     ; rsp <- original rbp
        pop     rbp
        ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
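; Each input sample is simply replicated into two adjacent output samples;
; punpcklbw/punpckhbw of a register with itself does this 16 samples at a
; time (out[2*i] = out[2*i+1] = in[i]).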
;
; GLOBAL(void)
; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
;                           JDIMENSION output_width,
;                           JSAMPARRAY input_data,
;                           JSAMPARRAY * output_data_ptr);
;

; r10 = int max_v_samp_factor
; r11 = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v1_upsample_sse2)

EXTN(jsimd_h2v1_upsample_sse2):
        push    rbp
        mov     rax,rsp
        mov     rbp,rsp
        collect_args

        mov     edx, r11d
        add     rdx, byte (2*SIZEOF_XMMWORD)-1
        and     rdx, byte -(2*SIZEOF_XMMWORD)
        jz      near .return

        mov     rcx, r10                ; rowctr
        test    rcx,rcx
        jz      short .return

        mov     rsi, r12                ; input_data
        mov     rdi, r13
        mov     rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
        push    rdi
        push    rsi

        mov     rsi, JSAMPROW [rsi]     ; inptr
        mov     rdi, JSAMPROW [rdi]     ; outptr
        mov     rax,rdx                 ; colctr
.columnloop:

        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

        movdqa  xmm1,xmm0
        punpcklbw xmm0,xmm0
        punpckhbw xmm1,xmm1

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

        sub     rax, byte 2*SIZEOF_XMMWORD
        jz      short .nextrow

        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

        movdqa  xmm3,xmm2
        punpcklbw xmm2,xmm2
        punpckhbw xmm3,xmm3

        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

        sub     rax, byte 2*SIZEOF_XMMWORD
        jz      short .nextrow

        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
        add     rdi, byte 4*SIZEOF_XMMWORD      ; outptr
        jmp     short .columnloop

.nextrow:
        pop     rsi
        pop     rdi

        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
        dec     rcx                             ; rowctr
        jg      short .rowloop

.return:
        uncollect_args
        pop     rbp
        ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
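; As in the h2v1 case above, each input sample is replicated horizontally
; with punpcklbw/punpckhbw, and the resulting row is stored to both output
; rows (outptr0 and outptr1).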
;
; GLOBAL(void)
; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
;                           JDIMENSION output_width,
;                           JSAMPARRAY input_data,
;                           JSAMPARRAY * output_data_ptr);
;

; r10 = int max_v_samp_factor
; r11 = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v2_upsample_sse2)

EXTN(jsimd_h2v2_upsample_sse2):
        push    rbp
        mov     rax,rsp
        mov     rbp,rsp
        collect_args
        push    rbx

        mov     edx, r11d
        add     rdx, byte (2*SIZEOF_XMMWORD)-1
        and     rdx, byte -(2*SIZEOF_XMMWORD)
        jz      near .return

        mov     rcx, r10                ; rowctr
        test    rcx,rcx
        jz      near .return

        mov     rsi, r12                ; input_data
        mov     rdi, r13
        mov     rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
        push    rdi
        push    rsi

        mov     rsi, JSAMPROW [rsi]                     ; inptr
        mov     rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
        mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1
        mov     rax,rdx                                 ; colctr
.columnloop:

        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

        movdqa  xmm1,xmm0
        punpcklbw xmm0,xmm0
        punpckhbw xmm1,xmm1

        movdqa  XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
        movdqa  XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

        sub     rax, byte 2*SIZEOF_XMMWORD
        jz      short .nextrow

        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

        movdqa  xmm3,xmm2
        punpcklbw xmm2,xmm2
        punpckhbw xmm3,xmm3

        movdqa  XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

        sub     rax, byte 2*SIZEOF_XMMWORD
        jz      short .nextrow

        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
        add     rbx, byte 4*SIZEOF_XMMWORD      ; outptr0
        add     rdi, byte 4*SIZEOF_XMMWORD      ; outptr1
        jmp     short .columnloop

.nextrow:
        pop     rsi
        pop     rdi

        add     rsi, byte 1*SIZEOF_JSAMPROW     ; input_data
        add     rdi, byte 2*SIZEOF_JSAMPROW     ; output_data
        sub     rcx, byte 2                     ; rowctr
        jg      near .rowloop

.return:
        pop     rbx
        uncollect_args
        pop     rbp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16