;
; jidctflt.asm - floating-point IDCT (SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see the jidctflt.c for more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
;
; Two-operand helpers used by the second phase of the 4x4 float transpose.
; Each one merges a 64-bit half of %1 with the same half of %2 by means of a
; single SHUFPS; %2 is left untouched.
;

; unpcklps2 dst,src : dst = (dst[0] dst[1] src[0] src[1])   (low halves)
%macro  unpcklps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
        shufps  %1,%2,0x44
%endmacro

; unpckhps2 dst,src : dst = (dst[2] dst[3] src[2] src[3])   (high halves)
%macro  unpckhps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
        shufps  %1,%2,0xEE
%endmacro

; --------------------------------------------------------------------------
        SECTION SEG_CONST

        alignz  16
        global  EXTN(jconst_idct_float_sse2)

;
; Constant table for jsimd_idct_float_sse2.  Every entry is one full 16-byte
; vector (4 x float or 16 x byte) so it can be used directly as an aligned
; memory operand.  The code addresses the entries through GOTOFF(ebx,...).
;
EXTN(jconst_idct_float_sse2):

; Butterfly multipliers of the float IDCT (same values as in jidctflt.c).
PD_1_414        times 4 dd  1.414213562373095048801689
PD_1_847        times 4 dd  1.847759065022573512256366
PD_1_082        times 4 dd  1.082392200292393968799446
PD_M2_613       times 4 dd -2.613125929752753055713286

; Rounding/descaling magic: 100663296.0 = 1.5 * 2^26.  At this magnitude one
; unit in the last place of a single-precision float is 8, so after
; "x + magic" the low 16 bits of the float's bit pattern hold round(x/8) as a
; two's-complement word.
PD_RNDINT_MAGIC times 4 dd  100663296.0 ; (float)(0x00C00000 << 3)

; Per-byte bias added after the signed word->byte pack in pass 2
; (CENTERJSAMPLE is defined outside this file).
PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,
;                        JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Arguments are read from the stack at +8/+12/+16/+20 relative to the frame
; pointer saved in the prologue; the routine ends with a plain "ret".
;
; Stack frame (ebp is re-aligned to SIZEOF_XMMWORD in the prologue):
;   [ebp+0]      address of the caller-side frame (where the old ebp was
;                pushed); the arguments are at that address +8 .. +20
;   wk(0..1)     WK_NUM xmmword spill slots directly below ebp
;   workspace    DCTSIZE2 FAST_FLOATs below wk(0); receives the transposed
;                result of pass 1 and is the input of pass 2
;
; Register roles:
;   ebx          GOT base for GOTOFF() constant access (briefly reused as a
;                row pointer in pass 2, guarded by pushpic/poppic)
;   pass 1       edx=quantptr, esi=inptr, edi=wsptr, ecx=loop counter
;   pass 2       esi=wsptr, edi=output_buf cursor, eax=output_col,
;                ecx=loop counter, edx/ebx=row pointers
;
; Saved/restored: ebx, esi, edi, ebp.  Clobbered: eax, ecx, edx, xmm0-xmm7,
; flags.
;
; NOTE(review): the mulps memory operands on the dct_table rows and the movaps
; accesses to the workspace need 16-byte aligned addresses; this presumably
; holds for the caller's dct_table and for DCTSIZE2*SIZEOF_FAST_FLOAT being a
; multiple of 16 -- confirm against jdct.inc / the XMMBLOCK macro.
;

%define dct_table(b)    (b)+8           ; void * dct_table
%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
%define output_col(b)   (b)+20          ; JDIMENSION output_col

%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM          2
%define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
                                        ; FAST_FLOAT workspace[DCTSIZE2]

        align   16
        global  EXTN(jsimd_idct_float_sse2)

EXTN(jsimd_idct_float_sse2):
        push    ebp
        mov     eax,esp                         ; eax = original ebp
        sub     esp, byte 4
        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [esp],eax
        mov     ebp,esp                         ; ebp = aligned ebp
        lea     esp, [workspace]
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx             ; get GOT address

        ; ---- Pass 1: process columns from input, store into work array.
        ;
        ; Each iteration dequantizes and transforms four columns at once
        ; (one column per float lane) and stores them transposed.
        ;
        ; eax still holds the frame address set up in the prologue, which is
        ; why the reload below is commented out; this relies on get_GOT not
        ; modifying eax.

;       mov     eax, [original_ebp]
        mov     edx, POINTER [dct_table(eax)]           ; quantptr
        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
        lea     edi, [workspace]                        ; FAST_FLOAT * wsptr
        mov     ecx, DCTSIZE/4                          ; ctr
        alignx  16,7
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
        ; Cheap early-out first: if the leading dword of row 1 or row 2 is
        ; non-zero, the full IDCT is needed.
        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
        jnz     near .columnDCT

        ; OR together rows 1..7 of the four columns handled in this
        ; iteration.
        movq    xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
        movq    xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
        movq    xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
        movq    xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
        movq    xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
        movq    xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
        movq    xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
        por     xmm1,xmm2
        por     xmm3,xmm4
        por     xmm5,xmm6
        por     xmm1,xmm3
        por     xmm5,xmm7
        por     xmm1,xmm5
        ; Signed saturation keeps every non-zero word non-zero, so the four
        ; words can be tested as one dword.
        packsswb xmm1,xmm1
        movd    eax,xmm1
        test    eax,eax
        jnz     short .columnDCT

        ; -- AC terms all zero
        ;
        ; With only the DC term present each output column is constant:
        ; dequantize the four DC values and broadcast each one across a
        ; whole workspace row.

        movq    xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]

        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
        psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
        cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)

        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

        movaps  xmm1,xmm0
        movaps  xmm2,xmm0
        movaps  xmm3,xmm0

        shufps  xmm0,xmm0,0x00          ; xmm0=(00 00 00 00)
        shufps  xmm1,xmm1,0x55          ; xmm1=(01 01 01 01)
        shufps  xmm2,xmm2,0xAA          ; xmm2=(02 02 02 02)
        shufps  xmm3,xmm3,0xFF          ; xmm3=(03 03 03 03)

        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
        movaps  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
        movaps  XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
        movaps  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
        movaps  XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
        jmp     near .nextcolumn
        alignx  16,7
%endif
.columnDCT:

        ; -- Even part
        ;
        ; Coefficients are 16-bit: punpcklwd + arithmetic shift right
        ; sign-extends them to dwords before the int->float conversion.

        movq    xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
        movq    xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
        movq    xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
        movq    xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]

        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
        punpcklwd xmm1,xmm1             ; xmm1=(20 20 21 21 22 22 23 23)
        psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
        psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in2=(20 21 22 23)
        cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
        cvtdq2ps  xmm1,xmm1                     ; xmm1=in2=(20 21 22 23)

        punpcklwd xmm2,xmm2             ; xmm2=(40 40 41 41 42 42 43 43)
        punpcklwd xmm3,xmm3             ; xmm3=(60 60 61 61 62 62 63 63)
        psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in4=(40 41 42 43)
        psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in6=(60 61 62 63)
        cvtdq2ps  xmm2,xmm2                     ; xmm2=in4=(40 41 42 43)
        cvtdq2ps  xmm3,xmm3                     ; xmm3=in6=(60 61 62 63)

        ; Dequantize: multiply by the matching rows of the table at edx.
        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps   xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps   xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps   xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

        movaps  xmm4,xmm0
        movaps  xmm5,xmm1
        subps   xmm0,xmm2               ; xmm0=tmp11
        subps   xmm1,xmm3
        addps   xmm4,xmm2               ; xmm4=tmp10
        addps   xmm5,xmm3               ; xmm5=tmp13

        mulps   xmm1,[GOTOFF(ebx,PD_1_414)]
        subps   xmm1,xmm5               ; xmm1=tmp12

        movaps  xmm6,xmm4
        movaps  xmm7,xmm0
        subps   xmm4,xmm5               ; xmm4=tmp3
        subps   xmm0,xmm1               ; xmm0=tmp2
        addps   xmm6,xmm5               ; xmm6=tmp0
        addps   xmm7,xmm1               ; xmm7=tmp1

        ; All eight xmm registers are needed by the odd part, so tmp2/tmp3
        ; are spilled to the wk[] slots.
        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
        movaps  XMMWORD [wk(0)], xmm0   ; tmp2

        ; -- Odd part

        movq    xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
        movq    xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
        movq    xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
        movq    xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]

        punpcklwd xmm2,xmm2             ; xmm2=(10 10 11 11 12 12 13 13)
        punpcklwd xmm3,xmm3             ; xmm3=(30 30 31 31 32 32 33 33)
        psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in1=(10 11 12 13)
        psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in3=(30 31 32 33)
        cvtdq2ps  xmm2,xmm2                     ; xmm2=in1=(10 11 12 13)
        cvtdq2ps  xmm3,xmm3                     ; xmm3=in3=(30 31 32 33)

        punpcklwd xmm5,xmm5             ; xmm5=(50 50 51 51 52 52 53 53)
        punpcklwd xmm1,xmm1             ; xmm1=(70 70 71 71 72 72 73 73)
        psrad     xmm5,(DWORD_BIT-WORD_BIT)     ; xmm5=in5=(50 51 52 53)
        psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in7=(70 71 72 73)
        cvtdq2ps  xmm5,xmm5                     ; xmm5=in5=(50 51 52 53)
        cvtdq2ps  xmm1,xmm1                     ; xmm1=in7=(70 71 72 73)

        mulps   xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps   xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps   xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps   xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

        movaps  xmm4,xmm2
        movaps  xmm0,xmm5
        addps   xmm2,xmm1               ; xmm2=z11
        addps   xmm5,xmm3               ; xmm5=z13
        subps   xmm4,xmm1               ; xmm4=z12
        subps   xmm0,xmm3               ; xmm0=z10

        movaps  xmm1,xmm2
        subps   xmm2,xmm5
        addps   xmm1,xmm5               ; xmm1=tmp7

        mulps   xmm2,[GOTOFF(ebx,PD_1_414)]     ; xmm2=tmp11

        movaps  xmm3,xmm0
        addps   xmm0,xmm4
        mulps   xmm0,[GOTOFF(ebx,PD_1_847)]     ; xmm0=z5
        mulps   xmm3,[GOTOFF(ebx,PD_M2_613)]    ; xmm3=(z10 * -2.613125930)
        mulps   xmm4,[GOTOFF(ebx,PD_1_082)]     ; xmm4=(z12 * 1.082392200)
        addps   xmm3,xmm0               ; xmm3=tmp12
        subps   xmm4,xmm0               ; xmm4=tmp10

        ; -- Final output stage
        ;
        ; The eight result vectors are transposed in 4x4 tiles while they
        ; are combined, so the workspace ends up in the layout pass 2 reads.

        subps   xmm3,xmm1               ; xmm3=tmp6
        movaps  xmm5,xmm6
        movaps  xmm0,xmm7
        addps   xmm6,xmm1               ; xmm6=data0=(00 01 02 03)
        addps   xmm7,xmm3               ; xmm7=data1=(10 11 12 13)
        subps   xmm5,xmm1               ; xmm5=data7=(70 71 72 73)
        subps   xmm0,xmm3               ; xmm0=data6=(60 61 62 63)
        subps   xmm2,xmm3               ; xmm2=tmp5

        movaps    xmm1,xmm6             ; transpose coefficients(phase 1)
        unpcklps  xmm6,xmm7             ; xmm6=(00 10 01 11)
        unpckhps  xmm1,xmm7             ; xmm1=(02 12 03 13)
        movaps    xmm3,xmm0             ; transpose coefficients(phase 1)
        unpcklps  xmm0,xmm5             ; xmm0=(60 70 61 71)
        unpckhps  xmm3,xmm5             ; xmm3=(62 72 63 73)

        ; Swap roles of the spill slots: fetch tmp2/tmp3, park the
        ; half-transposed rows 6/7 in their place.
        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
        movaps  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3

        movaps  XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
        movaps  XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)

        addps   xmm4,xmm2               ; xmm4=tmp4
        movaps  xmm0,xmm7
        movaps  xmm3,xmm5
        addps   xmm7,xmm2               ; xmm7=data2=(20 21 22 23)
        addps   xmm5,xmm4               ; xmm5=data4=(40 41 42 43)
        subps   xmm0,xmm2               ; xmm0=data5=(50 51 52 53)
        subps   xmm3,xmm4               ; xmm3=data3=(30 31 32 33)

        movaps    xmm2,xmm7             ; transpose coefficients(phase 1)
        unpcklps  xmm7,xmm3             ; xmm7=(20 30 21 31)
        unpckhps  xmm2,xmm3             ; xmm2=(22 32 23 33)
        movaps    xmm4,xmm5             ; transpose coefficients(phase 1)
        unpcklps  xmm5,xmm0             ; xmm5=(40 50 41 51)
        unpckhps  xmm4,xmm0             ; xmm4=(42 52 43 53)

        movaps    xmm3,xmm6             ; transpose coefficients(phase 2)
        unpcklps2 xmm6,xmm7             ; xmm6=(00 10 20 30)
        unpckhps2 xmm3,xmm7             ; xmm3=(01 11 21 31)
        movaps    xmm0,xmm1             ; transpose coefficients(phase 2)
        unpcklps2 xmm1,xmm2             ; xmm1=(02 12 22 32)
        unpckhps2 xmm0,xmm2             ; xmm0=(03 13 23 33)

        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
        movaps  xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)

        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
        movaps  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
        movaps  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0

        movaps    xmm6,xmm5             ; transpose coefficients(phase 2)
        unpcklps2 xmm5,xmm7             ; xmm5=(40 50 60 70)
        unpckhps2 xmm6,xmm7             ; xmm6=(41 51 61 71)
        movaps    xmm3,xmm4             ; transpose coefficients(phase 2)
        unpcklps2 xmm4,xmm2             ; xmm4=(42 52 62 72)
        unpckhps2 xmm3,xmm2             ; xmm3=(43 53 63 73)

        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
        movaps  XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
        movaps  XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3

.nextcolumn:
        ; Advance by four input columns / four quantizer columns and by four
        ; (transposed) workspace rows.
        add     esi, byte 4*SIZEOF_JCOEF                ; coef_block
        add     edx, byte 4*SIZEOF_FLOAT_MULT_TYPE      ; quantptr
        add     edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr
        dec     ecx                                     ; ctr
        jnz     near .columnloop

        ; -- Prefetch the next coefficient block
        ;
        ; esi has been advanced past the columns consumed above, hence the
        ; (DCTSIZE2-8) offset to reach the start of the following block.

        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

        ; ---- Pass 2: process rows from work array, store into output array.
        ;
        ; Each iteration transforms four rows (one row per float lane),
        ; descales/rounds them to samples and writes 8 bytes to each of four
        ; output rows.

        mov     eax, [original_ebp]
        lea     esi, [workspace]                        ; FAST_FLOAT * wsptr
        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
        mov     eax, JDIMENSION [output_col(eax)]
        mov     ecx, DCTSIZE/4                          ; ctr
        alignx  16,7
.rowloop:

        ; -- Even part

        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]

        movaps  xmm4,xmm0
        movaps  xmm5,xmm1
        subps   xmm0,xmm2               ; xmm0=tmp11
        subps   xmm1,xmm3
        addps   xmm4,xmm2               ; xmm4=tmp10
        addps   xmm5,xmm3               ; xmm5=tmp13

        mulps   xmm1,[GOTOFF(ebx,PD_1_414)]
        subps   xmm1,xmm5               ; xmm1=tmp12

        movaps  xmm6,xmm4
        movaps  xmm7,xmm0
        subps   xmm4,xmm5               ; xmm4=tmp3
        subps   xmm0,xmm1               ; xmm0=tmp2
        addps   xmm6,xmm5               ; xmm6=tmp0
        addps   xmm7,xmm1               ; xmm7=tmp1

        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
        movaps  XMMWORD [wk(0)], xmm0   ; tmp2

        ; -- Odd part

        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]

        movaps  xmm4,xmm2
        movaps  xmm0,xmm5
        addps   xmm2,xmm1               ; xmm2=z11
        addps   xmm5,xmm3               ; xmm5=z13
        subps   xmm4,xmm1               ; xmm4=z12
        subps   xmm0,xmm3               ; xmm0=z10

        movaps  xmm1,xmm2
        subps   xmm2,xmm5
        addps   xmm1,xmm5               ; xmm1=tmp7

        mulps   xmm2,[GOTOFF(ebx,PD_1_414)]     ; xmm2=tmp11

        movaps  xmm3,xmm0
        addps   xmm0,xmm4
        mulps   xmm0,[GOTOFF(ebx,PD_1_847)]     ; xmm0=z5
        mulps   xmm3,[GOTOFF(ebx,PD_M2_613)]    ; xmm3=(z10 * -2.613125930)
        mulps   xmm4,[GOTOFF(ebx,PD_1_082)]     ; xmm4=(z12 * 1.082392200)
        addps   xmm3,xmm0               ; xmm3=tmp12
        subps   xmm4,xmm0               ; xmm4=tmp10

        ; -- Final output stage

        subps   xmm3,xmm1               ; xmm3=tmp6
        movaps  xmm5,xmm6
        movaps  xmm0,xmm7
        addps   xmm6,xmm1               ; xmm6=data0=(00 10 20 30)
        addps   xmm7,xmm3               ; xmm7=data1=(01 11 21 31)
        subps   xmm5,xmm1               ; xmm5=data7=(07 17 27 37)
        subps   xmm0,xmm3               ; xmm0=data6=(06 16 26 36)
        subps   xmm2,xmm3               ; xmm2=tmp5

        ; Round-to-integer with descale by 8: after adding PD_RNDINT_MAGIC
        ; the low word of every lane's bit pattern is the rounded result.
        ; The mask (low word of each dword set) keeps that word; pslld moves
        ; it into the high word so two results can be interleaved with por.
        movaps  xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]      ; xmm1=[PD_RNDINT_MAGIC]
        pcmpeqd xmm3,xmm3
        psrld   xmm3,WORD_BIT           ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}

        addps   xmm6,xmm1       ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
        addps   xmm7,xmm1       ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
        addps   xmm0,xmm1       ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
        addps   xmm5,xmm1       ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)

        pand    xmm6,xmm3       ; xmm6=(00 -- 10 -- 20 -- 30 --)
        pslld   xmm7,WORD_BIT   ; xmm7=(-- 01 -- 11 -- 21 -- 31)
        pand    xmm0,xmm3       ; xmm0=(06 -- 16 -- 26 -- 36 --)
        pslld   xmm5,WORD_BIT   ; xmm5=(-- 07 -- 17 -- 27 -- 37)
        por     xmm6,xmm7       ; xmm6=(00 01 10 11 20 21 30 31)
        por     xmm0,xmm5       ; xmm0=(06 07 16 17 26 27 36 37)

        movaps  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3

        addps   xmm4,xmm2               ; xmm4=tmp4
        movaps  xmm7,xmm1
        movaps  xmm5,xmm3
        addps   xmm1,xmm2               ; xmm1=data2=(02 12 22 32)
        addps   xmm3,xmm4               ; xmm3=data4=(04 14 24 34)
        subps   xmm7,xmm2               ; xmm7=data5=(05 15 25 35)
        subps   xmm5,xmm4               ; xmm5=data3=(03 13 23 33)

        ; Magic constant and mask are rebuilt here because the registers that
        ; held them were reused for tmp2/tmp3 above.
        movaps  xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]      ; xmm2=[PD_RNDINT_MAGIC]
        pcmpeqd xmm4,xmm4
        psrld   xmm4,WORD_BIT           ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}

        addps   xmm3,xmm2       ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
        addps   xmm7,xmm2       ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
        addps   xmm1,xmm2       ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
        addps   xmm5,xmm2       ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)

        pand    xmm3,xmm4       ; xmm3=(04 -- 14 -- 24 -- 34 --)
        pslld   xmm7,WORD_BIT   ; xmm7=(-- 05 -- 15 -- 25 -- 35)
        pand    xmm1,xmm4       ; xmm1=(02 -- 12 -- 22 -- 32 --)
        pslld   xmm5,WORD_BIT   ; xmm5=(-- 03 -- 13 -- 23 -- 33)
        por     xmm3,xmm7       ; xmm3=(04 05 14 15 24 25 34 35)
        por     xmm1,xmm5       ; xmm1=(02 03 12 13 22 23 32 33)

        movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]     ; xmm2=[PB_CENTERJSAMP]

        ; packsswb clamps every word to the signed byte range; the following
        ; paddb then adds the PB_CENTERJSAMP bias to each byte.
        packsswb  xmm6,xmm3     ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
        packsswb  xmm1,xmm0     ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
        paddb     xmm6,xmm2
        paddb     xmm1,xmm2

        movdqa    xmm4,xmm6     ; transpose coefficients(phase 2)
        punpcklwd xmm6,xmm1     ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
        punpckhwd xmm4,xmm1     ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)

        movdqa    xmm7,xmm6     ; transpose coefficients(phase 3)
        punpckldq xmm6,xmm4     ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
        punpckhdq xmm7,xmm4     ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)

        ; Swap the 64-bit halves so rows 1 and 3 sit in the low quadword,
        ; where movq can store them.
        pshufd  xmm5,xmm6,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
        pshufd  xmm3,xmm7,0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)

        ; ebx is borrowed as a second row pointer for the four stores below.
        pushpic ebx                     ; save GOT address

        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
        mov     ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
        movq    XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
        mov     edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
        mov     ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
        movq    XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3

        poppic  ebx                     ; restore GOT address

        add     esi, byte 4*SIZEOF_FAST_FLOAT   ; wsptr
        add     edi, byte 4*SIZEOF_JSAMPROW
        dec     ecx                             ; ctr
        jnz     near .rowloop

        ; Epilogue: undo the pushes, then unwind the aligned frame through
        ; the frame address stored at [ebp] by the prologue.
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        mov     esp,ebp         ; esp <- aligned ebp
        pop     esp             ; esp <- original ebp
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16