;
; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see the jidctflt.c for more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Two-register "unpack 64-bit halves" helpers built on SHUFPS.
; With %1=(0 1 2 3) and %2=(4 5 6 7), lanes listed low to high:

; unpcklps2: %1 <- (0 1 4 5), i.e. the low halves of %1 and %2.
%macro unpcklps2 2
    shufps      %1, %2, 0x44
%endmacro

; unpckhps2: %1 <- (2 3 6 7), i.e. the high halves of %1 and %2.
%macro unpckhps2 2
    shufps      %1, %2, 0xEE
%endmacro

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_idct_float_sse2)

; Constant table.  Every float entry is replicated across four lanes so it
; can be used directly as a 128-bit packed-single memory operand.
EXTN(jconst_idct_float_sse2):

; Multipliers applied in the even and odd parts of both passes.
PD_1_414        times 4  dd  1.414213562373095048801689
PD_1_847        times 4  dd  1.847759065022573512256366
PD_1_082        times 4  dd  1.082392200292393968799446
PD_M2_613       times 4  dd -2.613125929752753055713286
; Added to each result in pass 2.  At this magnitude a float has a unit in
; the last place of 8, so the sum's low mantissa bits hold roundint(x/8);
; pass 2 then extracts the low 16 bits of each lane.
PD_RNDINT_MAGIC times 4  dd  100663296.0  ; (float)(0x00C00000 << 3)
; Added bytewise (paddb) to the packed signed samples in pass 2.
PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
;                       JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Pass 1 dequantizes the coefficients (integer -> float, times the table in
; dct_table), runs the 1-D IDCT down the columns, and stores the transposed
; result in an on-stack float workspace.  Pass 2 runs the 1-D IDCT over the
; workspace, rounds/descales by 8, adds CENTERJSAMPLE and writes 8 bytes to
; each of the output rows at byte offset output_col.
;
; Register roles after collect_args (macro from jsimdext.inc; presumably it
; moves the four ABI argument registers into r10-r13 -- confirm there):

; r10 = void *dct_table
; r11 = JCOEFPTR coef_block
; r12 = JSAMPARRAY output_buf
; r13d = JDIMENSION output_col
;
; Scratch used by this code: rax, rcx, rdx, rsi, rdi, xmm0-xmm7, flags.
; rbx is used as a row pointer in pass 2 and is saved/restored here.
;
; Stack frame (rbp is 16-byte aligned):
;   [rbp + 0]            value of rsp right after "push rbp"; the epilogue's
;                        "pop rsp" reloads it
;   wk(0), wk(1)         two xmmword spill slots just below rbp
;   workspace            DCTSIZE2 FAST_FLOATs just below wk(0)

%define original_rbp  rbp + 0
%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                        ; xmmword wk[WK_NUM]
%define WK_NUM        2
%define workspace     wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
                                        ; FAST_FLOAT workspace[DCTSIZE2]

    align       32
    GLOBAL_FUNCTION(jsimd_idct_float_sse2)

EXTN(jsimd_idct_float_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = rsp after the push (points at saved rbp)
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax                   ; [original_rbp] = unaligned rsp, for the epilogue
    mov         rbp, rsp                     ; rbp = aligned frame base
    lea         rsp, [workspace]             ; reserve wk[] and workspace[]
    collect_args 4
    push        rbx

    ; ---- Pass 1: process columns from input, store into work array.
    ;
    ; Each iteration handles four columns (rsi/rdx advance by four elements,
    ; rdi by four workspace rows), for DCTSIZE/4 iterations.

    mov         rdx, r10                ; quantptr
    mov         rsi, r11                ; inptr
    lea         rdi, [workspace]        ; FAST_FLOAT *wsptr
    mov         rcx, DCTSIZE/4          ; ctr
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
    ; Cheap early-out: look at one dword from each of rows 1 and 2 first.
    mov         eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    or          eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    jnz         near .columnDCT

    ; OR together the four coefficients of rows 1..7; any nonzero AC term
    ; forces the full transform.
    movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
    movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
    por         xmm1, xmm2
    por         xmm3, xmm4
    por         xmm5, xmm6
    por         xmm1, xmm3
    por         xmm5, xmm7
    por         xmm1, xmm5
    packsswb    xmm1, xmm1              ; saturating pack keeps nonzero words nonzero
    movd        eax, xmm1               ; 32-bit write also clears the top of rax
    test        rax, rax
    jnz         short .columnDCT

    ; -- AC terms all zero
    ;
    ; Only the dequantized DC term contributes, so each of the four
    ; workspace rows is filled with its column's DC value.

    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]

    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)

    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

    movaps      xmm1, xmm0
    movaps      xmm2, xmm0
    movaps      xmm3, xmm0

    shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
    shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
    shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
    shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)

    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
    movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
    jmp         near .nextcolumn
%endif
.columnDCT:

    ; -- Even part
    ;
    ; Rows 0/2/4/6: sign-extend words to dwords (punpcklwd + psrad),
    ; convert to float, then dequantize.

    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
    movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
    movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]

    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
    punpcklwd   xmm1, xmm1                  ; xmm1=(20 20 21 21 22 22 23 23)
    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in2=(20 21 22 23)
    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
    cvtdq2ps    xmm1, xmm1                  ; xmm1=in2=(20 21 22 23)

    punpcklwd   xmm2, xmm2                  ; xmm2=(40 40 41 41 42 42 43 43)
    punpcklwd   xmm3, xmm3                  ; xmm3=(60 60 61 61 62 62 63 63)
    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in4=(40 41 42 43)
    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in6=(60 61 62 63)
    cvtdq2ps    xmm2, xmm2                  ; xmm2=in4=(40 41 42 43)
    cvtdq2ps    xmm3, xmm3                  ; xmm3=in6=(60 61 62 63)

    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

    movaps      xmm4, xmm0
    movaps      xmm5, xmm1
    subps       xmm0, xmm2              ; xmm0=tmp11
    subps       xmm1, xmm3
    addps       xmm4, xmm2              ; xmm4=tmp10
    addps       xmm5, xmm3              ; xmm5=tmp13

    mulps       xmm1, [rel PD_1_414]
    subps       xmm1, xmm5              ; xmm1=tmp12

    movaps      xmm6, xmm4
    movaps      xmm7, xmm0
    subps       xmm4, xmm5              ; xmm4=tmp3
    subps       xmm0, xmm1              ; xmm0=tmp2
    addps       xmm6, xmm5              ; xmm6=tmp0
    addps       xmm7, xmm1              ; xmm7=tmp1

    ; Spill tmp2/tmp3; all eight xmm registers are needed for the odd part.
    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
    movaps      XMMWORD [wk(0)], xmm0   ; tmp2

    ; -- Odd part

    movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]

    punpcklwd   xmm2, xmm2                  ; xmm2=(10 10 11 11 12 12 13 13)
    punpcklwd   xmm3, xmm3                  ; xmm3=(30 30 31 31 32 32 33 33)
    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in1=(10 11 12 13)
    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in3=(30 31 32 33)
    cvtdq2ps    xmm2, xmm2                  ; xmm2=in1=(10 11 12 13)
    cvtdq2ps    xmm3, xmm3                  ; xmm3=in3=(30 31 32 33)

    punpcklwd   xmm5, xmm5                  ; xmm5=(50 50 51 51 52 52 53 53)
    punpcklwd   xmm1, xmm1                  ; xmm1=(70 70 71 71 72 72 73 73)
    psrad       xmm5, (DWORD_BIT-WORD_BIT)  ; xmm5=in5=(50 51 52 53)
    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in7=(70 71 72 73)
    cvtdq2ps    xmm5, xmm5                  ; xmm5=in5=(50 51 52 53)
    cvtdq2ps    xmm1, xmm1                  ; xmm1=in7=(70 71 72 73)

    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

    movaps      xmm4, xmm2
    movaps      xmm0, xmm5
    addps       xmm2, xmm1              ; xmm2=z11
    addps       xmm5, xmm3              ; xmm5=z13
    subps       xmm4, xmm1              ; xmm4=z12
    subps       xmm0, xmm3              ; xmm0=z10

    movaps      xmm1, xmm2
    subps       xmm2, xmm5
    addps       xmm1, xmm5              ; xmm1=tmp7

    mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11

    movaps      xmm3, xmm0
    addps       xmm0, xmm4
    mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
    mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
    mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
    addps       xmm3, xmm0              ; xmm3=tmp12
    subps       xmm4, xmm0              ; xmm4=tmp10

    ; -- Final output stage

    subps       xmm3, xmm1              ; xmm3=tmp6
    movaps      xmm5, xmm6
    movaps      xmm0, xmm7
    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
    subps       xmm2, xmm3              ; xmm2=tmp5

    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)

    ; Reload tmp2/tmp3, then reuse the same two slots for the half-transposed
    ; rows 6/7.  The loads must stay ahead of the stores.
    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3

    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)

    addps       xmm4, xmm2              ; xmm4=tmp4
    movaps      xmm0, xmm7
    movaps      xmm3, xmm5
    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)

    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)

    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
    unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
    unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
    unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
    unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)

    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)

    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
    movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0

    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
    unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
    unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
    unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
    unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)

    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
    movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
    movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3

.nextcolumn:
    add         rsi, byte 4*SIZEOF_JCOEF               ; coef_block
    add         rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
    add         rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
    dec         rcx                                    ; ctr
    jnz         near .columnloop

    ; -- Prefetch the next coefficient block
    ;
    ; rsi has already advanced past the columns consumed above, hence the
    ; (DCTSIZE2-8) element offset rather than DCTSIZE2.

    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

    ; ---- Pass 2: process rows from work array, store into output array.
    ;
    ; Each iteration produces four output rows of eight samples.
    ; rax is only ever used as the output column offset here, so it is
    ; loaded once from r13d and nothing else.

    lea         rsi, [workspace]        ; FAST_FLOAT *wsptr
    mov         rdi, r12                ; (JSAMPROW *)
    mov         eax, r13d               ; rax = output_col (32-bit write zero-extends)
    mov         rcx, DCTSIZE/4          ; ctr
.rowloop:

    ; -- Even part

    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]

    movaps      xmm4, xmm0
    movaps      xmm5, xmm1
    subps       xmm0, xmm2              ; xmm0=tmp11
    subps       xmm1, xmm3
    addps       xmm4, xmm2              ; xmm4=tmp10
    addps       xmm5, xmm3              ; xmm5=tmp13

    mulps       xmm1, [rel PD_1_414]
    subps       xmm1, xmm5              ; xmm1=tmp12

    movaps      xmm6, xmm4
    movaps      xmm7, xmm0
    subps       xmm4, xmm5              ; xmm4=tmp3
    subps       xmm0, xmm1              ; xmm0=tmp2
    addps       xmm6, xmm5              ; xmm6=tmp0
    addps       xmm7, xmm1              ; xmm7=tmp1

    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
    movaps      XMMWORD [wk(0)], xmm0   ; tmp2

    ; -- Odd part

    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]

    movaps      xmm4, xmm2
    movaps      xmm0, xmm5
    addps       xmm2, xmm1              ; xmm2=z11
    addps       xmm5, xmm3              ; xmm5=z13
    subps       xmm4, xmm1              ; xmm4=z12
    subps       xmm0, xmm3              ; xmm0=z10

    movaps      xmm1, xmm2
    subps       xmm2, xmm5
    addps       xmm1, xmm5              ; xmm1=tmp7

    mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11

    movaps      xmm3, xmm0
    addps       xmm0, xmm4
    mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
    mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
    mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
    addps       xmm3, xmm0              ; xmm3=tmp12
    subps       xmm4, xmm0              ; xmm4=tmp10

    ; -- Final output stage

    subps       xmm3, xmm1              ; xmm3=tmp6
    movaps      xmm5, xmm6
    movaps      xmm0, xmm7
    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
    subps       xmm2, xmm3              ; xmm2=tmp5

    ; Float -> int16 without a convert instruction: adding PD_RNDINT_MAGIC
    ; leaves the rounded, descaled value in the low 16 bits of each lane.
    ; Even-numbered outputs keep those bits (pand), odd-numbered outputs are
    ; shifted into the upper 16 bits (pslld), and por interleaves the pair.
    movaps      xmm1, [rel PD_RNDINT_MAGIC]  ; xmm1=[rel PD_RNDINT_MAGIC]
    pcmpeqd     xmm3, xmm3
    psrld       xmm3, WORD_BIT          ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}

    addps       xmm6, xmm1              ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
    addps       xmm7, xmm1              ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
    addps       xmm0, xmm1              ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
    addps       xmm5, xmm1              ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)

    pand        xmm6, xmm3              ; xmm6=(00 -- 10 -- 20 -- 30 --)
    pslld       xmm7, WORD_BIT          ; xmm7=(-- 01 -- 11 -- 21 -- 31)
    pand        xmm0, xmm3              ; xmm0=(06 -- 16 -- 26 -- 36 --)
    pslld       xmm5, WORD_BIT          ; xmm5=(-- 07 -- 17 -- 27 -- 37)
    por         xmm6, xmm7              ; xmm6=(00 01 10 11 20 21 30 31)
    por         xmm0, xmm5              ; xmm0=(06 07 16 17 26 27 36 37)

    movaps      xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3

    addps       xmm4, xmm2              ; xmm4=tmp4
    movaps      xmm7, xmm1
    movaps      xmm5, xmm3
    addps       xmm1, xmm2              ; xmm1=data2=(02 12 22 32)
    addps       xmm3, xmm4              ; xmm3=data4=(04 14 24 34)
    subps       xmm7, xmm2              ; xmm7=data5=(05 15 25 35)
    subps       xmm5, xmm4              ; xmm5=data3=(03 13 23 33)

    movaps      xmm2, [rel PD_RNDINT_MAGIC]  ; xmm2=[rel PD_RNDINT_MAGIC]
    pcmpeqd     xmm4, xmm4
    psrld       xmm4, WORD_BIT          ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}

    addps       xmm3, xmm2              ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
    addps       xmm7, xmm2              ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
    addps       xmm1, xmm2              ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
    addps       xmm5, xmm2              ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)

    pand        xmm3, xmm4              ; xmm3=(04 -- 14 -- 24 -- 34 --)
    pslld       xmm7, WORD_BIT          ; xmm7=(-- 05 -- 15 -- 25 -- 35)
    pand        xmm1, xmm4              ; xmm1=(02 -- 12 -- 22 -- 32 --)
    pslld       xmm5, WORD_BIT          ; xmm5=(-- 03 -- 13 -- 23 -- 33)
    por         xmm3, xmm7              ; xmm3=(04 05 14 15 24 25 34 35)
    por         xmm1, xmm5              ; xmm1=(02 03 12 13 22 23 32 33)

    movdqa      xmm2, [rel PB_CENTERJSAMP]  ; xmm2=[rel PB_CENTERJSAMP]

    ; packsswb saturates each word to a signed byte; paddb then adds
    ; CENTERJSAMPLE to every byte.
    packsswb    xmm6, xmm3        ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
    packsswb    xmm1, xmm0        ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
    paddb       xmm6, xmm2
    paddb       xmm1, xmm2

    movdqa      xmm4, xmm6        ; transpose coefficients(phase 2)
    punpcklwd   xmm6, xmm1        ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
    punpckhwd   xmm4, xmm1        ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)

    movdqa      xmm7, xmm6        ; transpose coefficients(phase 3)
    punpckldq   xmm6, xmm4        ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
    punpckhdq   xmm7, xmm4        ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)

    ; Swap the 64-bit halves so rows 1 and 3 sit in the low quadword, where
    ; movq can store them.
    pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
    pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)

    ; Store eight samples to each of four consecutive output rows at column
    ; offset rax.
    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
    mov         rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
    movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
    mov         rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
    mov         rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
    movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3

    add         rsi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
    add         rdi, byte 4*SIZEOF_JSAMPROW
    dec         rcx                            ; ctr
    jnz         near .rowloop

    pop         rbx
    uncollect_args 4
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- value saved at [original_rbp]
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32