;
; jfdctint.asm - accurate integer FDCT (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2020, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a slower but more accurate integer implementation of the
; forward DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jfdctint.c; see the jfdctint.c for
; more details.

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
;
; Fixed-point parameters.
;
; CONST_BITS  - number of fractional bits in the F_* multipliers below.
; PASS1_BITS  - extra fractional bits carried in the output of pass 1 and
;               removed again by pass 2.
; DESCALE_P1  - right shift applied to the 32-bit products in pass 1.
; DESCALE_P2  - right shift applied to the 32-bit products in pass 2.

%define CONST_BITS  13
%define PASS1_BITS  2

%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
%define DESCALE_P2  (CONST_BITS + PASS1_BITS)

; F_a_bcd is FIX(a.bcd...), i.e. the constant multiplied by 2^CONST_BITS and
; rounded to the nearest integer.  The CONST_BITS == 13 values are written
; out literally; for any other CONST_BITS they are derived from the same
; constants pre-scaled by 2^30.

%if CONST_BITS == 13
F_0_298 equ  2446                       ; FIX(0.298631336)
F_0_390 equ  3196                       ; FIX(0.390180644)
F_0_541 equ  4433                       ; FIX(0.541196100)
F_0_765 equ  6270                       ; FIX(0.765366865)
F_0_899 equ  7373                       ; FIX(0.899976223)
F_1_175 equ  9633                       ; FIX(1.175875602)
F_1_501 equ 12299                       ; FIX(1.501321110)
F_1_847 equ 15137                       ; FIX(1.847759065)
F_1_961 equ 16069                       ; FIX(1.961570560)
F_2_053 equ 16819                       ; FIX(2.053119869)
F_2_562 equ 20995                       ; FIX(2.562915447)
F_3_072 equ 25172                       ; FIX(3.072711026)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
; DESCALE(x, n) is a rounding right shift: (x + 2^(n-1)) >> n.
%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
%endif

; --------------------------------------------------------------------------
;
; Constant table used by jsimd_fdct_islow_mmx.
;
; Each PW_* entry is "times 2 dw a, b": four 16-bit words (a b a b), one
; MMX qword.  The function feeds these to pmaddwd together with two
; word-interleaved inputs (x y x y), which yields x*a + y*b in each dword.
;
; The names appear to encode the approximate multiplier values, with "MF"
; marking a negative one, e.g.
;   F130  =  0.541 + 0.765 ~=  1.30      MF130 = 0.541 - 1.847 ~= -1.30
;   MF078 =  1.175 - 1.961 ~= -0.78      F078  = 1.175 - 0.390 ~=  0.78
;   MF060 =  0.298 - 0.899 ~= -0.60      F060  = 1.501 - 0.899 ~=  0.60
;   MF050 =  2.053 - 2.562 ~= -0.50      F050  = 3.072 - 2.562 ~=  0.50
;
; PD_DESCALE_Pn  - two dwords of 2^(DESCALE_Pn - 1): the rounding term added
;                  before "psrad ..., DESCALE_Pn".
; PW_DESCALE_P2X - four words of 2^(PASS1_BITS - 1): the rounding term added
;                  before "psraw ..., PASS1_BITS" in pass 2.

    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_fdct_islow_mmx)

EXTN(jconst_fdct_islow_mmx):

PW_F130_F054    times 2 dw  (F_0_541 + F_0_765),  F_0_541
PW_F054_MF130   times 2 dw  F_0_541, (F_0_541 - F_1_847)
PW_MF078_F117   times 2 dw  (F_1_175 - F_1_961),  F_1_175
PW_F117_F078    times 2 dw  F_1_175, (F_1_175 - F_0_390)
PW_MF060_MF089  times 2 dw  (F_0_298 - F_0_899), -F_0_899
PW_MF089_F060   times 2 dw -F_0_899, (F_1_501 - F_0_899)
PW_MF050_MF256  times 2 dw  (F_2_053 - F_2_562), -F_2_562
PW_MF256_F050   times 2 dw -F_2_562, (F_3_072 - F_2_562)
PD_DESCALE_P1   times 2 dd  1 << (DESCALE_P1 - 1)
PD_DESCALE_P2   times 2 dd  1 << (DESCALE_P2 - 1)
PW_DESCALE_P2X  times 4 dw  1 << (PASS1_BITS - 1)

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jsimd_fdct_islow_mmx(DCTELEM *data)
;
; In:     data = pointer to the coefficient block, passed on the stack and
;                read from [frame + 8] (32-bit code).
; Out:    the block at *data is transformed in place; no return value.
; Uses:   eax, ecx, edx, mm0-mm7 and flags are modified.  ebx is saved and
;         restored through pushpic/poppic and holds the GOT base used by
;         GOTOFF().  emms is executed before returning.
; Stack:  the frame is re-aligned to SIZEOF_MMWORD and WK_NUM mmword work
;         slots (wk(0), wk(1)) are reserved below the aligned ebp.
;
; The transform runs as two passes, each a loop of DCTSIZE/4 iterations that
; handles four rows (pass 1) or four columns (pass 2) at a time:
;   - Pass 1 leaves its results scaled up by 2^PASS1_BITS (psllw for
;     data0/data4, a shift of only DESCALE_P1 for the rest).
;   - Pass 2 removes that scaling again (rounded psraw by PASS1_BITS for
;     data0/data4, a shift of DESCALE_P2 for the rest).
; Per the register-layout comments below, each pass transposes its 4x8
; input in registers and stores its results without transposing back.
;

%define data(b)       (b) + 8           ; DCTELEM *data

%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
                                        ; mmword wk[WK_NUM]
%define WK_NUM        2

    align       32
    GLOBAL_FUNCTION(jsimd_fdct_islow_mmx)

EXTN(jsimd_fdct_islow_mmx):
    push        ebp
    mov         eax, esp                    ; eax = original ebp
                                            ; (unaligned frame pointer:
                                            ;  [eax]=saved ebp, [eax+8]=data)
    sub         esp, byte 4                 ; room to save eax
    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
    mov         [esp], eax                  ; [original_ebp] = unaligned frame
    mov         ebp, esp                    ; ebp = aligned ebp
    lea         esp, [wk(0)]                ; reserve wk[0..WK_NUM-1]
    pushpic     ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
;   push        esi                     ; unused
;   push        edi                     ; unused

    get_GOT     ebx                     ; get GOT address

    ; NOTE(review): eax is read again at the start of pass 2, so it must
    ; survive pass 1.  No instruction visible here writes it; pushpic and
    ; get_GOT are macros from jsimdext.inc and presumably leave it intact.

    ; ---- Pass 1: process rows.

    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
    mov         ecx, DCTSIZE/4
    alignx      16, 7
.rowloop:

    movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
    movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
    movq        mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
    movq        mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]

    ; mm0=(20 21 22 23), mm2=(24 25 26 27)
    ; mm1=(30 31 32 33), mm3=(34 35 36 37)

    movq        mm4, mm0                ; transpose coefficients(phase 1)
    punpcklwd   mm0, mm1                ; mm0=(20 30 21 31)
    punpckhwd   mm4, mm1                ; mm4=(22 32 23 33)
    movq        mm5, mm2                ; transpose coefficients(phase 1)
    punpcklwd   mm2, mm3                ; mm2=(24 34 25 35)
    punpckhwd   mm5, mm3                ; mm5=(26 36 27 37)

    movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
    movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
    movq        mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
    movq        mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]

    ; mm6=(00 01 02 03), mm1=(04 05 06 07)
    ; mm7=(10 11 12 13), mm3=(14 15 16 17)

    ; All eight MMX registers are live; spill two of them to the work area.
    movq        MMWORD [wk(0)], mm4     ; wk(0)=(22 32 23 33)
    movq        MMWORD [wk(1)], mm2     ; wk(1)=(24 34 25 35)

    movq        mm4, mm6                ; transpose coefficients(phase 1)
    punpcklwd   mm6, mm7                ; mm6=(00 10 01 11)
    punpckhwd   mm4, mm7                ; mm4=(02 12 03 13)
    movq        mm2, mm1                ; transpose coefficients(phase 1)
    punpcklwd   mm1, mm3                ; mm1=(04 14 05 15)
    punpckhwd   mm2, mm3                ; mm2=(06 16 07 17)

    movq        mm7, mm6                ; transpose coefficients(phase 2)
    punpckldq   mm6, mm0                ; mm6=(00 10 20 30)=data0
    punpckhdq   mm7, mm0                ; mm7=(01 11 21 31)=data1
    movq        mm3, mm2                ; transpose coefficients(phase 2)
    punpckldq   mm2, mm5                ; mm2=(06 16 26 36)=data6
    punpckhdq   mm3, mm5                ; mm3=(07 17 27 37)=data7

    movq        mm0, mm7
    movq        mm5, mm6
    psubw       mm7, mm2                ; mm7=data1-data6=tmp6
    psubw       mm6, mm3                ; mm6=data0-data7=tmp7
    paddw       mm0, mm2                ; mm0=data1+data6=tmp1
    paddw       mm5, mm3                ; mm5=data0+data7=tmp0

    movq        mm2, MMWORD [wk(0)]     ; mm2=(22 32 23 33)
    movq        mm3, MMWORD [wk(1)]     ; mm3=(24 34 25 35)
    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp6
    movq        MMWORD [wk(1)], mm6     ; wk(1)=tmp7

    movq        mm7, mm4                ; transpose coefficients(phase 2)
    punpckldq   mm4, mm2                ; mm4=(02 12 22 32)=data2
    punpckhdq   mm7, mm2                ; mm7=(03 13 23 33)=data3
    movq        mm6, mm1                ; transpose coefficients(phase 2)
    punpckldq   mm1, mm3                ; mm1=(04 14 24 34)=data4
    punpckhdq   mm6, mm3                ; mm6=(05 15 25 35)=data5

    movq        mm2, mm7
    movq        mm3, mm4
    paddw       mm7, mm1                ; mm7=data3+data4=tmp3
    paddw       mm4, mm6                ; mm4=data2+data5=tmp2
    psubw       mm2, mm1                ; mm2=data3-data4=tmp4
    psubw       mm3, mm6                ; mm3=data2-data5=tmp5

    ; -- Even part

    movq        mm1, mm5
    movq        mm6, mm0
    paddw       mm5, mm7                ; mm5=tmp10
    paddw       mm0, mm4                ; mm0=tmp11
    psubw       mm1, mm7                ; mm1=tmp13
    psubw       mm6, mm4                ; mm6=tmp12

    movq        mm7, mm5
    paddw       mm5, mm0                ; mm5=tmp10+tmp11
    psubw       mm7, mm0                ; mm7=tmp10-tmp11

    ; Scale up by 2^PASS1_BITS so these match the other pass-1 outputs.
    psllw       mm5, PASS1_BITS         ; mm5=data0
    psllw       mm7, PASS1_BITS         ; mm7=data4

    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
    movq        MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7

    ; (Original)
    ; z1 = (tmp12 + tmp13) * 0.541196100;
    ; data2 = z1 + tmp13 * 0.765366865;
    ; data6 = z1 + tmp12 * -1.847759065;
    ;
    ; (This implementation)
    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);

    ; Interleave (tmp13 tmp12) word pairs so that one pmaddwd per output
    ; computes both products and their sum as 32-bit values.
    movq        mm4, mm1                ; mm1=tmp13
    movq        mm0, mm1
    punpcklwd   mm4, mm6                ; mm6=tmp12
    punpckhwd   mm0, mm6
    movq        mm1, mm4
    movq        mm6, mm0
    pmaddwd     mm4, [GOTOFF(ebx,PW_F130_F054)]   ; mm4=data2L
    pmaddwd     mm0, [GOTOFF(ebx,PW_F130_F054)]   ; mm0=data2H
    pmaddwd     mm1, [GOTOFF(ebx,PW_F054_MF130)]  ; mm1=data6L
    pmaddwd     mm6, [GOTOFF(ebx,PW_F054_MF130)]  ; mm6=data6H

    ; Round and shift right by DESCALE_P1, then pack back to words.
    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
    paddd       mm0, [GOTOFF(ebx,PD_DESCALE_P1)]
    psrad       mm4, DESCALE_P1
    psrad       mm0, DESCALE_P1
    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
    paddd       mm6, [GOTOFF(ebx,PD_DESCALE_P1)]
    psrad       mm1, DESCALE_P1
    psrad       mm6, DESCALE_P1

    packssdw    mm4, mm0                ; mm4=data2
    packssdw    mm1, mm6                ; mm1=data6

    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
    movq        MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1

    ; -- Odd part

    movq        mm5, MMWORD [wk(0)]     ; mm5=tmp6
    movq        mm7, MMWORD [wk(1)]     ; mm7=tmp7

    movq        mm0, mm2                ; mm2=tmp4
    movq        mm6, mm3                ; mm3=tmp5
    paddw       mm0, mm5                ; mm0=z3
    paddw       mm6, mm7                ; mm6=z4

    ; (Original)
    ; z5 = (z3 + z4) * 1.175875602;
    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
    ; z3 += z5;  z4 += z5;
    ;
    ; (This implementation)
    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
    ;
    ; (Both right-hand sides use the z3/z4 values from before this step.)

    movq        mm4, mm0
    movq        mm1, mm0
    punpcklwd   mm4, mm6
    punpckhwd   mm1, mm6
    movq        mm0, mm4
    movq        mm6, mm1
    pmaddwd     mm4, [GOTOFF(ebx,PW_MF078_F117)]  ; mm4=z3L
    pmaddwd     mm1, [GOTOFF(ebx,PW_MF078_F117)]  ; mm1=z3H
    pmaddwd     mm0, [GOTOFF(ebx,PW_F117_F078)]   ; mm0=z4L
    pmaddwd     mm6, [GOTOFF(ebx,PW_F117_F078)]   ; mm6=z4H

    movq        MMWORD [wk(0)], mm4     ; wk(0)=z3L
    movq        MMWORD [wk(1)], mm1     ; wk(1)=z3H

    ; (Original)
    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
    ;
    ; (This implementation)
    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;

    movq        mm4, mm2
    movq        mm1, mm2
    punpcklwd   mm4, mm7
    punpckhwd   mm1, mm7
    movq        mm2, mm4
    movq        mm7, mm1
    pmaddwd     mm4, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm4=tmp4L
    pmaddwd     mm1, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm1=tmp4H
    pmaddwd     mm2, [GOTOFF(ebx,PW_MF089_F060)]   ; mm2=tmp7L
    pmaddwd     mm7, [GOTOFF(ebx,PW_MF089_F060)]   ; mm7=tmp7H

    paddd       mm4, MMWORD [wk(0)]     ; mm4=data7L
    paddd       mm1, MMWORD [wk(1)]     ; mm1=data7H
    paddd       mm2, mm0                ; mm2=data1L
    paddd       mm7, mm6                ; mm7=data1H

    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
    psrad       mm4, DESCALE_P1
    psrad       mm1, DESCALE_P1
    paddd       mm2, [GOTOFF(ebx,PD_DESCALE_P1)]
    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
    psrad       mm2, DESCALE_P1
    psrad       mm7, DESCALE_P1

    packssdw    mm4, mm1                ; mm4=data7
    packssdw    mm2, mm7                ; mm2=data1

    movq        MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2

    movq        mm1, mm3
    movq        mm7, mm3
    punpcklwd   mm1, mm5
    punpckhwd   mm7, mm5
    movq        mm3, mm1
    movq        mm5, mm7
    pmaddwd     mm1, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm1=tmp5L
    pmaddwd     mm7, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm7=tmp5H
    pmaddwd     mm3, [GOTOFF(ebx,PW_MF256_F050)]   ; mm3=tmp6L
    pmaddwd     mm5, [GOTOFF(ebx,PW_MF256_F050)]   ; mm5=tmp6H

    paddd       mm1, mm0                ; mm1=data5L
    paddd       mm7, mm6                ; mm7=data5H
    paddd       mm3, MMWORD [wk(0)]     ; mm3=data3L
    paddd       mm5, MMWORD [wk(1)]     ; mm5=data3H

    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
    psrad       mm1, DESCALE_P1
    psrad       mm7, DESCALE_P1
    paddd       mm3, [GOTOFF(ebx,PD_DESCALE_P1)]
    paddd       mm5, [GOTOFF(ebx,PD_DESCALE_P1)]
    psrad       mm3, DESCALE_P1
    psrad       mm5, DESCALE_P1

    packssdw    mm1, mm7                ; mm1=data5
    packssdw    mm3, mm5                ; mm3=data3

    movq        MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3

    add         edx, byte 4*DCTSIZE*SIZEOF_DCTELEM  ; advance four rows
    dec         ecx
    jnz         near .rowloop

    ; ---- Pass 2: process columns.

    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
    mov         ecx, DCTSIZE/4
    alignx      16, 7
.columnloop:

    movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
    movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
    movq        mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
    movq        mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]

    ; mm0=(02 12 22 32), mm2=(42 52 62 72)
    ; mm1=(03 13 23 33), mm3=(43 53 63 73)

    movq        mm4, mm0                ; transpose coefficients(phase 1)
    punpcklwd   mm0, mm1                ; mm0=(02 03 12 13)
    punpckhwd   mm4, mm1                ; mm4=(22 23 32 33)
    movq        mm5, mm2                ; transpose coefficients(phase 1)
    punpcklwd   mm2, mm3                ; mm2=(42 43 52 53)
    punpckhwd   mm5, mm3                ; mm5=(62 63 72 73)

    movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
    movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
    movq        mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
    movq        mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]

    ; mm6=(00 10 20 30), mm1=(40 50 60 70)
    ; mm7=(01 11 21 31), mm3=(41 51 61 71)

    movq        MMWORD [wk(0)], mm4     ; wk(0)=(22 23 32 33)
    movq        MMWORD [wk(1)], mm2     ; wk(1)=(42 43 52 53)

    movq        mm4, mm6                ; transpose coefficients(phase 1)
    punpcklwd   mm6, mm7                ; mm6=(00 01 10 11)
    punpckhwd   mm4, mm7                ; mm4=(20 21 30 31)
    movq        mm2, mm1                ; transpose coefficients(phase 1)
    punpcklwd   mm1, mm3                ; mm1=(40 41 50 51)
    punpckhwd   mm2, mm3                ; mm2=(60 61 70 71)

    movq        mm7, mm6                ; transpose coefficients(phase 2)
    punpckldq   mm6, mm0                ; mm6=(00 01 02 03)=data0
    punpckhdq   mm7, mm0                ; mm7=(10 11 12 13)=data1
    movq        mm3, mm2                ; transpose coefficients(phase 2)
    punpckldq   mm2, mm5                ; mm2=(60 61 62 63)=data6
    punpckhdq   mm3, mm5                ; mm3=(70 71 72 73)=data7

    movq        mm0, mm7
    movq        mm5, mm6
    psubw       mm7, mm2                ; mm7=data1-data6=tmp6
    psubw       mm6, mm3                ; mm6=data0-data7=tmp7
    paddw       mm0, mm2                ; mm0=data1+data6=tmp1
    paddw       mm5, mm3                ; mm5=data0+data7=tmp0

    movq        mm2, MMWORD [wk(0)]     ; mm2=(22 23 32 33)
    movq        mm3, MMWORD [wk(1)]     ; mm3=(42 43 52 53)
    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp6
    movq        MMWORD [wk(1)], mm6     ; wk(1)=tmp7

    movq        mm7, mm4                ; transpose coefficients(phase 2)
    punpckldq   mm4, mm2                ; mm4=(20 21 22 23)=data2
    punpckhdq   mm7, mm2                ; mm7=(30 31 32 33)=data3
    movq        mm6, mm1                ; transpose coefficients(phase 2)
    punpckldq   mm1, mm3                ; mm1=(40 41 42 43)=data4
    punpckhdq   mm6, mm3                ; mm6=(50 51 52 53)=data5

    movq        mm2, mm7
    movq        mm3, mm4
    paddw       mm7, mm1                ; mm7=data3+data4=tmp3
    paddw       mm4, mm6                ; mm4=data2+data5=tmp2
    psubw       mm2, mm1                ; mm2=data3-data4=tmp4
    psubw       mm3, mm6                ; mm3=data2-data5=tmp5

    ; -- Even part

    movq        mm1, mm5
    movq        mm6, mm0
    paddw       mm5, mm7                ; mm5=tmp10
    paddw       mm0, mm4                ; mm0=tmp11
    psubw       mm1, mm7                ; mm1=tmp13
    psubw       mm6, mm4                ; mm6=tmp12

    movq        mm7, mm5
    paddw       mm5, mm0                ; mm5=tmp10+tmp11
    psubw       mm7, mm0                ; mm7=tmp10-tmp11

    ; Remove the 2^PASS1_BITS scaling left by pass 1, with rounding.
    paddw       mm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
    paddw       mm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
    psraw       mm5, PASS1_BITS         ; mm5=data0
    psraw       mm7, PASS1_BITS         ; mm7=data4

    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
    movq        MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7

    ; (Original)
    ; z1 = (tmp12 + tmp13) * 0.541196100;
    ; data2 = z1 + tmp13 * 0.765366865;
    ; data6 = z1 + tmp12 * -1.847759065;
    ;
    ; (This implementation)
    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);

    movq        mm4, mm1                ; mm1=tmp13
    movq        mm0, mm1
    punpcklwd   mm4, mm6                ; mm6=tmp12
    punpckhwd   mm0, mm6
    movq        mm1, mm4
    movq        mm6, mm0
    pmaddwd     mm4, [GOTOFF(ebx,PW_F130_F054)]   ; mm4=data2L
    pmaddwd     mm0, [GOTOFF(ebx,PW_F130_F054)]   ; mm0=data2H
    pmaddwd     mm1, [GOTOFF(ebx,PW_F054_MF130)]  ; mm1=data6L
    pmaddwd     mm6, [GOTOFF(ebx,PW_F054_MF130)]  ; mm6=data6H

    ; Round and shift right by DESCALE_P2, then pack back to words.
    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
    paddd       mm0, [GOTOFF(ebx,PD_DESCALE_P2)]
    psrad       mm4, DESCALE_P2
    psrad       mm0, DESCALE_P2
    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
    paddd       mm6, [GOTOFF(ebx,PD_DESCALE_P2)]
    psrad       mm1, DESCALE_P2
    psrad       mm6, DESCALE_P2

    packssdw    mm4, mm0                ; mm4=data2
    packssdw    mm1, mm6                ; mm1=data6

    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
    movq        MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1

    ; -- Odd part

    movq        mm5, MMWORD [wk(0)]     ; mm5=tmp6
    movq        mm7, MMWORD [wk(1)]     ; mm7=tmp7

    movq        mm0, mm2                ; mm2=tmp4
    movq        mm6, mm3                ; mm3=tmp5
    paddw       mm0, mm5                ; mm0=z3
    paddw       mm6, mm7                ; mm6=z4

    ; (Original)
    ; z5 = (z3 + z4) * 1.175875602;
    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
    ; z3 += z5;  z4 += z5;
    ;
    ; (This implementation)
    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
    ;
    ; (Both right-hand sides use the z3/z4 values from before this step.)

    movq        mm4, mm0
    movq        mm1, mm0
    punpcklwd   mm4, mm6
    punpckhwd   mm1, mm6
    movq        mm0, mm4
    movq        mm6, mm1
    pmaddwd     mm4, [GOTOFF(ebx,PW_MF078_F117)]  ; mm4=z3L
    pmaddwd     mm1, [GOTOFF(ebx,PW_MF078_F117)]  ; mm1=z3H
    pmaddwd     mm0, [GOTOFF(ebx,PW_F117_F078)]   ; mm0=z4L
    pmaddwd     mm6, [GOTOFF(ebx,PW_F117_F078)]   ; mm6=z4H

    movq        MMWORD [wk(0)], mm4     ; wk(0)=z3L
    movq        MMWORD [wk(1)], mm1     ; wk(1)=z3H

    ; (Original)
    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
    ;
    ; (This implementation)
    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;

    movq        mm4, mm2
    movq        mm1, mm2
    punpcklwd   mm4, mm7
    punpckhwd   mm1, mm7
    movq        mm2, mm4
    movq        mm7, mm1
    pmaddwd     mm4, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm4=tmp4L
    pmaddwd     mm1, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm1=tmp4H
    pmaddwd     mm2, [GOTOFF(ebx,PW_MF089_F060)]   ; mm2=tmp7L
    pmaddwd     mm7, [GOTOFF(ebx,PW_MF089_F060)]   ; mm7=tmp7H

    paddd       mm4, MMWORD [wk(0)]     ; mm4=data7L
    paddd       mm1, MMWORD [wk(1)]     ; mm1=data7H
    paddd       mm2, mm0                ; mm2=data1L
    paddd       mm7, mm6                ; mm7=data1H

    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
    psrad       mm4, DESCALE_P2
    psrad       mm1, DESCALE_P2
    paddd       mm2, [GOTOFF(ebx,PD_DESCALE_P2)]
    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
    psrad       mm2, DESCALE_P2
    psrad       mm7, DESCALE_P2

    packssdw    mm4, mm1                ; mm4=data7
    packssdw    mm2, mm7                ; mm2=data1

    movq        MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2

    movq        mm1, mm3
    movq        mm7, mm3
    punpcklwd   mm1, mm5
    punpckhwd   mm7, mm5
    movq        mm3, mm1
    movq        mm5, mm7
    pmaddwd     mm1, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm1=tmp5L
    pmaddwd     mm7, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm7=tmp5H
    pmaddwd     mm3, [GOTOFF(ebx,PW_MF256_F050)]   ; mm3=tmp6L
    pmaddwd     mm5, [GOTOFF(ebx,PW_MF256_F050)]   ; mm5=tmp6H

    paddd       mm1, mm0                ; mm1=data5L
    paddd       mm7, mm6                ; mm7=data5H
    paddd       mm3, MMWORD [wk(0)]     ; mm3=data3L
    paddd       mm5, MMWORD [wk(1)]     ; mm5=data3H

    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
    psrad       mm1, DESCALE_P2
    psrad       mm7, DESCALE_P2
    paddd       mm3, [GOTOFF(ebx,PD_DESCALE_P2)]
    paddd       mm5, [GOTOFF(ebx,PD_DESCALE_P2)]
    psrad       mm3, DESCALE_P2
    psrad       mm5, DESCALE_P2

    packssdw    mm1, mm7                ; mm1=data5
    packssdw    mm3, mm5                ; mm3=data3

    movq        MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3

    add         edx, byte 4*SIZEOF_DCTELEM  ; advance four columns
    dec         ecx
    jnz         near .columnloop

    emms                                ; empty MMX state

;   pop         edi                     ; unused
;   pop         esi                     ; unused
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    poppic      ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
                                        ; (value stored at [ebp] on entry)
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
