;
; jidctflt.asm - floating-point IDCT (3DNow! & MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see the jidctflt.c for more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Read-only constants used by jsimd_idct_float_3dnow.
;
; Every PD_* entry holds the same single-precision value twice, so a single
; 64-bit (MMX-register-wide) memory operand supplies that value to both
; float lanes of a 3DNow! instruction.

    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_idct_float_3dnow)

EXTN(jconst_idct_float_3dnow):

; IDCT multipliers (see the pfmul instructions in both passes).
PD_1_414        dd 1.414213562373095048801689, 1.414213562373095048801689
PD_1_847        dd 1.847759065022573512256366, 1.847759065022573512256366
PD_1_082        dd 1.082392200292393968799446, 1.082392200292393968799446
PD_2_613        dd 2.613125929752753055713286, 2.613125929752753055713286

; Rounding bias, (float)(0x00C00000 << 3).  Pass 2 adds it to each result
; and then keeps only the low 16 bits of every dword as roundint(data/8).
PD_RNDINT_MAGIC dd 100663296.0, 100663296.0

; CENTERJSAMPLE replicated into all eight bytes; added with paddb to the
; packed samples just before they are written out.
PB_CENTERJSAMP  times 8 db CENTERJSAMPLE

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_float_3dnow(void *dct_table, JCOEFPTR coef_block,
;                        JSAMPARRAY output_buf, JDIMENSION output_col)
;
; All four arguments are read from the caller's stack frame (offsets 8..20
; from the saved-ebp slot).  ebx, esi and edi are saved and restored;
; eax, ecx and edx are used as scratch and are not preserved.
;
; Stack layout after the prologue (ebp is aligned to SIZEOF_MMWORD):
;   [ebp]            pointer to the caller-frame base (the saved-ebp slot)
;   wk(0), wk(1)     two mmword temporaries just below ebp
;   workspace        DCTSIZE2 FAST_FLOATs below wk(0); holds the pass-1 output
;
; Pass 1 handles two columns per iteration and writes them, transposed,
; to the workspace.  Pass 2 handles two output rows per iteration, rounds
; and packs the results to samples and stores them at
; output_buf[row] + output_col.  The routine ends with femms.
;

; Argument addresses relative to the caller-frame base (b).
%define dct_table(b)   (b) + 8
%define coef_block(b)  (b) + 12
%define output_buf(b)  (b) + 16
%define output_col(b)  (b) + 20

; Local frame: mmword wk[WK_NUM] followed (downwards) by
; FAST_FLOAT workspace[DCTSIZE2].
%define WK_NUM         2
%define original_ebp   ebp + 0
%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
%define workspace      wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT

; --------------------------------------------------------------------------
; Private helper macros.  Each one expands to exactly the instruction
; sequence that both passes share; none of them defines a label.

; FLT3DN_DEQUANT2 regA, regB, rowA, rowB
;   regA/regB each hold two packed 16-bit coefficients in their low dword.
;   Sign-extends them to 32 bits, converts to float and multiplies by the
;   quantization multipliers of DCT rows rowA/rowB (table pointer in edx).
%macro FLT3DN_DEQUANT2 4
    punpcklwd   %1, %1
    punpcklwd   %2, %2
    psrad       %1, (DWORD_BIT-WORD_BIT)
    psrad       %2, (DWORD_BIT-WORD_BIT)
    pi2fd       %1, %1
    pi2fd       %2, %2

    pfmul       %1, MMWORD [MMBLOCK(%3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    pfmul       %2, MMWORD [MMBLOCK(%4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
%endmacro

; FLT3DN_EVEN_PART
;   In:  mm0..mm3 = inputs 0, 2, 4, 6.
;   Out: mm6 = tmp0, mm7 = tmp1, wk(0) = tmp2, wk(1) = tmp3.
;   Clobbers mm0, mm1, mm4, mm5.
%macro FLT3DN_EVEN_PART 0
    movq        mm4, mm0
    movq        mm5, mm1
    pfsub       mm0, mm2                ; mm0=tmp11
    pfsub       mm1, mm3
    pfadd       mm4, mm2                ; mm4=tmp10
    pfadd       mm5, mm3                ; mm5=tmp13

    pfmul       mm1, [GOTOFF(ebx,PD_1_414)]
    pfsub       mm1, mm5                ; mm1=tmp12

    movq        mm6, mm4
    movq        mm7, mm0
    pfsub       mm4, mm5                ; mm4=tmp3
    pfsub       mm0, mm1                ; mm0=tmp2
    pfadd       mm6, mm5                ; mm6=tmp0
    pfadd       mm7, mm1                ; mm7=tmp1

    movq        MMWORD [wk(1)], mm4     ; spill tmp3
    movq        MMWORD [wk(0)], mm0     ; spill tmp2
%endmacro

; FLT3DN_ODD_PART
;   In:  mm2, mm3, mm5, mm1 = inputs 1, 3, 5, 7.
;   Out: mm1 = tmp7, mm2 = tmp11, mm3 = tmp12, mm4 = tmp10.
;   Clobbers mm0, mm5.  mm6/mm7 are left untouched.
%macro FLT3DN_ODD_PART 0
    movq        mm4, mm2
    movq        mm0, mm5
    pfadd       mm2, mm1                ; mm2=z11
    pfadd       mm5, mm3                ; mm5=z13
    pfsub       mm4, mm1                ; mm4=z12
    pfsub       mm0, mm3                ; mm0=z10

    movq        mm1, mm2
    pfsub       mm2, mm5
    pfadd       mm1, mm5                ; mm1=tmp7

    pfmul       mm2, [GOTOFF(ebx,PD_1_414)]  ; mm2=tmp11

    movq        mm3, mm0
    pfadd       mm0, mm4
    pfmul       mm0, [GOTOFF(ebx,PD_1_847)]  ; mm0=z5
    pfmul       mm3, [GOTOFF(ebx,PD_2_613)]  ; mm3=(z10 * 2.613125930)
    pfmul       mm4, [GOTOFF(ebx,PD_1_082)]  ; mm4=(z12 * 1.082392200)
    pfsubr      mm3, mm0                ; mm3=tmp12 (z5 - mm3)
    pfsub       mm4, mm0                ; mm4=tmp10
%endmacro

; FLT3DN_OUT_0167
;   First half of the final output stage.
;   In:  mm6 = tmp0, mm7 = tmp1, and the FLT3DN_ODD_PART outputs.
;   Out: mm6 = data0, mm7 = data1, mm0 = data6, mm5 = data7, mm2 = tmp5.
;   mm4 (tmp10) is preserved; mm1/mm3 hold tmp7/tmp6 afterwards.
%macro FLT3DN_OUT_0167 0
    pfsub       mm3, mm1                ; mm3=tmp6
    movq        mm5, mm6
    movq        mm0, mm7
    pfadd       mm6, mm1                ; mm6=data0
    pfadd       mm7, mm3                ; mm7=data1
    pfsub       mm5, mm1                ; mm5=data7
    pfsub       mm0, mm3                ; mm0=data6
    pfsub       mm2, mm3                ; mm2=tmp5
%endmacro

; FLT3DN_ROUND_MERGE evenA, oddA, evenB, oddB, magic, mask
;   Adds PD_RNDINT_MAGIC to the four float pairs so that the low word of
;   each dword becomes roundint(data/8), then interleaves the words:
;   evenA = (evenA.lo oddA.lo ...) and evenB = (evenB.lo oddB.lo ...).
;   magic and mask name two scratch MMX registers that get overwritten.
%macro FLT3DN_ROUND_MERGE 6
    movq        %5, [GOTOFF(ebx,PD_RNDINT_MAGIC)]
    pcmpeqd     %6, %6
    psrld       %6, WORD_BIT            ; mask={0xFFFF 0x0000 0xFFFF 0x0000}

    pfadd       %1, %5
    pfadd       %2, %5
    pfadd       %3, %5
    pfadd       %4, %5

    pand        %1, %6                  ; keep the low word of each dword
    pslld       %2, WORD_BIT            ; move the low word into the high word
    pand        %3, %6
    pslld       %4, WORD_BIT
    por         %1, %2
    por         %3, %4
%endmacro

; --------------------------------------------------------------------------

    align       32
    GLOBAL_FUNCTION(jsimd_idct_float_3dnow)

EXTN(jsimd_idct_float_3dnow):
    ; Prologue: build an mmword-aligned frame and remember where the
    ; caller's frame is, since ebp no longer points at it.
    push        ebp
    mov         eax, esp                    ; eax = caller-frame base
    sub         esp, byte 4                 ; room for the saved frame base
    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
    mov         [esp], eax                  ; [original_ebp] = frame base
    mov         ebp, esp                    ; ebp = aligned frame pointer
    lea         esp, [workspace]            ; reserve wk[] and workspace[]
    push        ebx
    push        esi
    push        edi
    ; ecx and edx are deliberately not saved.

    get_GOT     ebx                     ; ebx = GOT address

    ; ---- Pass 1: process columns from input, store into work array.
    ;
    ; eax still holds the caller-frame base from the prologue, so it is not
    ; reloaded here (this relies on get_GOT leaving eax intact).

    mov         edx, POINTER [dct_table(eax)]    ; edx = quantptr
    mov         esi, JCOEFPTR [coef_block(eax)]  ; esi = inptr
    lea         edi, [workspace]                 ; edi = FAST_FLOAT *wsptr
    mov         ecx, DCTSIZE/2                   ; ecx = ctr (2 columns/iter)
    alignx      16, 7
.col_loop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
    ; Shortcut: if rows 1..7 of both columns are zero, only the DC term
    ; contributes.  eax (and, temporarily, ebx) collect the OR of the rows.
    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
    jnz         short .col_full_idct

    pushpic     ebx                     ; save GOT address
    mov         ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
    mov         eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
    or          ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
    or          eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
    or          ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
    or          eax, ebx                ; sets ZF for the jnz below
    poppic      ebx                     ; restore GOT address
    jnz         short .col_full_idct

    ; -- AC terms all zero: dequantize the two DC values and replicate them.

    movd        mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]

    punpcklwd   mm0, mm0
    psrad       mm0, (DWORD_BIT-WORD_BIT)   ; sign-extend words to dwords
    pi2fd       mm0, mm0

    pfmul       mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

    movq        mm1, mm0
    punpckldq   mm0, mm0                ; mm0 = first column's DC in both lanes
    punpckhdq   mm1, mm1                ; mm1 = second column's DC in both lanes

    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0
    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0
    movq        MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0
    movq        MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1
    movq        MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
    movq        MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
    jmp         near .col_next
    alignx      16, 7
%endif
.col_full_idct:

    ; -- Even part

    movd        mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
    movd        mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
    movd        mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
    movd        mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]

    FLT3DN_DEQUANT2 mm0, mm1, 0, 2

    FLT3DN_DEQUANT2 mm2, mm3, 4, 6

    FLT3DN_EVEN_PART                    ; mm6=tmp0 mm7=tmp1 wk(0)=tmp2 wk(1)=tmp3

    ; -- Odd part

    movd        mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movd        mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
    movd        mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
    movd        mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]

    FLT3DN_DEQUANT2 mm2, mm3, 1, 3

    FLT3DN_DEQUANT2 mm5, mm1, 5, 7

    FLT3DN_ODD_PART                     ; mm1=tmp7 mm2=tmp11 mm3=tmp12 mm4=tmp10

    ; -- Final output stage

    FLT3DN_OUT_0167                     ; mm6=data0=(00 01) mm7=data1=(10 11)
                                        ; mm5=data7=(70 71) mm0=data6=(60 61)
                                        ; mm2=tmp5

    movq        mm1, mm6                ; transpose coefficients
    punpckldq   mm6, mm7                ; mm6=(00 10)
    punpckhdq   mm1, mm7                ; mm1=(01 11)
    movq        mm3, mm0                ; transpose coefficients
    punpckldq   mm0, mm5                ; mm0=(60 70)
    punpckhdq   mm3, mm5                ; mm3=(61 71)

    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6
    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
    movq        MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
    movq        MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3

    movq        mm7, MMWORD [wk(0)]     ; mm7=tmp2
    movq        mm5, MMWORD [wk(1)]     ; mm5=tmp3

    pfadd       mm4, mm2                ; mm4=tmp4 (tmp10 + tmp5)
    movq        mm6, mm7
    movq        mm1, mm5
    pfadd       mm7, mm2                ; mm7=data2=(20 21)
    pfadd       mm5, mm4                ; mm5=data4=(40 41)
    pfsub       mm6, mm2                ; mm6=data5=(50 51)
    pfsub       mm1, mm4                ; mm1=data3=(30 31)

    movq        mm0, mm7                ; transpose coefficients
    punpckldq   mm7, mm1                ; mm7=(20 30)
    punpckhdq   mm0, mm1                ; mm0=(21 31)
    movq        mm3, mm5                ; transpose coefficients
    punpckldq   mm5, mm6                ; mm5=(40 50)
    punpckhdq   mm3, mm6                ; mm3=(41 51)

    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7
    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0
    movq        MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
    movq        MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3

.col_next:
    ; Advance by two columns of input/multipliers and two workspace rows.
    add         esi, byte 2*SIZEOF_JCOEF               ; coef_block
    add         edx, byte 2*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
    dec         ecx                                    ; ctr
    jnz         near .col_loop

    ; -- Prefetch the next coefficient block
    ;
    ; esi has advanced by 8 coefficients during pass 1, hence DCTSIZE2-8.

    prefetch    [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
    prefetch    [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
    prefetch    [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
    prefetch    [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

    ; ---- Pass 2: process rows from work array, store into output array.

    mov         eax, [original_ebp]                ; eax = caller-frame base
    lea         esi, [workspace]                   ; esi = FAST_FLOAT *wsptr
    mov         edi, JSAMPARRAY [output_buf(eax)]  ; edi = (JSAMPROW *)
    mov         eax, JDIMENSION [output_col(eax)]  ; eax = output_col
    mov         ecx, DCTSIZE/2                     ; ecx = ctr (2 rows/iter)
    alignx      16, 7
.row_loop:

    ; -- Even part

    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]

    FLT3DN_EVEN_PART                    ; mm6=tmp0 mm7=tmp1 wk(0)=tmp2 wk(1)=tmp3

    ; -- Odd part

    movq        mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
    movq        mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
    movq        mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
    movq        mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]

    FLT3DN_ODD_PART                     ; mm1=tmp7 mm2=tmp11 mm3=tmp12 mm4=tmp10

    ; -- Final output stage

    FLT3DN_OUT_0167                     ; mm6=data0=(00 10) mm7=data1=(01 11)
                                        ; mm5=data7=(07 17) mm0=data6=(06 16)
                                        ; mm2=tmp5

    ; Round data0/1/6/7 and merge: mm6=(00 01 10 11), mm0=(06 07 16 17).
    ; mm1 and mm3 are scratch here (magic constant and word mask).
    FLT3DN_ROUND_MERGE mm6, mm7, mm0, mm5, mm1, mm3

    movq        mm1, MMWORD [wk(0)]     ; mm1=tmp2
    movq        mm3, MMWORD [wk(1)]     ; mm3=tmp3

    pfadd       mm4, mm2                ; mm4=tmp4 (tmp10 + tmp5)
    movq        mm7, mm1
    movq        mm5, mm3
    pfadd       mm1, mm2                ; mm1=data2=(02 12)
    pfadd       mm3, mm4                ; mm3=data4=(04 14)
    pfsub       mm7, mm2                ; mm7=data5=(05 15)
    pfsub       mm5, mm4                ; mm5=data3=(03 13)

    ; Round data4/5/2/3 and merge: mm3=(04 05 14 15), mm1=(02 03 12 13).
    ; mm2 and mm4 are scratch here (magic constant and word mask).
    FLT3DN_ROUND_MERGE mm3, mm7, mm1, mm5, mm2, mm4

    movq        mm2, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm2=[PB_CENTERJSAMP]

    packsswb    mm6, mm3                ; mm6=(00 01 10 11 04 05 14 15)
    packsswb    mm1, mm0                ; mm1=(02 03 12 13 06 07 16 17)
    paddb       mm6, mm2                ; add CENTERJSAMPLE to every byte
    paddb       mm1, mm2

    movq        mm4, mm6                ; transpose coefficients(phase 2)
    punpcklwd   mm6, mm1                ; mm6=(00 01 02 03 10 11 12 13)
    punpckhwd   mm4, mm1                ; mm4=(04 05 06 07 14 15 16 17)

    movq        mm7, mm6                ; transpose coefficients(phase 3)
    punpckldq   mm6, mm4                ; mm6=(00 01 02 03 04 05 06 07)
    punpckhdq   mm7, mm4                ; mm7=(10 11 12 13 14 15 16 17)

    ; ebx is borrowed as the second row pointer while the rows are stored.
    pushpic     ebx                     ; save GOT address

    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
    mov         ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7

    poppic      ebx                     ; restore GOT address

    add         esi, byte 2*SIZEOF_FAST_FLOAT  ; wsptr
    add         edi, byte 2*SIZEOF_JSAMPROW    ; next pair of output rows
    dec         ecx                            ; ctr
    jnz         near .row_loop

    femms                               ; empty MMX/3DNow! state

    ; Epilogue: undo the prologue in reverse order.
    pop         edi
    pop         esi
    pop         ebx
    mov         esp, ebp                ; esp <- aligned frame pointer
    pop         esp                     ; esp <- caller-frame base
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32