;
; jfdctfst.asm - fast integer FDCT (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a fast, not so accurate integer implementation of
; the forward DCT (Discrete Cosine Transform). The following code is
; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
; for more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Fixed-point multiplier constants.
;
; Each F_x_xxx is the real-valued factor scaled by 2^CONST_BITS.  For
; CONST_BITS == 8 the values are written out literally; for any other
; setting they are derived from 30-bit fixed-point integers, because NASM
; cannot evaluate floating-point expressions at assembly time.

%define CONST_BITS              8       ; 14 is also OK.

%if CONST_BITS == 8
F_0_382         equ      98             ; FIX(0.382683433)
F_0_541         equ     139             ; FIX(0.541196100)
F_0_707         equ     181             ; FIX(0.707106781)
F_1_306         equ     334             ; FIX(1.306562965)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
; DESCALE(x,n): shift x right by n bits, rounding to nearest.
%define DESCALE(x,n)            (((x)+(1<<((n)-1)))>>(n))
F_0_382         equ     DESCALE( 410903207,30-CONST_BITS)       ; FIX(0.382683433)
F_0_541         equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
F_0_707         equ     DESCALE( 759250124,30-CONST_BITS)       ; FIX(0.707106781)
F_1_306         equ     DESCALE(1402911301,30-CONST_BITS)       ; FIX(1.306562965)
%endif

; --------------------------------------------------------------------------
        SECTION SEG_CONST

; pmulhw returns the upper 16 bits of each signed 16x16-bit product, so the
; scale carried by the data and the scale carried by the constant must add
; up to 16 bits:
;
;   PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
;   CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)

%define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT             (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)

        alignz  16
        global  EXTN(jconst_fdct_ifast_mmx)

; Constant table: every multiplier is replicated into all four 16-bit lanes
; of one mmword so it can be used directly as a pmulhw memory operand.
EXTN(jconst_fdct_ifast_mmx):

PW_F0707        times 4 dw      F_0_707 << CONST_SHIFT
PW_F0382        times 4 dw      F_0_382 << CONST_SHIFT
PW_F0541        times 4 dw      F_0_541 << CONST_SHIFT
PW_F1306        times 4 dw      F_1_306 << CONST_SHIFT

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jsimd_fdct_ifast_mmx (DCTELEM *data)
;
; In:     [esp+4] on entry = data; the block is transformed in place.
; Out:    nothing.
; Uses:   eax, ecx, edx, mm0-mm7, flags.  ebx is saved and restored through
;         the pushpic/poppic macros from the included headers.  MMX state is
;         released with emms before returning.
; Stack:  WK_NUM mmword scratch slots, kept at an mmword-aligned address.
;
; Both passes work on a strip of four 8-element lines per loop iteration
; and follow the same pattern: load eight mmwords, transpose them as 4x4
; word tiles so that each register holds one input position (data0..data7)
; for four lines at once, run the 1-D butterfly on all four lines in
; parallel, and store the results without transposing back.  Pass 1
; therefore leaves the block in a tile-transposed layout; the transpose at
; the start of pass 2 undoes that, as the lane comments in each pass show.
;

%define data(b)         (b)+8           ; DCTELEM *data
                                        ; (+8 skips the saved ebp and the
                                        ;  return address)

%define WK_NUM          2
%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]

        align   16
        global  EXTN(jsimd_fdct_ifast_mmx)

EXTN(jsimd_fdct_ifast_mmx):
        ; Build an mmword-aligned frame.  The unaligned frame base is kept in
        ; eax for argument access and is also parked at [ebp] so that the
        ; epilogue can get back to it.
        push    ebp
        mov     eax, esp                        ; eax = original ebp
        sub     esp, byte 4                     ; room for the saved frame base
        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
        mov     [esp], eax                      ; [original_ebp] = eax
        mov     ebp, esp                        ; ebp = aligned ebp
        lea     esp, [wk(0)]                    ; reserve wk[0..WK_NUM-1]
        pushpic ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
;       push    esi             ; unused
;       push    edi             ; unused

        get_GOT ebx                             ; get GOT address

        ; ---- Pass 1: process rows.
        ;
        ; edx -> current strip of four rows, ecx = strips remaining.

        mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
        mov     ecx, DCTSIZE/4
        alignx  16,7
.rowloop:

        ; Rows 2 and 3 of the strip, both halves.
        movq    mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
        movq    mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
        movq    mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
        movq    mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]

        ; mm0=(20 21 22 23), mm2=(24 25 26 27)
        ; mm1=(30 31 32 33), mm3=(34 35 36 37)

        ; Transpose, phase 1: interleave words of rows 2 and 3.
        movq      mm4, mm0
        punpcklwd mm0, mm1                      ; mm0=(20 30 21 31)
        punpckhwd mm4, mm1                      ; mm4=(22 32 23 33)
        movq      mm5, mm2
        punpcklwd mm2, mm3                      ; mm2=(24 34 25 35)
        punpckhwd mm5, mm3                      ; mm5=(26 36 27 37)

        ; Rows 0 and 1 of the strip, both halves.
        movq    mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
        movq    mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
        movq    mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
        movq    mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]

        ; mm6=(00 01 02 03), mm1=(04 05 06 07)
        ; mm7=(10 11 12 13), mm3=(14 15 16 17)

        ; All eight registers are live; spill two partial results.
        movq    MMWORD [wk(0)], mm4             ; wk(0)=(22 32 23 33)
        movq    MMWORD [wk(1)], mm2             ; wk(1)=(24 34 25 35)

        ; Transpose, phase 1: interleave words of rows 0 and 1.
        movq      mm4, mm6
        punpcklwd mm6, mm7                      ; mm6=(00 10 01 11)
        punpckhwd mm4, mm7                      ; mm4=(02 12 03 13)
        movq      mm2, mm1
        punpcklwd mm1, mm3                      ; mm1=(04 14 05 15)
        punpckhwd mm2, mm3                      ; mm2=(06 16 07 17)

        ; Transpose, phase 2: interleave dwords -> positions 0, 1, 6, 7.
        movq      mm7, mm6
        punpckldq mm6, mm0                      ; mm6=(00 10 20 30)=data0
        punpckhdq mm7, mm0                      ; mm7=(01 11 21 31)=data1
        movq      mm3, mm2
        punpckldq mm2, mm5                      ; mm2=(06 16 26 36)=data6
        punpckhdq mm3, mm5                      ; mm3=(07 17 27 37)=data7

        ; First butterfly stage for the outer pairs.
        movq    mm0, mm7
        movq    mm5, mm6
        psubw   mm7, mm2                        ; mm7=data1-data6=tmp6
        psubw   mm6, mm3                        ; mm6=data0-data7=tmp7
        paddw   mm0, mm2                        ; mm0=data1+data6=tmp1
        paddw   mm5, mm3                        ; mm5=data0+data7=tmp0

        ; Swap the spilled transpose halves for tmp6/tmp7, which are not
        ; needed again until the odd part.
        movq    mm2, MMWORD [wk(0)]             ; mm2=(22 32 23 33)
        movq    mm3, MMWORD [wk(1)]             ; mm3=(24 34 25 35)
        movq    MMWORD [wk(0)], mm7             ; wk(0)=tmp6
        movq    MMWORD [wk(1)], mm6             ; wk(1)=tmp7

        ; Transpose, phase 2: interleave dwords -> positions 2, 3, 4, 5.
        movq      mm7, mm4
        punpckldq mm4, mm2                      ; mm4=(02 12 22 32)=data2
        punpckhdq mm7, mm2                      ; mm7=(03 13 23 33)=data3
        movq      mm6, mm1
        punpckldq mm1, mm3                      ; mm1=(04 14 24 34)=data4
        punpckhdq mm6, mm3                      ; mm6=(05 15 25 35)=data5

        ; First butterfly stage for the inner pairs.
        movq    mm2, mm7
        movq    mm3, mm4
        paddw   mm7, mm1                        ; mm7=data3+data4=tmp3
        paddw   mm4, mm6                        ; mm4=data2+data5=tmp2
        psubw   mm2, mm1                        ; mm2=data3-data4=tmp4
        psubw   mm3, mm6                        ; mm3=data2-data5=tmp5

        ; -- Even part

        movq    mm1, mm5
        movq    mm6, mm0
        psubw   mm5, mm7                        ; mm5=tmp13
        psubw   mm0, mm4                        ; mm0=tmp12
        paddw   mm1, mm7                        ; mm1=tmp10
        paddw   mm6, mm4                        ; mm6=tmp11

        ; z1 = (tmp12 + tmp13) * 0.707...; pre-shift so pmulhw's 16-bit
        ; right shift leaves the product at the original scale.
        paddw   mm0, mm5
        psllw   mm0, PRE_MULTIPLY_SCALE_BITS
        pmulhw  mm0, [GOTOFF(ebx,PW_F0707)]     ; mm0=z1

        movq    mm7, mm1
        movq    mm4, mm5
        psubw   mm1, mm6                        ; mm1=data4
        psubw   mm5, mm0                        ; mm5=data6
        paddw   mm7, mm6                        ; mm7=data0
        paddw   mm4, mm0                        ; mm4=data2

        ; Each result goes back to the slot its input tile half came from.
        movq    MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1
        movq    MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5
        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
        movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4

        ; -- Odd part

        movq    mm6, MMWORD [wk(0)]             ; mm6=tmp6
        movq    mm0, MMWORD [wk(1)]             ; mm0=tmp7

        ; Note: tmp10..tmp12 are reused here as odd-part names.
        paddw   mm2, mm3                        ; mm2=tmp10
        paddw   mm3, mm6                        ; mm3=tmp11
        paddw   mm6, mm0                        ; mm6=tmp12, mm0=tmp7

        psllw   mm2, PRE_MULTIPLY_SCALE_BITS
        psllw   mm6, PRE_MULTIPLY_SCALE_BITS

        psllw   mm3, PRE_MULTIPLY_SCALE_BITS
        pmulhw  mm3, [GOTOFF(ebx,PW_F0707)]     ; mm3=z3

        movq    mm1, mm2                        ; mm1=tmp10
        psubw   mm2, mm6                        ; mm2=tmp10-tmp12
        pmulhw  mm2, [GOTOFF(ebx,PW_F0382)]     ; mm2=z5
        pmulhw  mm1, [GOTOFF(ebx,PW_F0541)]     ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
        pmulhw  mm6, [GOTOFF(ebx,PW_F1306)]     ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
        paddw   mm1, mm2                        ; mm1=z2
        paddw   mm6, mm2                        ; mm6=z4

        movq    mm5, mm0
        psubw   mm0, mm3                        ; mm0=z13
        paddw   mm5, mm3                        ; mm5=z11

        movq    mm7, mm0
        movq    mm4, mm5
        psubw   mm0, mm1                        ; mm0=data3
        psubw   mm5, mm6                        ; mm5=data7
        paddw   mm7, mm1                        ; mm7=data5
        paddw   mm4, mm6                        ; mm4=data1

        movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
        movq    MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5
        movq    MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7
        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4

        ; Advance by four rows and loop.
        add     edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
        dec     ecx
        jnz     near .rowloop

        ; ---- Pass 2: process columns.
        ;
        ; eax still holds the unaligned frame base, so data is reloaded from
        ; the argument slot.  edx -> current group of four columns,
        ; ecx = groups remaining.

        mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
        mov     ecx, DCTSIZE/4
        alignx  16,7
.columnloop:

        movq    mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
        movq    mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
        movq    mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
        movq    mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]

        ; mm0=(02 12 22 32), mm2=(42 52 62 72)
        ; mm1=(03 13 23 33), mm3=(43 53 63 73)

        ; Transpose, phase 1.
        movq      mm4, mm0
        punpcklwd mm0, mm1                      ; mm0=(02 03 12 13)
        punpckhwd mm4, mm1                      ; mm4=(22 23 32 33)
        movq      mm5, mm2
        punpcklwd mm2, mm3                      ; mm2=(42 43 52 53)
        punpckhwd mm5, mm3                      ; mm5=(62 63 72 73)

        movq    mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
        movq    mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
        movq    mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
        movq    mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]

        ; mm6=(00 10 20 30), mm1=(40 50 60 70)
        ; mm7=(01 11 21 31), mm3=(41 51 61 71)

        ; Spill two partial results to free registers.
        movq    MMWORD [wk(0)], mm4             ; wk(0)=(22 23 32 33)
        movq    MMWORD [wk(1)], mm2             ; wk(1)=(42 43 52 53)

        ; Transpose, phase 1 (continued).
        movq      mm4, mm6
        punpcklwd mm6, mm7                      ; mm6=(00 01 10 11)
        punpckhwd mm4, mm7                      ; mm4=(20 21 30 31)
        movq      mm2, mm1
        punpcklwd mm1, mm3                      ; mm1=(40 41 50 51)
        punpckhwd mm2, mm3                      ; mm2=(60 61 70 71)

        ; Transpose, phase 2 -> positions 0, 1, 6, 7.
        movq      mm7, mm6
        punpckldq mm6, mm0                      ; mm6=(00 01 02 03)=data0
        punpckhdq mm7, mm0                      ; mm7=(10 11 12 13)=data1
        movq      mm3, mm2
        punpckldq mm2, mm5                      ; mm2=(60 61 62 63)=data6
        punpckhdq mm3, mm5                      ; mm3=(70 71 72 73)=data7

        ; First butterfly stage for the outer pairs.
        movq    mm0, mm7
        movq    mm5, mm6
        psubw   mm7, mm2                        ; mm7=data1-data6=tmp6
        psubw   mm6, mm3                        ; mm6=data0-data7=tmp7
        paddw   mm0, mm2                        ; mm0=data1+data6=tmp1
        paddw   mm5, mm3                        ; mm5=data0+data7=tmp0

        ; Swap the spilled transpose halves for tmp6/tmp7.
        movq    mm2, MMWORD [wk(0)]             ; mm2=(22 23 32 33)
        movq    mm3, MMWORD [wk(1)]             ; mm3=(42 43 52 53)
        movq    MMWORD [wk(0)], mm7             ; wk(0)=tmp6
        movq    MMWORD [wk(1)], mm6             ; wk(1)=tmp7

        ; Transpose, phase 2 -> positions 2, 3, 4, 5.
        movq      mm7, mm4
        punpckldq mm4, mm2                      ; mm4=(20 21 22 23)=data2
        punpckhdq mm7, mm2                      ; mm7=(30 31 32 33)=data3
        movq      mm6, mm1
        punpckldq mm1, mm3                      ; mm1=(40 41 42 43)=data4
        punpckhdq mm6, mm3                      ; mm6=(50 51 52 53)=data5

        ; First butterfly stage for the inner pairs.
        movq    mm2, mm7
        movq    mm3, mm4
        paddw   mm7, mm1                        ; mm7=data3+data4=tmp3
        paddw   mm4, mm6                        ; mm4=data2+data5=tmp2
        psubw   mm2, mm1                        ; mm2=data3-data4=tmp4
        psubw   mm3, mm6                        ; mm3=data2-data5=tmp5

        ; -- Even part

        movq    mm1, mm5
        movq    mm6, mm0
        psubw   mm5, mm7                        ; mm5=tmp13
        psubw   mm0, mm4                        ; mm0=tmp12
        paddw   mm1, mm7                        ; mm1=tmp10
        paddw   mm6, mm4                        ; mm6=tmp11

        paddw   mm0, mm5
        psllw   mm0, PRE_MULTIPLY_SCALE_BITS
        pmulhw  mm0, [GOTOFF(ebx,PW_F0707)]     ; mm0=z1

        movq    mm7, mm1
        movq    mm4, mm5
        psubw   mm1, mm6                        ; mm1=data4
        psubw   mm5, mm0                        ; mm5=data6
        paddw   mm7, mm6                        ; mm7=data0
        paddw   mm4, mm0                        ; mm4=data2

        movq    MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1
        movq    MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5
        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
        movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4

        ; -- Odd part

        movq    mm6, MMWORD [wk(0)]             ; mm6=tmp6
        movq    mm0, MMWORD [wk(1)]             ; mm0=tmp7

        ; Note: tmp10..tmp12 are reused here as odd-part names.
        paddw   mm2, mm3                        ; mm2=tmp10
        paddw   mm3, mm6                        ; mm3=tmp11
        paddw   mm6, mm0                        ; mm6=tmp12, mm0=tmp7

        psllw   mm2, PRE_MULTIPLY_SCALE_BITS
        psllw   mm6, PRE_MULTIPLY_SCALE_BITS

        psllw   mm3, PRE_MULTIPLY_SCALE_BITS
        pmulhw  mm3, [GOTOFF(ebx,PW_F0707)]     ; mm3=z3

        movq    mm1, mm2                        ; mm1=tmp10
        psubw   mm2, mm6                        ; mm2=tmp10-tmp12
        pmulhw  mm2, [GOTOFF(ebx,PW_F0382)]     ; mm2=z5
        pmulhw  mm1, [GOTOFF(ebx,PW_F0541)]     ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
        pmulhw  mm6, [GOTOFF(ebx,PW_F1306)]     ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
        paddw   mm1, mm2                        ; mm1=z2
        paddw   mm6, mm2                        ; mm6=z4

        movq    mm5, mm0
        psubw   mm0, mm3                        ; mm0=z13
        paddw   mm5, mm3                        ; mm5=z11

        movq    mm7, mm0
        movq    mm4, mm5
        psubw   mm0, mm1                        ; mm0=data3
        psubw   mm5, mm6                        ; mm5=data7
        paddw   mm7, mm1                        ; mm7=data5
        paddw   mm4, mm6                        ; mm4=data1

        movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
        movq    MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5
        movq    MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7
        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4

        ; Advance by four columns and loop.
        add     edx, byte 4*SIZEOF_DCTELEM
        dec     ecx
        jnz     near .columnloop

        emms                                    ; empty MMX state

;       pop     edi             ; unused
;       pop     esi             ; unused
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        poppic  ebx
        mov     esp, ebp                        ; esp <- aligned ebp
        pop     esp                             ; esp <- original ebp
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16