;
; jfdctflt.asm - floating-point FDCT (3DNow!)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the forward DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Multiplier table used by the FDCT routine in this file.
;
; Every entry holds the same single-precision value twice ("times 2 dd"),
; so that one 64-bit memory operand of pfmul scales both floats packed in
; an MMX register by the same factor.
;
        SECTION SEG_CONST

        alignz  16
        global  EXTN(jconst_fdct_float_3dnow)

EXTN(jconst_fdct_float_3dnow):

PD_0_382        times 2 dd      0.382683432365089771728460
PD_0_707        times 2 dd      0.707106781186547524400844
PD_0_541        times 2 dd      0.541196100146196984399723
PD_1_306        times 2 dd      1.306562964876376527856643

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jsimd_fdct_float_3dnow (FAST_FLOAT *data)
;
; In:     data  - first stack argument, fetched through data(eax)
; Out:    no register result; the block addressed by data is rewritten
;         in place (every load and store below goes through edx)
; Uses:   eax, ecx, edx, mm0-mm7, flags; ebx carries the GOT address and
;         is bracketed by pushpic/poppic (jsimdext.inc macros - presumably
;         a save/restore of ebx; confirm against the include file)
; Stack:  WK_NUM mmword scratch slots located just below an ebp that has
;         been rounded down to a multiple of SIZEOF_MMWORD
;
; Each MMX register carries two packed floats, so every loop iteration
; transforms two rows (pass 1) or two columns (pass 2) at once and both
; loops run DCTSIZE/2 times.
;

%define WK_NUM          2
%define data(b)         (b)+8           ; FAST_FLOAT *data
%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]

        align   16
        global  EXTN(jsimd_fdct_float_3dnow)

EXTN(jsimd_fdct_float_3dnow):

        ; ---- Prologue: build an aligned frame with room for wk[].

        push    ebp
        mov     eax, esp                        ; eax -> saved ebp; data(eax) is the argument
        sub     esp, byte 4                     ; room for the back-pointer stored below
        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
        mov     [esp], eax                      ; remember the unaligned frame address
        mov     ebp, esp                        ; ebp = aligned frame base
        lea     esp, [wk(0)]                    ; reserve wk[0..WK_NUM-1] below ebp
        pushpic ebx
        ; ecx and edx need not be preserved; esi and edi are unused,
        ; so none of them is pushed here.

        get_GOT ebx                             ; ebx = GOT address (for GOTOFF)

        ; ---- Pass 1: process rows.

        mov     edx, POINTER [data(eax)]        ; edx = (FAST_FLOAT *) data
        mov     ecx, DCTSIZE/2                  ; two rows per iteration
        alignx  16,7
.rowloop:

        movq    mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
        movq    mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]

        ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)

        movq      mm4, mm0                      ; 2x2 transpose of the pair
        punpckldq mm0, mm1                      ; mm0=(00 10)=data0
        punpckhdq mm4, mm1                      ; mm4=(01 11)=data1
        movq      mm5, mm2                      ; 2x2 transpose of the pair
        punpckldq mm2, mm3                      ; mm2=(06 16)=data6
        punpckhdq mm5, mm3                      ; mm5=(07 17)=data7

        movq    mm6, mm4
        movq    mm7, mm0
        pfsub   mm4, mm2                        ; mm4=data1-data6=tmp6
        pfsub   mm0, mm5                        ; mm0=data0-data7=tmp7
        pfadd   mm6, mm2                        ; mm6=data1+data6=tmp1
        pfadd   mm7, mm5                        ; mm7=data0+data7=tmp0

        movq    mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
        movq    mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
        movq    mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
        movq    mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]

        ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)

        ; All eight registers are about to be live: park tmp6/tmp7
        ; in the scratch slots until the odd part needs them.
        movq    MMWORD [wk(0)], mm4             ; wk(0)=tmp6
        movq    MMWORD [wk(1)], mm0             ; wk(1)=tmp7

        movq      mm4, mm1                      ; 2x2 transpose of the pair
        punpckldq mm1, mm3                      ; mm1=(02 12)=data2
        punpckhdq mm4, mm3                      ; mm4=(03 13)=data3
        movq      mm0, mm2                      ; 2x2 transpose of the pair
        punpckldq mm2, mm5                      ; mm2=(04 14)=data4
        punpckhdq mm0, mm5                      ; mm0=(05 15)=data5

        movq    mm3, mm4
        movq    mm5, mm1
        pfadd   mm4, mm2                        ; mm4=data3+data4=tmp3
        pfadd   mm1, mm0                        ; mm1=data2+data5=tmp2
        pfsub   mm3, mm2                        ; mm3=data3-data4=tmp4
        pfsub   mm5, mm0                        ; mm5=data2-data5=tmp5

        ; -- Even part

        movq    mm2, mm7
        movq    mm0, mm6
        pfsub   mm7, mm4                        ; mm7=tmp13
        pfsub   mm6, mm1                        ; mm6=tmp12
        pfadd   mm2, mm4                        ; mm2=tmp10
        pfadd   mm0, mm1                        ; mm0=tmp11

        pfadd   mm6, mm7                        ; mm6=tmp12+tmp13
        pfmul   mm6, [GOTOFF(ebx,PD_0_707)]     ; mm6=z1

        movq    mm4, mm2
        movq    mm1, mm7
        pfsub   mm2, mm0                        ; mm2=data4
        pfsub   mm7, mm6                        ; mm7=data6
        pfadd   mm4, mm0                        ; mm4=data0
        pfadd   mm1, mm6                        ; mm1=data2

        movq    MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
        movq    MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
        movq    MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1

        ; -- Odd part

        movq    mm0, MMWORD [wk(0)]             ; mm0=tmp6
        movq    mm6, MMWORD [wk(1)]             ; mm6=tmp7

        pfadd   mm3, mm5                        ; mm3=tmp10
        pfadd   mm5, mm0                        ; mm5=tmp11
        pfadd   mm0, mm6                        ; mm0=tmp12, mm6=tmp7

        pfmul   mm5, [GOTOFF(ebx,PD_0_707)]     ; mm5=z3

        movq    mm2, mm3                        ; mm2=tmp10
        pfsub   mm3, mm0                        ; mm3=tmp10-tmp12
        pfmul   mm3, [GOTOFF(ebx,PD_0_382)]     ; mm3=z5
        pfmul   mm2, [GOTOFF(ebx,PD_0_541)]     ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
        pfmul   mm0, [GOTOFF(ebx,PD_1_306)]     ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
        pfadd   mm2, mm3                        ; mm2=z2
        pfadd   mm0, mm3                        ; mm0=z4

        movq    mm7, mm6
        pfsub   mm6, mm5                        ; mm6=z13
        pfadd   mm7, mm5                        ; mm7=z11

        movq    mm4, mm6
        movq    mm1, mm7
        pfsub   mm6, mm2                        ; mm6=data3
        pfsub   mm7, mm0                        ; mm7=data7
        pfadd   mm4, mm2                        ; mm4=data5
        pfadd   mm1, mm0                        ; mm1=data1

        movq    MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
        movq    MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
        movq    MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1

        add     edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT   ; step down two rows
        dec     ecx
        jnz     near .rowloop

        ; ---- Pass 2: process columns.

        mov     edx, POINTER [data(eax)]        ; edx = (FAST_FLOAT *) data
        mov     ecx, DCTSIZE/2                  ; two columns per iteration
        alignx  16,7
.columnloop:

        movq    mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]

        ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)

        movq      mm4, mm0                      ; 2x2 transpose of the pair
        punpckldq mm0, mm1                      ; mm0=(00 01)=data0
        punpckhdq mm4, mm1                      ; mm4=(10 11)=data1
        movq      mm5, mm2                      ; 2x2 transpose of the pair
        punpckldq mm2, mm3                      ; mm2=(60 61)=data6
        punpckhdq mm5, mm3                      ; mm5=(70 71)=data7

        movq    mm6, mm4
        movq    mm7, mm0
        pfsub   mm4, mm2                        ; mm4=data1-data6=tmp6
        pfsub   mm0, mm5                        ; mm0=data0-data7=tmp7
        pfadd   mm6, mm2                        ; mm6=data1+data6=tmp1
        pfadd   mm7, mm5                        ; mm7=data0+data7=tmp0

        movq    mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]

        ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)

        ; Same register pressure as in pass 1: spill tmp6/tmp7.
        movq    MMWORD [wk(0)], mm4             ; wk(0)=tmp6
        movq    MMWORD [wk(1)], mm0             ; wk(1)=tmp7

        movq      mm4, mm1                      ; 2x2 transpose of the pair
        punpckldq mm1, mm3                      ; mm1=(20 21)=data2
        punpckhdq mm4, mm3                      ; mm4=(30 31)=data3
        movq      mm0, mm2                      ; 2x2 transpose of the pair
        punpckldq mm2, mm5                      ; mm2=(40 41)=data4
        punpckhdq mm0, mm5                      ; mm0=(50 51)=data5

        movq    mm3, mm4
        movq    mm5, mm1
        pfadd   mm4, mm2                        ; mm4=data3+data4=tmp3
        pfadd   mm1, mm0                        ; mm1=data2+data5=tmp2
        pfsub   mm3, mm2                        ; mm3=data3-data4=tmp4
        pfsub   mm5, mm0                        ; mm5=data2-data5=tmp5

        ; -- Even part

        movq    mm2, mm7
        movq    mm0, mm6
        pfsub   mm7, mm4                        ; mm7=tmp13
        pfsub   mm6, mm1                        ; mm6=tmp12
        pfadd   mm2, mm4                        ; mm2=tmp10
        pfadd   mm0, mm1                        ; mm0=tmp11

        pfadd   mm6, mm7                        ; mm6=tmp12+tmp13
        pfmul   mm6, [GOTOFF(ebx,PD_0_707)]     ; mm6=z1

        movq    mm4, mm2
        movq    mm1, mm7
        pfsub   mm2, mm0                        ; mm2=data4
        pfsub   mm7, mm6                        ; mm7=data6
        pfadd   mm4, mm0                        ; mm4=data0
        pfadd   mm1, mm6                        ; mm1=data2

        movq    MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
        movq    MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
        movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1

        ; -- Odd part

        movq    mm0, MMWORD [wk(0)]             ; mm0=tmp6
        movq    mm6, MMWORD [wk(1)]             ; mm6=tmp7

        pfadd   mm3, mm5                        ; mm3=tmp10
        pfadd   mm5, mm0                        ; mm5=tmp11
        pfadd   mm0, mm6                        ; mm0=tmp12, mm6=tmp7

        pfmul   mm5, [GOTOFF(ebx,PD_0_707)]     ; mm5=z3

        movq    mm2, mm3                        ; mm2=tmp10
        pfsub   mm3, mm0                        ; mm3=tmp10-tmp12
        pfmul   mm3, [GOTOFF(ebx,PD_0_382)]     ; mm3=z5
        pfmul   mm2, [GOTOFF(ebx,PD_0_541)]     ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
        pfmul   mm0, [GOTOFF(ebx,PD_1_306)]     ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
        pfadd   mm2, mm3                        ; mm2=z2
        pfadd   mm0, mm3                        ; mm0=z4

        movq    mm7, mm6
        pfsub   mm6, mm5                        ; mm6=z13
        pfadd   mm7, mm5                        ; mm7=z11

        movq    mm4, mm6
        movq    mm1, mm7
        pfsub   mm6, mm2                        ; mm6=data3
        pfsub   mm7, mm0                        ; mm7=data7
        pfadd   mm4, mm2                        ; mm4=data5
        pfadd   mm1, mm0                        ; mm1=data1

        movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
        movq    MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
        movq    MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1

        add     edx, byte 2*SIZEOF_FAST_FLOAT   ; step right two columns
        dec     ecx
        jnz     near .columnloop

        ; ---- Epilogue: undo the prologue in reverse order.

        femms                                   ; empty MMX/3DNow! state

        ; edi/esi were never pushed (unused); edx/ecx need not be restored.
        poppic  ebx
        mov     esp, ebp                        ; esp <- aligned frame base
        pop     esp                             ; esp <- address of the saved ebp
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16