;
; jfdctflt.asm - floating-point FDCT (64-bit SSE)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the forward DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Two-operand helpers built on SHUFPS.  With lanes numbered
; %1=(0 1 2 3) and %2=(4 5 6 7), SHUFPS takes its two low result lanes
; from %1 and its two high result lanes from %2.

; unpcklps2 dst, src
;   Selector 0x44 picks lanes 0,1 of each operand:  %1 <- (0 1 4 5)
%macro unpcklps2 2
        shufps      %1, %2, 0x44
%endmacro

; unpckhps2 dst, src
;   Selector 0xEE picks lanes 2,3 of each operand:  %1 <- (2 3 6 7)
%macro unpckhps2 2
        shufps      %1, %2, 0xEE
%endmacro

; --------------------------------------------------------------------------
        SECTION     SEG_CONST

        alignz      32
        GLOBAL_DATA(jconst_fdct_float_sse)

EXTN(jconst_fdct_float_sse):

; Each multiplier is replicated across all four single-precision lanes so
; it can be used directly as a MULPS memory operand.
PD_0_382    times 4 dd  0.382683432365089771728460  ; cos(3*pi/8)
PD_0_707    times 4 dd  0.707106781186547524400844  ; cos(pi/4)
PD_0_541    times 4 dd  0.541196100146196984399723  ; cos(pi/8) - cos(3*pi/8)
PD_1_306    times 4 dd  1.306562964876376527856643  ; cos(pi/8) + cos(3*pi/8)

        alignz      32

; --------------------------------------------------------------------------
        SECTION     SEG_TEXT
        BITS        64
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jsimd_fdct_float_sse(FAST_FLOAT *data)
;
; The block at `data` is transformed in place by two passes, each of which
; runs DCTSIZE/4 times and handles four rows (pass 1) or four columns
; (pass 2) per iteration, one per SSE lane.
;
; Register use:
;   r10       = FAST_FLOAT *data (as provided by collect_args)
;   rdx       = pointer to the group currently being processed
;   rcx       = remaining loop iterations
;   rax       = scratch in the prologue only
;   xmm0-xmm7 = working values
;   wk(0..1)  = two aligned xmmword spill slots just below rbp
;
; All block accesses use MOVAPS, so `data` has to satisfy the alignment
; requirement of aligned 128-bit loads/stores.
;
; NOTE(review): collect_args / uncollect_args, XMMBLOCK, EXTN and
; GLOBAL_FUNCTION are defined in the included .inc files, not here;
; presumably collect_args copies the platform's first argument register
; into r10 -- confirm against jsimdext.inc.
;

; r10 = FAST_FLOAT *data

; NASM expands single-line macros lazily, so wk() picks up WK_NUM at the
; point of use.
%define WK_NUM  2
; xmmword wk[WK_NUM]
%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD

        align       32
        GLOBAL_FUNCTION(jsimd_fdct_float_sse)

EXTN(jsimd_fdct_float_sse):
        ; Build an aligned frame: remember the incoming stack pointer, round
        ; rsp down to an xmmword boundary, park the remembered value at the
        ; aligned address, then carve the wk[] slots out below it.
        push        rbp
        mov         rax, rsp                        ; rax = rsp after the push (points at saved rbp)
        sub         rsp, byte 4
        and         rsp, byte (-SIZEOF_XMMWORD)     ; round down to an xmmword boundary
        mov         [rsp], rax                      ; saved for "pop rsp" in the epilogue
        mov         rbp, rsp                        ; rbp = aligned frame base
        lea         rsp, [wk(0)]                    ; rsp = lowest wk[] slot
        collect_args 1

        ; ---- Pass 1: process rows.

        mov         rdx, r10                        ; (FAST_FLOAT *)
        mov         rcx, DCTSIZE/4
.rowloop:

        movaps      xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
        movaps      xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
        movaps      xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
        movaps      xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]

        ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
        ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)

        ; Transpose, phase 1: interleave rows 2 and 3.
        movaps      xmm4, xmm0
        unpcklps    xmm0, xmm1                      ; xmm0=(20 30 21 31)
        unpckhps    xmm4, xmm1                      ; xmm4=(22 32 23 33)
        movaps      xmm5, xmm2
        unpcklps    xmm2, xmm3                      ; xmm2=(24 34 25 35)
        unpckhps    xmm5, xmm3                      ; xmm5=(26 36 27 37)

        movaps      xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
        movaps      xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
        movaps      xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
        movaps      xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]

        ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
        ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)

        ; Spill two half-transposed vectors to free xmm4/xmm2.
        movaps      XMMWORD [wk(0)], xmm4           ; wk(0)=(22 32 23 33)
        movaps      XMMWORD [wk(1)], xmm2           ; wk(1)=(24 34 25 35)

        ; Transpose, phase 1: interleave rows 0 and 1.
        movaps      xmm4, xmm6
        unpcklps    xmm6, xmm7                      ; xmm6=(00 10 01 11)
        unpckhps    xmm4, xmm7                      ; xmm4=(02 12 03 13)
        movaps      xmm2, xmm1
        unpcklps    xmm1, xmm3                      ; xmm1=(04 14 05 15)
        unpckhps    xmm2, xmm3                      ; xmm2=(06 16 07 17)

        ; Transpose, phase 2: assemble data0/1 and data6/7.
        movaps      xmm7, xmm6
        unpcklps2   xmm6, xmm0                      ; xmm6=(00 10 20 30)=data0
        unpckhps2   xmm7, xmm0                      ; xmm7=(01 11 21 31)=data1
        movaps      xmm3, xmm2
        unpcklps2   xmm2, xmm5                      ; xmm2=(06 16 26 36)=data6
        unpckhps2   xmm3, xmm5                      ; xmm3=(07 17 27 37)=data7

        movaps      xmm0, xmm7
        movaps      xmm5, xmm6
        subps       xmm7, xmm2                      ; xmm7=data1-data6=tmp6
        subps       xmm6, xmm3                      ; xmm6=data0-data7=tmp7
        addps       xmm0, xmm2                      ; xmm0=data1+data6=tmp1
        addps       xmm5, xmm3                      ; xmm5=data0+data7=tmp0

        ; Swap the spilled transpose halves back in and park tmp6/tmp7
        ; in the same slots until the odd part needs them.
        movaps      xmm2, XMMWORD [wk(0)]           ; xmm2=(22 32 23 33)
        movaps      xmm3, XMMWORD [wk(1)]           ; xmm3=(24 34 25 35)
        movaps      XMMWORD [wk(0)], xmm7           ; wk(0)=tmp6
        movaps      XMMWORD [wk(1)], xmm6           ; wk(1)=tmp7

        ; Transpose, phase 2: assemble data2/3 and data4/5.
        movaps      xmm7, xmm4
        unpcklps2   xmm4, xmm2                      ; xmm4=(02 12 22 32)=data2
        unpckhps2   xmm7, xmm2                      ; xmm7=(03 13 23 33)=data3
        movaps      xmm6, xmm1
        unpcklps2   xmm1, xmm3                      ; xmm1=(04 14 24 34)=data4
        unpckhps2   xmm6, xmm3                      ; xmm6=(05 15 25 35)=data5

        movaps      xmm2, xmm7
        movaps      xmm3, xmm4
        addps       xmm7, xmm1                      ; xmm7=data3+data4=tmp3
        addps       xmm4, xmm6                      ; xmm4=data2+data5=tmp2
        subps       xmm2, xmm1                      ; xmm2=data3-data4=tmp4
        subps       xmm3, xmm6                      ; xmm3=data2-data5=tmp5

        ; -- Even part

        movaps      xmm1, xmm5
        movaps      xmm6, xmm0
        subps       xmm5, xmm7                      ; xmm5=tmp13
        subps       xmm0, xmm4                      ; xmm0=tmp12
        addps       xmm1, xmm7                      ; xmm1=tmp10
        addps       xmm6, xmm4                      ; xmm6=tmp11

        addps       xmm0, xmm5
        mulps       xmm0, [rel PD_0_707]            ; xmm0=z1

        movaps      xmm7, xmm1
        movaps      xmm4, xmm5
        subps       xmm1, xmm6                      ; xmm1=data4
        subps       xmm5, xmm0                      ; xmm5=data6
        addps       xmm7, xmm6                      ; xmm7=data0
        addps       xmm4, xmm0                      ; xmm4=data2

        movaps      XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
        movaps      XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
        movaps      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
        movaps      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4

        ; -- Odd part

        movaps      xmm6, XMMWORD [wk(0)]           ; xmm6=tmp6
        movaps      xmm0, XMMWORD [wk(1)]           ; xmm0=tmp7

        addps       xmm2, xmm3                      ; xmm2=tmp10
        addps       xmm3, xmm6                      ; xmm3=tmp11
        addps       xmm6, xmm0                      ; xmm6=tmp12, xmm0=tmp7

        mulps       xmm3, [rel PD_0_707]            ; xmm3=z3

        movaps      xmm1, xmm2                      ; xmm1=tmp10
        subps       xmm2, xmm6
        mulps       xmm2, [rel PD_0_382]            ; xmm2=z5
        mulps       xmm1, [rel PD_0_541]            ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
        mulps       xmm6, [rel PD_1_306]            ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
        addps       xmm1, xmm2                      ; xmm1=z2
        addps       xmm6, xmm2                      ; xmm6=z4

        movaps      xmm5, xmm0
        subps       xmm0, xmm3                      ; xmm0=z13
        addps       xmm5, xmm3                      ; xmm5=z11

        movaps      xmm7, xmm0
        movaps      xmm4, xmm5
        subps       xmm0, xmm1                      ; xmm0=data3
        subps       xmm5, xmm6                      ; xmm5=data7
        addps       xmm7, xmm1                      ; xmm7=data5
        addps       xmm4, xmm6                      ; xmm4=data1

        movaps      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
        movaps      XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
        movaps      XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
        movaps      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4

        ; Step to the next group of four rows.
        add         rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
        dec         rcx
        jnz         near .rowloop

        ; ---- Pass 2: process columns.

        mov         rdx, r10                        ; (FAST_FLOAT *)
        mov         rcx, DCTSIZE/4
.columnloop:

        movaps      xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
        movaps      xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
        movaps      xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
        movaps      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]

        ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
        ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)

        ; Transpose, phase 1.
        movaps      xmm4, xmm0
        unpcklps    xmm0, xmm1                      ; xmm0=(02 03 12 13)
        unpckhps    xmm4, xmm1                      ; xmm4=(22 23 32 33)
        movaps      xmm5, xmm2
        unpcklps    xmm2, xmm3                      ; xmm2=(42 43 52 53)
        unpckhps    xmm5, xmm3                      ; xmm5=(62 63 72 73)

        movaps      xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
        movaps      xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
        movaps      xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
        movaps      xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]

        ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
        ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)

        ; Spill two half-transposed vectors to free xmm4/xmm2.
        movaps      XMMWORD [wk(0)], xmm4           ; wk(0)=(22 23 32 33)
        movaps      XMMWORD [wk(1)], xmm2           ; wk(1)=(42 43 52 53)

        ; Transpose, phase 1 (continued).
        movaps      xmm4, xmm6
        unpcklps    xmm6, xmm7                      ; xmm6=(00 01 10 11)
        unpckhps    xmm4, xmm7                      ; xmm4=(20 21 30 31)
        movaps      xmm2, xmm1
        unpcklps    xmm1, xmm3                      ; xmm1=(40 41 50 51)
        unpckhps    xmm2, xmm3                      ; xmm2=(60 61 70 71)

        ; Transpose, phase 2: assemble data0/1 and data6/7.
        movaps      xmm7, xmm6
        unpcklps2   xmm6, xmm0                      ; xmm6=(00 01 02 03)=data0
        unpckhps2   xmm7, xmm0                      ; xmm7=(10 11 12 13)=data1
        movaps      xmm3, xmm2
        unpcklps2   xmm2, xmm5                      ; xmm2=(60 61 62 63)=data6
        unpckhps2   xmm3, xmm5                      ; xmm3=(70 71 72 73)=data7

        movaps      xmm0, xmm7
        movaps      xmm5, xmm6
        subps       xmm7, xmm2                      ; xmm7=data1-data6=tmp6
        subps       xmm6, xmm3                      ; xmm6=data0-data7=tmp7
        addps       xmm0, xmm2                      ; xmm0=data1+data6=tmp1
        addps       xmm5, xmm3                      ; xmm5=data0+data7=tmp0

        ; Swap the spilled transpose halves back in and park tmp6/tmp7
        ; in the same slots until the odd part needs them.
        movaps      xmm2, XMMWORD [wk(0)]           ; xmm2=(22 23 32 33)
        movaps      xmm3, XMMWORD [wk(1)]           ; xmm3=(42 43 52 53)
        movaps      XMMWORD [wk(0)], xmm7           ; wk(0)=tmp6
        movaps      XMMWORD [wk(1)], xmm6           ; wk(1)=tmp7

        ; Transpose, phase 2: assemble data2/3 and data4/5.
        movaps      xmm7, xmm4
        unpcklps2   xmm4, xmm2                      ; xmm4=(20 21 22 23)=data2
        unpckhps2   xmm7, xmm2                      ; xmm7=(30 31 32 33)=data3
        movaps      xmm6, xmm1
        unpcklps2   xmm1, xmm3                      ; xmm1=(40 41 42 43)=data4
        unpckhps2   xmm6, xmm3                      ; xmm6=(50 51 52 53)=data5

        movaps      xmm2, xmm7
        movaps      xmm3, xmm4
        addps       xmm7, xmm1                      ; xmm7=data3+data4=tmp3
        addps       xmm4, xmm6                      ; xmm4=data2+data5=tmp2
        subps       xmm2, xmm1                      ; xmm2=data3-data4=tmp4
        subps       xmm3, xmm6                      ; xmm3=data2-data5=tmp5

        ; -- Even part

        movaps      xmm1, xmm5
        movaps      xmm6, xmm0
        subps       xmm5, xmm7                      ; xmm5=tmp13
        subps       xmm0, xmm4                      ; xmm0=tmp12
        addps       xmm1, xmm7                      ; xmm1=tmp10
        addps       xmm6, xmm4                      ; xmm6=tmp11

        addps       xmm0, xmm5
        mulps       xmm0, [rel PD_0_707]            ; xmm0=z1

        movaps      xmm7, xmm1
        movaps      xmm4, xmm5
        subps       xmm1, xmm6                      ; xmm1=data4
        subps       xmm5, xmm0                      ; xmm5=data6
        addps       xmm7, xmm6                      ; xmm7=data0
        addps       xmm4, xmm0                      ; xmm4=data2

        movaps      XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
        movaps      XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
        movaps      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
        movaps      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4

        ; -- Odd part

        movaps      xmm6, XMMWORD [wk(0)]           ; xmm6=tmp6
        movaps      xmm0, XMMWORD [wk(1)]           ; xmm0=tmp7

        addps       xmm2, xmm3                      ; xmm2=tmp10
        addps       xmm3, xmm6                      ; xmm3=tmp11
        addps       xmm6, xmm0                      ; xmm6=tmp12, xmm0=tmp7

        mulps       xmm3, [rel PD_0_707]            ; xmm3=z3

        movaps      xmm1, xmm2                      ; xmm1=tmp10
        subps       xmm2, xmm6
        mulps       xmm2, [rel PD_0_382]            ; xmm2=z5
        mulps       xmm1, [rel PD_0_541]            ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
        mulps       xmm6, [rel PD_1_306]            ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
        addps       xmm1, xmm2                      ; xmm1=z2
        addps       xmm6, xmm2                      ; xmm6=z4

        movaps      xmm5, xmm0
        subps       xmm0, xmm3                      ; xmm0=z13
        addps       xmm5, xmm3                      ; xmm5=z11

        movaps      xmm7, xmm0
        movaps      xmm4, xmm5
        subps       xmm0, xmm1                      ; xmm0=data3
        subps       xmm5, xmm6                      ; xmm5=data7
        addps       xmm7, xmm1                      ; xmm7=data5
        addps       xmm4, xmm6                      ; xmm4=data1

        movaps      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
        movaps      XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
        movaps      XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
        movaps      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4

        ; Step to the next group of four columns.
        add         rdx, byte 4*SIZEOF_FAST_FLOAT
        dec         rcx
        jnz         near .columnloop

        ; Tear down the aligned frame in the reverse order it was built.
        uncollect_args 1
        mov         rsp, rbp                        ; rsp <- aligned frame base
        pop         rsp                             ; rsp <- value stored at [rbp] by the prologue
        pop         rbp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align       32