;
; jsimdext.inc - common declarations
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2010, 2016, D. R. Commander.
; Copyright (C) 2018, Matthieu Darbois.
;
; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
;
; Copyright (C) 1999-2006, MIYASAKA Masaru.
;
; This software is provided 'as-is', without any express or implied
; warranty.  In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
;    claim that you wrote the original software. If you use this software
;    in a product, an acknowledgment in the product documentation would be
;    appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
;    misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
;
; [TAB8]

; ==========================================================================
;  System-dependent configurations
;
; Exactly one of WIN32 / WIN64 / OBJ32 / ELF / AOUT / MACHO is expected to be
; defined on the assembler command line (-D...).  Each branch defines:
;   SEG_TEXT    section name + attributes for code
;   SEG_CONST   section name + attributes for read-only constants
; and, where applicable:
;   EXTN(name)  external symbol decoration (defaults to a leading underscore
;               further below when a branch does not define it)
;   GOT_SYMBOL  symbol used by the PIC helper macros further below
;   PIC         forces position-independent code (Mach-O only)

%ifdef WIN32    ; ----(nasm -fwin32 -DWIN32 ...)--------
; * Microsoft Visual C++
; * MinGW (Minimalist GNU for Windows)
; * CygWin
; * LCC-Win32

; -- segment definition --
;
%ifdef __YASM_VER__
%define SEG_TEXT   .text  align=32
%define SEG_CONST  .rdata align=32
%else
%define SEG_TEXT   .text  align=32 public use32 class=CODE
%define SEG_CONST  .rdata align=32 public use32 class=CONST
%endif

%elifdef WIN64  ; ----(nasm -fwin64 -DWIN64 ...)--------
; * Microsoft Visual C++

; -- segment definition --
;
%ifdef __YASM_VER__
%define SEG_TEXT    .text  align=32
%define SEG_CONST   .rdata align=32
%else
%define SEG_TEXT    .text  align=32 public use64 class=CODE
%define SEG_CONST   .rdata align=32 public use64 class=CONST
%endif
%define EXTN(name)  name                ; foo() -> foo

%elifdef OBJ32  ; ----(nasm -fobj -DOBJ32 ...)----------
; * Borland C++ (Win32)

; -- segment definition --
;
%define SEG_TEXT   _text align=32 public use32 class=CODE
%define SEG_CONST  _data align=32 public use32 class=DATA

%elifdef ELF    ; ----(nasm -felf[64] -DELF ...)------------
; * Linux
; * *BSD family Unix using elf format
; * Unix System V, including Solaris x86, UnixWare and SCO Unix

; mark stack as non-executable
section .note.GNU-stack noalloc noexec nowrite progbits

; -- segment definition --
;
%ifdef __x86_64__
%define SEG_TEXT   .text   progbits align=32
%define SEG_CONST  .rodata progbits align=32
%else
%define SEG_TEXT   .text   progbits alloc exec   nowrite align=32
%define SEG_CONST  .rodata progbits alloc noexec nowrite align=32
%endif

; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_  ; ELF supports PIC
%define EXTN(name)  name                   ; foo() -> foo

%elifdef AOUT   ; ----(nasm -faoutb/aout -DAOUT ...)----
; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)

; -- segment definition --
;
%define SEG_TEXT   .text
%define SEG_CONST  .data

; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_  ; BSD-style a.out supports PIC

%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)

; -- segment definition --
;
%define SEG_TEXT   .text  ;align=32     ; nasm doesn't accept align=32. why?
%define SEG_CONST  .rodata align=32

; The generation of position-independent code (PIC) is the default on Darwin.
;
%define PIC
%define GOT_SYMBOL  _MACHO_PIC_         ; Mach-O style code-relative addressing

%else           ; ----(Other case)----------------------

; -- segment definition --
;
%define SEG_TEXT   .text
%define SEG_CONST  .data

%endif          ; ----------------------------------------------

; ==========================================================================

; --------------------------------------------------------------------------
;  Common types
;
; Each type is described by three macros: the NASM operand-size keyword,
; SIZEOF_<type> (in bytes) and <type>_BIT (in bits).  The SIZEOF_*/..._BIT
; values they refer to are defined at the end of this section; single-line
; macros are expanded at the point of use, so the ordering is harmless.
;
%ifdef __x86_64__
%define POINTER         qword           ; general pointer type
%define SIZEOF_POINTER  SIZEOF_QWORD    ; sizeof(POINTER)
%define POINTER_BIT     QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%else
%define POINTER         dword           ; general pointer type
%define SIZEOF_POINTER  SIZEOF_DWORD    ; sizeof(POINTER)
%define POINTER_BIT     DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%endif

%define INT             dword           ; signed integer type
%define SIZEOF_INT      SIZEOF_DWORD    ; sizeof(INT)
%define INT_BIT         DWORD_BIT       ; sizeof(INT)*BYTE_BIT

%define FP32            dword           ; IEEE754 single
%define SIZEOF_FP32     SIZEOF_DWORD    ; sizeof(FP32)
%define FP32_BIT        DWORD_BIT       ; sizeof(FP32)*BYTE_BIT

%define MMWORD          qword           ; int64  (MMX register)
%define SIZEOF_MMWORD   SIZEOF_QWORD    ; sizeof(MMWORD)
%define MMWORD_BIT      QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT

; NASM is buggy and doesn't properly handle operand sizes for SSE
; instructions, so for now we have to define XMMWORD as blank.
%define XMMWORD                         ; int128 (SSE register)
%define SIZEOF_XMMWORD  SIZEOF_OWORD    ; sizeof(XMMWORD)
%define XMMWORD_BIT     OWORD_BIT       ; sizeof(XMMWORD)*BYTE_BIT

; YMMWORD is left blank in the same way as XMMWORD.
%define YMMWORD                         ; int256 (AVX register)
%define SIZEOF_YMMWORD  SIZEOF_YWORD    ; sizeof(YMMWORD)
%define YMMWORD_BIT     YWORD_BIT       ; sizeof(YMMWORD)*BYTE_BIT

; Similar hacks for when we load a dword or MMWORD into an xmm# register
%define XMM_DWORD
%define XMM_MMWORD

%define SIZEOF_BYTE   1                 ; sizeof(BYTE)
%define SIZEOF_WORD   2                 ; sizeof(WORD)
%define SIZEOF_DWORD  4                 ; sizeof(DWORD)
%define SIZEOF_QWORD  8                 ; sizeof(QWORD)
%define SIZEOF_OWORD  16                ; sizeof(OWORD)
%define SIZEOF_YWORD  32                ; sizeof(YWORD)

%define BYTE_BIT      8                 ; CHAR_BIT in C
%define WORD_BIT      16                ; sizeof(WORD)*BYTE_BIT
%define DWORD_BIT     32                ; sizeof(DWORD)*BYTE_BIT
%define QWORD_BIT     64                ; sizeof(QWORD)*BYTE_BIT
%define OWORD_BIT     128               ; sizeof(OWORD)*BYTE_BIT
%define YWORD_BIT     256               ; sizeof(YWORD)*BYTE_BIT

; --------------------------------------------------------------------------
;  External Symbol Name
;
; Platforms that did not define EXTN above (WIN32, OBJ32, AOUT, MACHO and the
; "other" case) get a leading underscore prepended to every external name.
;
%ifndef EXTN
%define EXTN(name)  _ %+ name           ; foo() -> _foo
%endif

; --------------------------------------------------------------------------
;  Hidden symbols
;
; GLOBAL_FUNCTION(name) / GLOBAL_DATA(name) export EXTN(name).  On ELF, and
; on Mach-O when assembling with YASM, the symbol is additionally marked
; hidden / private_extern; everywhere else a plain "global" is emitted.
;
%ifdef ELF      ; ----(nasm -felf[64] -DELF ...)--------
%define GLOBAL_FUNCTION(name)  global EXTN(name):function hidden
%define GLOBAL_DATA(name)      global EXTN(name):data hidden
%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
%ifdef __YASM_VER__
%define GLOBAL_FUNCTION(name)  global EXTN(name):private_extern
%define GLOBAL_DATA(name)      global EXTN(name):private_extern
%endif
%endif

%ifndef GLOBAL_FUNCTION
%define GLOBAL_FUNCTION(name)  global EXTN(name)
%endif
%ifndef GLOBAL_DATA
%define GLOBAL_DATA(name)      global EXTN(name)
%endif

; --------------------------------------------------------------------------
;  Macros for position-independent code (PIC) support
;
;   GOTOFF(got, sym)  address expression for sym, given the base register got
;   get_GOT reg       load the PIC base into reg
;   pushpic / poppic / movpic
;                     push / pop / mov that are only emitted in PIC builds
;
; PIC is silently disabled on platforms that did not define GOT_SYMBOL.
;
%ifndef GOT_SYMBOL
%undef PIC
%endif

%ifdef PIC  ; -------------------------------------------

%ifidn GOT_SYMBOL, _MACHO_PIC_  ; --------------------

; At present, nasm doesn't seem to support PIC generation for Mach-O.
; The PIC support code below is a little tricky.
;
; Addresses are computed relative to const_base, a label placed at the
; current position of the constant section when this file is included.

    SECTION     SEG_CONST
const_base:

%define GOTOFF(got, sym)  (got) + (sym) - const_base

; get_GOT reg
;   Loads the address of const_base into reg.  The displacement is obtained by
;   hand-encoding the first two bytes of a LEA and letting the assembler emit
;   "jmp near const_base" so that its opcode byte and rel32 operand become the
;   SIB byte and disp32 of that LEA (see the comments next to the db lines).
;   ebp is saved, zeroed for use as the (scaled) index, and restored.
%imacro get_GOT 1
    ; NOTE: this macro destroys ecx register.
    call        %%geteip
    add         ecx, byte (%%ref - $)
    jmp         short %%adjust
%%geteip:
    mov         ecx, POINTER [esp]
    ret
%%adjust:
    push        ebp
    xor         ebp, ebp                ; ebp = 0
%ifidni %1, ebx  ; (%1 == ebx)
    ; db 0x8D,0x9C + jmp near const_base =
    ;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
    db          0x8D, 0x9C              ; 8D,9C
    jmp         near const_base         ; E9,(const_base-%%ref)
%%ref:
%else  ; (%1 != ebx)
    ; db 0x8D,0x8C + jmp near const_base =
    ;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
    db          0x8D, 0x8C              ; 8D,8C
    jmp         near const_base         ; E9,(const_base-%%ref)
%%ref:
    mov         %1, ecx
%endif  ; (%1 == ebx)
    pop         ebp
%endmacro

%else     ; GOT_SYMBOL != _MACHO_PIC_ ----------------

%define GOTOFF(got, sym)  (got) + (sym) wrt ..gotoff

; get_GOT reg
;   Loads the address of GOT_SYMBOL into reg: the call/ret pair fetches the
;   return address (the address of the "add") into reg, and the add applies
;   the ..gotpc-relative offset.
%imacro get_GOT 1
    extern      GOT_SYMBOL
    call        %%geteip
    add         %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
    jmp         short %%done
%%geteip:
    mov         %1, POINTER [esp]
    ret
%%done:
%endmacro

%endif    ; GOT_SYMBOL == _MACHO_PIC_ ----------------

; In PIC builds these expand to the plain instruction ...
%imacro pushpic 1.nolist
    push        %1
%endmacro
%imacro poppic 1.nolist
    pop         %1
%endmacro
%imacro movpic 2.nolist
    mov         %1, %2
%endmacro

%else    ; !PIC -----------------------------------------

; ... and in non-PIC builds every PIC helper expands to nothing and GOTOFF
; is simply the symbol itself.
%define GOTOFF(got, sym)  (sym)

%imacro get_GOT 1.nolist
%endmacro
%imacro pushpic 1.nolist
%endmacro
%imacro poppic 1.nolist
%endmacro
%imacro movpic 2.nolist
%endmacro

%endif   ;  PIC -----------------------------------------

; --------------------------------------------------------------------------
;  Align the next instruction on {2,4,8,16,..}-byte boundary.
;  ".balign n,,m" in GNU as
;
;  alignx n [, m]
;    n  alignment boundary (a power of two, since FILLB masks with n-1)
;    m  maximum number of padding bytes (default 0xFFFF); every "times" count
;       below is masked to zero when the padding needed at the macro start
;       exceeds m
;
;  MSKLE(x, y)  evaluates to a non-zero bit mask when x <= y and to 0
;               otherwise (x and y are treated as 16-bit values)
;  FILLB(b, n)  number of bytes from position b up to the next n-byte
;               boundary, measured from the section start ($$)
;
;  When 16 or more bytes are needed the gap is filled with single-byte nops;
;  otherwise it is filled with the multi-byte no-op encodings listed below,
;  tried from longest to shortest.
;
%define MSKLE(x, y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
%define FILLB(b, n)  (($$-(b)) & ((n)-1))

%imacro alignx 1-2.nolist 0xFFFF
%%bs: \
  times MSKLE(FILLB(%%bs, %1), %2) & MSKLE(16, FILLB($, %1)) & FILLB($, %1) \
        db 0x90                                      ; nop
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 9 \
        db 0x8D, 0x9C, 0x23, 0x00, 0x00, 0x00, 0x00  ; lea ebx,[ebx+0x00000000]
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 7 \
        db 0x8D, 0xAC, 0x25, 0x00, 0x00, 0x00, 0x00  ; lea ebp,[ebp+0x00000000]
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 6 \
        db 0x8D, 0xAD, 0x00, 0x00, 0x00, 0x00        ; lea ebp,[ebp+0x00000000]
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 4 \
        db 0x8D, 0x6C, 0x25, 0x00                    ; lea ebp,[ebp+0x00]
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 3 \
        db 0x8D, 0x6D, 0x00                          ; lea ebp,[ebp+0x00]
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 2 \
        db 0x8B, 0xED                                ; mov ebp,ebp
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 1 \
        db 0x90                                      ; nop
%endmacro

; Align the next data on {2,4,8,16,..}-byte boundary.
;
%imacro alignz 1.nolist
    align       %1, db 0                ; filling zeros
%endmacro

; --------------------------------------------------------------------------
;  Argument collection helpers (x86-64 only)
;
;  collect_args N / uncollect_args N
;    Copy the first N (1..6) integer arguments of the current function into
;    r10, r11, r12, r13, r14, r15 (in that order), so that function bodies can
;    use the same registers under both calling conventions.  uncollect_args
;    must be given the same N and undoes the stack changes in reverse order.
;
;  push_xmm N / pop_xmm N
;    Save / restore xmm8..xmm(7+N), N = 1..4, on WIN64; no-ops elsewhere.
;
%ifdef __x86_64__

%ifdef WIN64

; WIN64: arguments 1-4 arrive in rcx, rdx, r8, r9; arguments 5 and 6 are read
; from [rax+48] and [rax+56].
; NOTE(review): this presumes the caller has loaded a stack-frame address into
; rax before invoking collect_args - confirm against the call sites.
; xmm6/xmm7 and rsi/rdi are always saved; r12-r15 are pushed only when they
; are about to be overwritten.  movaps needs rsp to be 16-byte aligned at the
; point of expansion.
%imacro collect_args 1
    sub         rsp, SIZEOF_XMMWORD
    movaps      XMMWORD [rsp], xmm6
    sub         rsp, SIZEOF_XMMWORD
    movaps      XMMWORD [rsp], xmm7
    mov         r10, rcx
%if %1 > 1
    mov         r11, rdx
%endif
%if %1 > 2
    push        r12
    mov         r12, r8
%endif
%if %1 > 3
    push        r13
    mov         r13, r9
%endif
%if %1 > 4
    push        r14
    mov         r14, [rax+48]
%endif
%if %1 > 5
    push        r15
    mov         r15, [rax+56]
%endif
    push        rsi
    push        rdi
%endmacro

; Exact mirror image of the WIN64 collect_args above.
%imacro uncollect_args 1
    pop         rdi
    pop         rsi
%if %1 > 5
    pop         r15
%endif
%if %1 > 4
    pop         r14
%endif
%if %1 > 3
    pop         r13
%endif
%if %1 > 2
    pop         r12
%endif
    movaps      xmm7, XMMWORD [rsp]
    add         rsp, SIZEOF_XMMWORD
    movaps      xmm6, XMMWORD [rsp]
    add         rsp, SIZEOF_XMMWORD
%endmacro

; Reserves N*SIZEOF_XMMWORD bytes of stack and stores xmm8.. into it
; (xmm8 is always stored).
%imacro push_xmm 1
    sub         rsp, %1 * SIZEOF_XMMWORD
    movaps      XMMWORD [rsp+0*SIZEOF_XMMWORD], xmm8
%if %1 > 1
    movaps      XMMWORD [rsp+1*SIZEOF_XMMWORD], xmm9
%endif
%if %1 > 2
    movaps      XMMWORD [rsp+2*SIZEOF_XMMWORD], xmm10
%endif
%if %1 > 3
    movaps      XMMWORD [rsp+3*SIZEOF_XMMWORD], xmm11
%endif
%endmacro

; Reloads the registers saved by push_xmm and releases the stack space.
%imacro pop_xmm 1
    movaps      xmm8, XMMWORD [rsp+0*SIZEOF_XMMWORD]
%if %1 > 1
    movaps      xmm9, XMMWORD [rsp+1*SIZEOF_XMMWORD]
%endif
%if %1 > 2
    movaps      xmm10, XMMWORD [rsp+2*SIZEOF_XMMWORD]
%endif
%if %1 > 3
    movaps      xmm11, XMMWORD [rsp+3*SIZEOF_XMMWORD]
%endif
    add         rsp, %1 * SIZEOF_XMMWORD
%endmacro

%else

; Non-Windows x86-64: arguments arrive in rdi, rsi, rdx, rcx, r8, r9.  Each
; destination register is pushed before it is overwritten.
%imacro collect_args 1
    push        r10
    mov         r10, rdi
%if %1 > 1
    push        r11
    mov         r11, rsi
%endif
%if %1 > 2
    push        r12
    mov         r12, rdx
%endif
%if %1 > 3
    push        r13
    mov         r13, rcx
%endif
%if %1 > 4
    push        r14
    mov         r14, r8
%endif
%if %1 > 5
    push        r15
    mov         r15, r9
%endif
%endmacro

; Pops the registers pushed by collect_args, in reverse order.
%imacro uncollect_args 1
%if %1 > 5
    pop         r15
%endif
%if %1 > 4
    pop         r14
%endif
%if %1 > 3
    pop         r13
%endif
%if %1 > 2
    pop         r12
%endif
%if %1 > 1
    pop         r11
%endif
    pop         r10
%endmacro

; Nothing is saved for xmm8-xmm11 outside WIN64, so these expand to nothing.
%imacro push_xmm 1
%endmacro

%imacro pop_xmm 1
%endmacro

%endif

%endif

; --------------------------------------------------------------------------
;  Defines picked up from the C headers
;
%include "jsimdcfg.inc"

; --------------------------------------------------------------------------