;
; jsimdext.inc - common declarations
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2010 D. R. Commander
;
; Based on
; x86 SIMD extension for IJG JPEG library - version 1.02
;
; Copyright (C) 1999-2006, MIYASAKA Masaru.
;
; This software is provided 'as-is', without any express or implied
; warranty.  In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
;    claim that you wrote the original software. If you use this software
;    in a product, an acknowledgment in the product documentation would be
;    appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
;    misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
;
; [TAB8]

; ==========================================================================
;  System-dependent configurations
;
;  One branch of the chain below is selected by the object-format symbol
;  given on the assembler command line (-DWIN32, -DWIN64, -DOBJ32, -DELF,
;  -DAOUT, -DMACHO); the final %else branch is used when none is defined.
;  Every branch defines SEG_TEXT and SEG_CONST.  Some branches additionally
;  define GOT_SYMBOL, PIC and/or EXTN(name).

%ifdef WIN32    ; ----(nasm -fwin32 -DWIN32 ...)--------
; * Microsoft Visual C++
; * MinGW (Minimalist GNU for Windows)
; * CygWin
; * LCC-Win32

; -- segment definition --
;
%ifdef __YASM_VER__
%define SEG_TEXT    .text  align=16
%define SEG_CONST   .rdata align=16
%else
%define SEG_TEXT    .text  align=16 public use32 class=CODE
%define SEG_CONST   .rdata align=16 public use32 class=CONST
%endif

%elifdef WIN64  ; ----(nasm -fwin64 -DWIN64 ...)--------
; * Microsoft Visual C++

; -- segment definition --
;
%ifdef __YASM_VER__
%define SEG_TEXT    .text  align=16
%define SEG_CONST   .rdata align=16
%else
%define SEG_TEXT    .text  align=16 public use64 class=CODE
%define SEG_CONST   .rdata align=16 public use64 class=CONST
%endif
%define EXTN(name)  name                        ; foo() -> foo

%elifdef OBJ32  ; ----(nasm -fobj -DOBJ32 ...)----------
; * Borland C++ (Win32)

; -- segment definition --
;
%define SEG_TEXT    _text align=16 public use32 class=CODE
%define SEG_CONST   _data align=16 public use32 class=DATA

%elifdef ELF    ; ----(nasm -felf[64] -DELF ...)------------
; * Linux
; * *BSD family Unix using elf format
; * Unix System V, including Solaris x86, UnixWare and SCO Unix

; mark stack as non-executable
section .note.GNU-stack noalloc noexec nowrite progbits

; -- segment definition --
;
%ifdef __x86_64__
%define SEG_TEXT    .text   progbits align=16
%define SEG_CONST   .rodata progbits align=16
%else
%define SEG_TEXT    .text   progbits alloc exec   nowrite align=16
%define SEG_CONST   .rodata progbits alloc noexec nowrite align=16
%endif

; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_       ; ELF supports PIC
%define EXTN(name)  name                        ; foo() -> foo

%elifdef AOUT   ; ----(nasm -faoutb/aout -DAOUT ...)----
; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)

; -- segment definition --
;
%define SEG_TEXT    .text
%define SEG_CONST   .data

; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_      ; BSD-style a.out supports PIC

%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)

; -- segment definition --
;
%define SEG_TEXT    .text  ;align=16    ; nasm doesn't accept align=16. why?
%define SEG_CONST   .rodata align=16

; The generation of position-independent code (PIC) is the default on Darwin.
;
%define PIC
%define GOT_SYMBOL  _MACHO_PIC_         ; Mach-O style code-relative addressing

%else           ; ----(Other case)----------------------

; -- segment definition --
;
%define SEG_TEXT    .text
%define SEG_CONST   .data

%endif  ; ----------------------------------------------

; ==========================================================================

; --------------------------------------------------------------------------
;  Common types
;
;  Size keywords and their byte/bit widths.  The SIZEOF_* and *_BIT aliases
;  below refer to SIZEOF_{BYTE,WORD,DWORD,QWORD,OWORD} and the matching
;  *_BIT constants defined at the end of this section.
;
%ifdef __x86_64__
%define POINTER                 qword           ; general pointer type
%define SIZEOF_POINTER          SIZEOF_QWORD    ; sizeof(POINTER)
%define POINTER_BIT             QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%else
%define POINTER                 dword           ; general pointer type
%define SIZEOF_POINTER          SIZEOF_DWORD    ; sizeof(POINTER)
%define POINTER_BIT             DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%endif

%define INT                     dword           ; signed integer type
%define SIZEOF_INT              SIZEOF_DWORD    ; sizeof(INT)
%define INT_BIT                 DWORD_BIT       ; sizeof(INT)*BYTE_BIT

%define FP32                    dword           ; IEEE754 single
%define SIZEOF_FP32             SIZEOF_DWORD    ; sizeof(FP32)
%define FP32_BIT                DWORD_BIT       ; sizeof(FP32)*BYTE_BIT

%define MMWORD                  qword           ; int64  (MMX register)
%define SIZEOF_MMWORD           SIZEOF_QWORD    ; sizeof(MMWORD)
%define MMWORD_BIT              QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT

; NASM is buggy and doesn't properly handle operand sizes for SSE
; instructions, so for now we have to define XMMWORD as blank.
%define XMMWORD                                 ; int128 (SSE register)
%define SIZEOF_XMMWORD          SIZEOF_OWORD    ; sizeof(XMMWORD)
%define XMMWORD_BIT             OWORD_BIT       ; sizeof(XMMWORD)*BYTE_BIT

; Similar hacks for when we load a dword or MMWORD into an xmm# register
%define XMM_DWORD
%define XMM_MMWORD

%define SIZEOF_BYTE             1               ; sizeof(BYTE)
%define SIZEOF_WORD             2               ; sizeof(WORD)
%define SIZEOF_DWORD            4               ; sizeof(DWORD)
%define SIZEOF_QWORD            8               ; sizeof(QWORD)
%define SIZEOF_OWORD            16              ; sizeof(OWORD)

%define BYTE_BIT                8               ; CHAR_BIT in C
%define WORD_BIT                16              ; sizeof(WORD)*BYTE_BIT
%define DWORD_BIT               32              ; sizeof(DWORD)*BYTE_BIT
%define QWORD_BIT               64              ; sizeof(QWORD)*BYTE_BIT
%define OWORD_BIT               128             ; sizeof(OWORD)*BYTE_BIT

; --------------------------------------------------------------------------
;  External Symbol Name
;
;  EXTN(name) maps a C-level function name to the symbol name used in the
;  object file.  The WIN64 and ELF branches above already define it; this is
;  the fallback for every other configuration.
;
%ifndef EXTN
; Android Modification:
; The unmodified code from upstream appends an underscore to the front of
; "name" here.  It is unclear why.  Before removing the underscore, the
; code failed to link because the function names in the SIMD code did not
; match the callers (because of the extra underscore).  This fix only
; applies to x86 SIMD code.  x86_64 is handled properly by the code above.
;
; NOTE(review): targets whose C ABI decorates symbols with a leading
; underscore (e.g. Mach-O) presumably need the upstream "_name" form back;
; confirm before building this file for such a target.
%define EXTN(name)   name
%endif

; --------------------------------------------------------------------------
;  Macros for position-independent code (PIC) support
;
;  PIC is only honoured when the selected configuration defined GOT_SYMBOL;
;  otherwise any -DPIC from the command line is dropped here.
;
%ifndef GOT_SYMBOL
%undef PIC
%endif

%ifdef PIC      ; -------------------------------------------

%ifidn GOT_SYMBOL,_MACHO_PIC_ ; --------------------

; At present, nasm doesn't seem to support PIC generation for Mach-O.
; The PIC support code below is a little tricky.
; (Mach-O PIC variant)  const_base labels the current position in the constant
; section (SEG_CONST).  GOTOFF(got,sym) addresses "sym" as an offset from
; const_base added to the base value in "got", so "got" must hold the run-time
; address of const_base, which is what get_GOT computes below.

        SECTION SEG_CONST
const_base:

%define GOTOFF(got,sym) (got) + (sym) - const_base

; get_GOT reg
;   Loads the run-time address of const_base into "reg".
;   Clobbers ecx (see NOTE); ebp is saved and restored around the sequence.
%imacro get_GOT 1
        ; NOTE: this macro destroys ecx register.
        call    %%geteip                ; ecx = return address (address of the add below)
        add     ecx, byte (%%ref - $)   ; ecx = address of %%ref
        jmp     short %%adjust
%%geteip:
        mov     ecx, POINTER [esp]      ; fetch the return address pushed by call
        ret
%%adjust:
        push    ebp
        xor     ebp,ebp         ; ebp = 0
%ifidni %1,ebx  ; (%1 == ebx)
        ; The two opcode bytes below followed by the bytes of the near jmp
        ; are meant to be decoded as one lea instruction; the jmp is only
        ; there to make the assembler emit the rel32 displacement
        ; (const_base - %%ref):
        ; db 0x8D,0x9C + jmp near const_base =
        ;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
        db      0x8D,0x9C               ; 8D,9C
        jmp     near const_base         ; E9,(const_base-%%ref)
%%ref:
%else   ; (%1 != ebx)
        ; Same trick, but the result lands in ecx and is then copied to %1:
        ; db 0x8D,0x8C + jmp near const_base =
        ;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
        db      0x8D,0x8C               ; 8D,8C
        jmp     near const_base         ; E9,(const_base-%%ref)
%%ref:  mov     %1, ecx
%endif ; (%1 == ebx)
        pop     ebp
%endmacro

%else   ; GOT_SYMBOL != _MACHO_PIC_ ----------------

; (GOT-based PIC variant)  GOTOFF(got,sym) addresses "sym" relative to the
; base register "got" using NASM's "wrt ..gotoff" relocation.

%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff

; get_GOT reg
;   Declares GOT_SYMBOL as external, loads the address of the instruction
;   following the call into "reg", then adds the "wrt ..gotpc" expression to it.
;   NOTE(review): this follows the NASM manual's idiom for obtaining the GOT
;   address in 32-bit PIC code - confirm against the manual.
%imacro get_GOT 1
        extern  GOT_SYMBOL
        call    %%geteip                ; %1 = return address (address of the add below)
        add     %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
        jmp     short %%done
%%geteip:
        mov     %1, POINTER [esp]       ; fetch the return address pushed by call
        ret
%%done:
%endmacro

%endif  ; GOT_SYMBOL == _MACHO_PIC_ ----------------

; pushpic/poppic/movpic: in PIC builds these expand to a plain push/pop/mov
; of their operand(s); in non-PIC builds (below) they expand to nothing.
%imacro pushpic 1.nolist
        push    %1
%endmacro
%imacro poppic  1.nolist
        pop     %1
%endmacro
%imacro movpic  2.nolist
        mov     %1,%2
%endmacro

%else   ; !PIC -----------------------------------------

; Without PIC a symbol is addressed directly and no base register is needed,
; so GOTOFF() reduces to the symbol itself and the helper macros are empty.

%define GOTOFF(got,sym) (sym)

%imacro get_GOT 1.nolist
%endmacro
%imacro pushpic 1.nolist
%endmacro
%imacro poppic  1.nolist
%endmacro
%imacro movpic  2.nolist
%endmacro

%endif  ;  PIC -----------------------------------------

; --------------------------------------------------------------------------
;  Align the next instruction on {2,4,8,16,..}-byte boundary.
;  ".balign n,,m" in GNU as
;
;  alignx n [,m]
;    %1 (n) = alignment in bytes, %2 (m) = second argument of the .balign
;             form above; defaults to 0xFFFF when omitted.
;
;  Helper expressions:
;    MSKLE(x,y)  evaluates to 0 when x > y and to a non-zero mask with all
;                low bits set when x <= y (both operands are reduced to
;                16 bits first), so it can gate a "times" count with "&".
;    FILLB(b,n)  number of bytes from position b up to the next n-byte
;                boundary, measured from the section start ($$); assumes n
;                is a power of two.
;
;  Every "times" count in alignx is gated by MSKLE(FILLB(%%bs,%1),%2), i.e.
;  nothing is emitted when the fill needed at the macro start exceeds %2.
;  The first line emits single-byte nops and is additionally gated by
;  MSKLE(16,FILLB($,%1)); the following lines emit the multi-byte no-op
;  encodings named in their comments, longest first.
;
%define MSKLE(x,y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
%define FILLB(b,n)  (($$-(b)) & ((n)-1))

%imacro alignx 1-2.nolist 0xFFFF
%%bs:   times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
               db 0x90                          ; nop
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
               db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00    ; lea ebx,[ebx+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
               db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00    ; lea ebp,[ebp+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
               db 0x8D,0xAD,0x00,0x00,0x00,0x00         ; lea ebp,[ebp+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
               db 0x8D,0x6C,0x25,0x00                   ; lea ebp,[ebp+0x00]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
               db 0x8D,0x6D,0x00                        ; lea ebp,[ebp+0x00]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
               db 0x8B,0xED                             ; mov ebp,ebp
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
               db 0x90                                  ; nop
%endmacro

; Align the next data on {2,4,8,16,..}-byte boundary.
;
; alignz n
;   Pads with zero bytes up to the next n-byte boundary.
%imacro alignz 1.nolist
        align %1, db 0          ; filling zeros
%endmacro

; --------------------------------------------------------------------------
;  64-bit argument marshalling
;
;  collect_args copies the incoming integer arguments into r10..r15, so that
;  code using these macros can read its first six arguments from the same
;  registers under both the Windows and the non-Windows convention.
;  uncollect_args restores everything collect_args pushed, in reverse order.
;
%ifdef __x86_64__

%ifdef WIN64

; collect_args (WIN64)
;   In:    rcx, rdx, r8, r9 = arguments 1-4; arguments 5 and 6 are read
;          from [rax+48] and [rax+56].
;          NOTE(review): rax is not set inside this macro - the caller must
;          load it beforehand (presumably with the stack pointer value on
;          entry to the frame); confirm against the call sites.
;   Out:   r10..r15 = arguments 1-6.
;   Saves: r12-r15, rsi, rdi, xmm6 and xmm7 on the stack.
;          NOTE(review): movaps requires rsp to be 16-byte aligned at the
;          two XMM stores - confirm the caller's stack layout guarantees it.
%imacro collect_args 0
        push r12
        push r13
        push r14
        push r15
        mov r10, rcx
        mov r11, rdx
        mov r12, r8
        mov r13, r9
        mov r14, [rax+48]
        mov r15, [rax+56]
        push rsi
        push rdi
        sub     rsp, SIZEOF_XMMWORD
        movaps  XMMWORD [rsp], xmm6
        sub     rsp, SIZEOF_XMMWORD
        movaps  XMMWORD [rsp], xmm7
%endmacro

; uncollect_args (WIN64)
;   Restores xmm7, xmm6, rdi, rsi, r15, r14, r13, r12 - the exact reverse of
;   the save order used by collect_args.
%imacro uncollect_args 0
        movaps  xmm7, XMMWORD [rsp]
        add     rsp, SIZEOF_XMMWORD
        movaps  xmm6, XMMWORD [rsp]
        add     rsp, SIZEOF_XMMWORD
        pop rdi
        pop rsi
        pop r15
        pop r14
        pop r13
        pop r12
%endmacro

%else

; collect_args (non-WIN64)
;   In:    rdi, rsi, rdx, rcx, r8, r9 = arguments 1-6.
;   Out:   r10..r15 = arguments 1-6.
;   Saves: r10-r15 on the stack (six pushes).
%imacro collect_args 0
        push r10
        push r11
        push r12
        push r13
        push r14
        push r15
        mov r10, rdi
        mov r11, rsi
        mov r12, rdx
        mov r13, rcx
        mov r14, r8
        mov r15, r9
%endmacro

; uncollect_args (non-WIN64)
;   Pops r15..r10, the reverse of the push order in collect_args.
%imacro uncollect_args 0
        pop r15
        pop r14
        pop r13
        pop r12
        pop r11
        pop r10
%endmacro

%endif

%endif

; --------------------------------------------------------------------------
;  Defines picked up from the C headers
;
%include "jsimdcfg.inc"

; --------------------------------------------------------------------------