;
; jsimdext.inc - common declarations
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2010, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
;
; Copyright (C) 1999-2006, MIYASAKA Masaru.
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
;    claim that you wrote the original software. If you use this software
;    in a product, an acknowledgment in the product documentation would be
;    appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
;    misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
;
; [TAB8]

; ==========================================================================
;  System-dependent configurations
;
;  Exactly one of WIN32 / WIN64 / OBJ32 / ELF / AOUT / MACHO is expected on
;  the assembler command line (-D...).  Each branch defines the section
;  names SEG_TEXT / SEG_CONST and, where the object format supports
;  position-independent code, GOT_SYMBOL.  Some branches also define EXTN.

%ifdef WIN32    ; ----(nasm -fwin32 -DWIN32 ...)--------
; * Microsoft Visual C++
; * MinGW (Minimalist GNU for Windows)
; * CygWin
; * LCC-Win32

; -- segment definition --
;
%ifdef __YASM_VER__
%define SEG_TEXT    .text align=16
%define SEG_CONST   .rdata align=16
%else
%define SEG_TEXT    .text align=16 public use32 class=CODE
%define SEG_CONST   .rdata align=16 public use32 class=CONST
%endif

%elifdef WIN64  ; ----(nasm -fwin64 -DWIN64 ...)--------
; * Microsoft Visual C++

; -- segment definition --
;
%ifdef __YASM_VER__
%define SEG_TEXT    .text align=16
%define SEG_CONST   .rdata align=16
%else
%define SEG_TEXT    .text align=16 public use64 class=CODE
%define SEG_CONST   .rdata align=16 public use64 class=CONST
%endif
%define EXTN(name)  name                ; foo() -> foo

%elifdef OBJ32  ; ----(nasm -fobj -DOBJ32 ...)----------
; * Borland C++ (Win32)

; -- segment definition --
;
%define SEG_TEXT    _text align=16 public use32 class=CODE
%define SEG_CONST   _data align=16 public use32 class=DATA

%elifdef ELF    ; ----(nasm -felf[64] -DELF ...)------------
; * Linux
; * *BSD family Unix using elf format
; * Unix System V, including Solaris x86, UnixWare and SCO Unix

; mark stack as non-executable
section .note.GNU-stack noalloc noexec nowrite progbits

; -- segment definition --
;
%ifdef __x86_64__
%define SEG_TEXT    .text progbits align=16
%define SEG_CONST   .rodata progbits align=16
%else
%define SEG_TEXT    .text progbits alloc exec nowrite align=16
%define SEG_CONST   .rodata progbits alloc noexec nowrite align=16
%endif

; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_       ; ELF supports PIC
%define EXTN(name)  name                ; foo() -> foo

%elifdef AOUT   ; ----(nasm -faoutb/aout -DAOUT ...)----
; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)

; -- segment definition --
;
%define SEG_TEXT    .text
%define SEG_CONST   .data

; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_      ; BSD-style a.out supports PIC

%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)

; -- segment definition --
;
%define SEG_TEXT    .text ;align=16     ; nasm doesn't accept align=16. why?
%define SEG_CONST   .rodata align=16

; The generation of position-independent code (PIC) is the default on Darwin.
;
%define PIC
%define GOT_SYMBOL  _MACHO_PIC_         ; Mach-O style code-relative addressing

%else           ; ----(Other case)----------------------

; -- segment definition --
;
%define SEG_TEXT    .text
%define SEG_CONST   .data

%endif  ; ----------------------------------------------

; ==========================================================================

; --------------------------------------------------------------------------
;  Common types
;
;  Each type comes as a triple: the NASM size keyword, SIZEOF_<type> in
;  bytes and <type>_BIT in bits.  The SIZEOF_*/*_BIT names used here are
;  defined further down; that is fine because %define bodies are expanded
;  at the point of use, not at the point of definition.
;
%ifdef __x86_64__
%define POINTER         qword           ; general pointer type
%define SIZEOF_POINTER  SIZEOF_QWORD    ; sizeof(POINTER)
%define POINTER_BIT     QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%else
%define POINTER         dword           ; general pointer type
%define SIZEOF_POINTER  SIZEOF_DWORD    ; sizeof(POINTER)
%define POINTER_BIT     DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%endif

%define INT             dword           ; signed integer type
%define SIZEOF_INT      SIZEOF_DWORD    ; sizeof(INT)
%define INT_BIT         DWORD_BIT       ; sizeof(INT)*BYTE_BIT

%define FP32            dword           ; IEEE754 single
%define SIZEOF_FP32     SIZEOF_DWORD    ; sizeof(FP32)
%define FP32_BIT        DWORD_BIT       ; sizeof(FP32)*BYTE_BIT

%define MMWORD          qword           ; int64  (MMX register)
%define SIZEOF_MMWORD   SIZEOF_QWORD    ; sizeof(MMWORD)
%define MMWORD_BIT      QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT

; NASM is buggy and doesn't properly handle operand sizes for SSE
; instructions, so for now we have to define XMMWORD as blank.
%define XMMWORD                         ; int128 (SSE register)
%define SIZEOF_XMMWORD  SIZEOF_OWORD    ; sizeof(XMMWORD)
%define XMMWORD_BIT     OWORD_BIT       ; sizeof(XMMWORD)*BYTE_BIT

; Similar hacks for when we load a dword or MMWORD into an xmm# register
%define XMM_DWORD
%define XMM_MMWORD

%define SIZEOF_BYTE     1               ; sizeof(BYTE)
%define SIZEOF_WORD     2               ; sizeof(WORD)
%define SIZEOF_DWORD    4               ; sizeof(DWORD)
%define SIZEOF_QWORD    8               ; sizeof(QWORD)
%define SIZEOF_OWORD    16              ; sizeof(OWORD)

%define BYTE_BIT        8               ; CHAR_BIT in C
%define WORD_BIT        16              ; sizeof(WORD)*BYTE_BIT
%define DWORD_BIT       32              ; sizeof(DWORD)*BYTE_BIT
%define QWORD_BIT       64              ; sizeof(QWORD)*BYTE_BIT
%define OWORD_BIT       128             ; sizeof(OWORD)*BYTE_BIT

; --------------------------------------------------------------------------
;  External Symbol Name
;
;  EXTN(name) maps a C-level symbol name to the name used in the object
;  file.  This fallback applies only when the platform branch above did not
;  already define EXTN (WIN64 and ELF do).
;
%ifndef EXTN
; Android Modification:
; The unmodified code from upstream appends an underscore to the front of
; "name" here.  It is unclear why.  Before removing the underscore, the
; code failed to link because the function names in the SIMD code did not
; match the callers (because of the extra underscore).  This fix only
; applies to x86 SIMD code.  x86_64 is handled properly by the code above.
;
; NOTE(review): upstream's underscore presumably targets object formats
; whose C symbols carry a leading '_' (e.g. Win32 cdecl, Mach-O, a.out);
; confirm before assembling this file for such a target.
%define EXTN(name)  name                ; foo() -> foo
%endif

; --------------------------------------------------------------------------
;  Macros for position-independent code (PIC) support
;
;  GOTOFF(got,sym)  address expression for "sym" relative to the base
;                   register "got" that get_GOT initialised
;  get_GOT reg      load the PIC base into "reg"
;  pushpic / poppic / movpic
;                   push / pop / mov that are emitted only in PIC builds
;                   and expand to nothing otherwise
;
; PIC is only honoured when the platform branch above supplied a GOT_SYMBOL.
%ifndef GOT_SYMBOL
%undef PIC
%endif

%ifdef PIC      ; -------------------------------------------

%ifidn GOT_SYMBOL,_MACHO_PIC_   ; --------------------

; At present, nasm doesn't seem to support PIC generation for Mach-O.
; The PIC support code below is a little tricky.

        SECTION SEG_CONST
const_base:

%define GOTOFF(got,sym) (got) + (sym) - const_base

; get_GOT reg  (Mach-O flavour)
;   Leaves the run-time address of const_base in "reg".
;   1. call/ret through %%geteip puts the return address (the address of
;      the "add" below) into ecx; the add turns that into the address of
;      %%ref.
;   2. The hand-encoded "lea" uses the rel32 operand that the assembler
;      generates for "jmp near const_base" as its disp32, which equals
;      (const_base - %%ref).  ebp is zeroed first so that the ebp*8 index
;      term contributes nothing.
;   ebp is saved and restored around the sequence.
%imacro get_GOT 1
        ; NOTE: this macro destroys ecx register.
        call    %%geteip
        add     ecx, byte (%%ref - $)
        jmp     short %%adjust
%%geteip:
        mov     ecx, POINTER [esp]
        ret
%%adjust:
        push    ebp
        xor     ebp,ebp                 ; ebp = 0
%ifidni %1,ebx  ; (%1 == ebx)
        ; db 0x8D,0x9C + jmp near const_base =
        ;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
        db      0x8D,0x9C               ; 8D,9C
        jmp     near const_base         ; E9,(const_base-%%ref)
%%ref:
%else           ; (%1 != ebx)
        ; db 0x8D,0x8C + jmp near const_base =
        ;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
        db      0x8D,0x8C               ; 8D,8C
        jmp     near const_base         ; E9,(const_base-%%ref)
%%ref:  mov     %1, ecx
%endif          ; (%1 == ebx)
        pop     ebp
%endmacro

%else   ; GOT_SYMBOL != _MACHO_PIC_ ----------------

%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff

; get_GOT reg  (GOT-based flavour)
;   The call/ret through %%geteip loads the return address into "reg";
;   the following add applies the ..gotpc-relative offset of GOT_SYMBOL.
%imacro get_GOT 1
        extern  GOT_SYMBOL
        call    %%geteip
        add     %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
        jmp     short %%done
%%geteip:
        mov     %1, POINTER [esp]
        ret
%%done:
%endmacro

%endif  ; GOT_SYMBOL == _MACHO_PIC_ ----------------

%imacro pushpic 1.nolist
        push    %1
%endmacro
%imacro poppic 1.nolist
        pop     %1
%endmacro
%imacro movpic 2.nolist
        mov     %1,%2
%endmacro

%else   ; !PIC -----------------------------------------

; Non-PIC build: symbols are addressed directly and the helper macros
; expand to nothing.
%define GOTOFF(got,sym) (sym)

%imacro get_GOT 1.nolist
%endmacro
%imacro pushpic 1.nolist
%endmacro
%imacro poppic 1.nolist
%endmacro
%imacro movpic 2.nolist
%endmacro

%endif  ; PIC -----------------------------------------

; --------------------------------------------------------------------------
;  Align the next instruction on {2,4,8,16,..}-byte boundary.
;  ".balign n,,m" in GNU as
;
; MSKLE(x,y)  mask for "x <= y" on the low 16 bits of x and y: evaluates to
;             a value with all low-order bits set when x <= y, and to 0
;             otherwise, so it can gate a count with '&'.
; FILLB(b,n)  number of bytes from address b up to the next n-byte boundary,
;             measured from the section start ($$); n must be a power of 2.
;
%define MSKLE(x,y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
%define FILLB(b,n)  (($$-(b)) & ((n)-1))

; alignx n [, max]
;   %1 = alignment in bytes (2, 4, 8, 16, ...)
;   %2 = largest amount of padding that may be emitted (default 0xFFFF);
;        when more than %2 bytes would be needed, nothing is emitted.
;
;   Every "times" count below is gated by MSKLE(FILLB(%%bs,%1),%2), i.e. by
;   whether the total padding measured from the macro's start label fits
;   within %2.  The remaining distance is re-read from the current position
;   ($) on each line, so the statements must stay in this order: each line
;   consumes part of the gap and the following lines fill what is left.
%imacro alignx 1-2.nolist 0xFFFF
        ; 16 or more bytes still missing: fill the whole gap with 1-byte nops.
%%bs:   times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
               db 0x90                               ; nop
        ; Otherwise use progressively shorter filler encodings.
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
               db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
               db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
               db 0x8D,0xAD,0x00,0x00,0x00,0x00      ; lea ebp,[ebp+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
               db 0x8D,0x6C,0x25,0x00                ; lea ebp,[ebp+0x00]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
               db 0x8D,0x6D,0x00                     ; lea ebp,[ebp+0x00]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
               db 0x8B,0xED                          ; mov ebp,ebp
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
               db 0x90                               ; nop
%endmacro

; Align the next data on {2,4,8,16,..}-byte boundary.
;
; alignz n
;   %1 = alignment in bytes; the gap is padded with zero bytes.
%imacro alignz 1.nolist
        align   %1, db 0                ; filling zeros
%endmacro

; collect_args / uncollect_args  (x86-64 only)
;   collect_args copies the incoming integer arguments into r10..r15 so
;   that the same register names can be used under both calling
;   conventions; uncollect_args undoes the register saves that
;   collect_args made, in reverse order.
;   Both macros change rsp, so they must be used as a matched pair.
%ifdef __x86_64__

%ifdef WIN64

; Microsoft x64 flavour: arguments 1-4 arrive in rcx, rdx, r8, r9.
%imacro collect_args 0
        push    r12
        push    r13
        push    r14
        push    r15
        mov     r10, rcx
        mov     r11, rdx
        mov     r12, r8
        mov     r13, r9
        ; NOTE(review): rax is not set by this macro; presumably the caller
        ; loads it with its entry-time stack pointer so that these two loads
        ; fetch stack arguments 5 and 6 - confirm against the call sites.
        mov     r14, [rax+48]
        mov     r15, [rax+56]
        ; rsi, rdi, xmm6 and xmm7 are callee-saved in this ABI.
        push    rsi
        push    rdi
        sub     rsp, SIZEOF_XMMWORD
        movaps  XMMWORD [rsp], xmm6
        sub     rsp, SIZEOF_XMMWORD
        movaps  XMMWORD [rsp], xmm7
%endmacro

; Restores, in reverse order, everything collect_args saved.
%imacro uncollect_args 0
        movaps  xmm7, XMMWORD [rsp]
        add     rsp, SIZEOF_XMMWORD
        movaps  xmm6, XMMWORD [rsp]
        add     rsp, SIZEOF_XMMWORD
        pop     rdi
        pop     rsi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
%endmacro

%else

; System V AMD64 flavour: arguments 1-6 arrive in rdi, rsi, rdx, rcx, r8, r9.
%imacro collect_args 0
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        mov     r10, rdi
        mov     r11, rsi
        mov     r12, rdx
        mov     r13, rcx
        mov     r14, r8
        mov     r15, r9
%endmacro

; Pops r15..r10, the reverse of the pushes in collect_args.
%imacro uncollect_args 0
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
%endmacro

%endif

%endif

; --------------------------------------------------------------------------
;  Defines picked up from the C headers
;
%include "jsimdcfg.inc"

; --------------------------------------------------------------------------