1;
2; jsimdext.inc - common declarations
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2010, D. R. Commander.
6;
7; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
8;
9; Copyright (C) 1999-2006, MIYASAKA Masaru.
10;
11; This software is provided 'as-is', without any express or implied
12; warranty.  In no event will the authors be held liable for any damages
13; arising from the use of this software.
14;
15; Permission is granted to anyone to use this software for any purpose,
16; including commercial applications, and to alter it and redistribute it
17; freely, subject to the following restrictions:
18;
19; 1. The origin of this software must not be misrepresented; you must not
20;    claim that you wrote the original software. If you use this software
21;    in a product, an acknowledgment in the product documentation would be
22;    appreciated but is not required.
23; 2. Altered source versions must be plainly marked as such, and must not be
24;    misrepresented as being the original software.
25; 3. This notice may not be removed or altered from any source distribution.
26;
27; [TAB8]
28
29; ==========================================================================
30;  System-dependent configurations
31
; Platform selection.  The first of WIN32 / WIN64 / OBJ32 / ELF / AOUT /
; MACHO that is defined on the assembler command line picks a branch; if
; none is defined the final %else branch is used.  Every branch defines
;   SEG_TEXT  - section name (plus attributes) used for code
;   SEG_CONST - section name (plus attributes) used for constants
; Some branches additionally define EXTN(name), GOT_SYMBOL and/or PIC.
%ifdef WIN32    ; ----(nasm -fwin32 -DWIN32 ...)--------
; * Microsoft Visual C++
; * MinGW (Minimalist GNU for Windows)
; * CygWin
; * LCC-Win32
;
; This branch defines neither EXTN nor GOT_SYMBOL.

; -- segment definition --
;
; When __YASM_VER__ is defined, only the alignment attribute is given.
%ifdef __YASM_VER__
%define SEG_TEXT    .text  align=16
%define SEG_CONST   .rdata align=16
%else
%define SEG_TEXT    .text  align=16 public use32 class=CODE
%define SEG_CONST   .rdata align=16 public use32 class=CONST
%endif

%elifdef WIN64  ; ----(nasm -fwin64 -DWIN64 ...)--------
; * Microsoft Visual C++

; -- segment definition --
;
; When __YASM_VER__ is defined, only the alignment attribute is given.
%ifdef __YASM_VER__
%define SEG_TEXT    .text  align=16
%define SEG_CONST   .rdata align=16
%else
%define SEG_TEXT    .text  align=16 public use64 class=CODE
%define SEG_CONST   .rdata align=16 public use64 class=CONST
%endif
; External symbols are used unchanged (no prefix is added).
%define EXTN(name)  name                        ; foo() -> foo

%elifdef OBJ32  ; ----(nasm -fobj -DOBJ32 ...)----------
; * Borland C++ (Win32)
;
; This branch defines neither EXTN nor GOT_SYMBOL.

; -- segment definition --
;
%define SEG_TEXT    _text  align=16 public use32 class=CODE
%define SEG_CONST   _data  align=16 public use32 class=DATA

%elifdef ELF    ; ----(nasm -felf[64] -DELF ...)------------
; * Linux
; * *BSD family Unix using elf format
; * Unix System V, including Solaris x86, UnixWare and SCO Unix

; mark stack as non-executable
; (this emits an empty .note.GNU-stack section into every object that
; includes this file with ELF defined)
section .note.GNU-stack noalloc noexec nowrite progbits

; -- segment definition --
;
; The __x86_64__ variant passes a shorter attribute list than the
; 32-bit variant.
%ifdef __x86_64__
%define SEG_TEXT    .text   progbits align=16
%define SEG_CONST   .rodata progbits align=16
%else
%define SEG_TEXT    .text   progbits alloc exec   nowrite align=16
%define SEG_CONST   .rodata progbits alloc noexec nowrite align=16
%endif

; To make the code position-independent, append -DPIC to the commandline
;
; GOT_SYMBOL being defined is what allows PIC to stay enabled further down.
%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_       ; ELF supports PIC
%define EXTN(name)  name                        ; foo() -> foo

%elifdef AOUT   ; ----(nasm -faoutb/aout -DAOUT ...)----
; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)
;
; This branch does not define EXTN.

; -- segment definition --
;
; Constants are placed in .data here; no alignment attribute is given.
%define SEG_TEXT    .text
%define SEG_CONST   .data

; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_      ; BSD-style a.out supports PIC

%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)
;
; This branch does not define EXTN.

; -- segment definition --
;
; Note: the align=16 attribute on SEG_TEXT is commented out, so the text
; section is declared without an explicit alignment.
%define SEG_TEXT    .text  ;align=16    ; nasm doesn't accept align=16. why?
%define SEG_CONST   .rodata align=16

; The generation of position-independent code (PIC) is the default on Darwin.
;
; PIC is therefore forced on here, and GOT_SYMBOL is set to the marker
; value _MACHO_PIC_, which the PIC macro section tests for with %ifidn.
%define PIC
%define GOT_SYMBOL  _MACHO_PIC_         ; Mach-O style code-relative addressing

%else           ; ----(Other case)----------------------

; No platform symbol was defined: plain section names, no EXTN and no
; GOT_SYMBOL (so PIC is disabled further down).

; -- segment definition --
;
%define SEG_TEXT    .text
%define SEG_CONST   .data

%endif  ; ----------------------------------------------
127
128; ==========================================================================
129
130; --------------------------------------------------------------------------
131;  Common types
132;
; Primitive operand sizes in bytes.  The aliases further down refer to
; these names; single-line macros are expanded where they are used, so the
; relative order of the definitions does not matter.
%define SIZEOF_BYTE     1                       ; sizeof(BYTE)
%define SIZEOF_WORD     2                       ; sizeof(WORD)
%define SIZEOF_DWORD    4                       ; sizeof(DWORD)
%define SIZEOF_QWORD    8                       ; sizeof(QWORD)
%define SIZEOF_OWORD    16                      ; sizeof(OWORD)

; Matching widths in bits.
%define BYTE_BIT        8                       ; CHAR_BIT in C
%define WORD_BIT        16                      ; sizeof(WORD)*BYTE_BIT
%define DWORD_BIT       32                      ; sizeof(DWORD)*BYTE_BIT
%define QWORD_BIT       64                      ; sizeof(QWORD)*BYTE_BIT
%define OWORD_BIT       128                     ; sizeof(OWORD)*BYTE_BIT

; General pointer type: qword when __x86_64__ is defined, dword otherwise.
%ifdef __x86_64__
%define POINTER         qword
%define SIZEOF_POINTER  SIZEOF_QWORD            ; sizeof(POINTER)
%define POINTER_BIT     QWORD_BIT               ; sizeof(POINTER)*BYTE_BIT
%else
%define POINTER         dword
%define SIZEOF_POINTER  SIZEOF_DWORD            ; sizeof(POINTER)
%define POINTER_BIT     DWORD_BIT               ; sizeof(POINTER)*BYTE_BIT
%endif

; Signed integer type.
%define INT             dword
%define SIZEOF_INT      SIZEOF_DWORD            ; sizeof(INT)
%define INT_BIT         DWORD_BIT               ; sizeof(INT)*BYTE_BIT

; IEEE754 single-precision float.
%define FP32            dword
%define SIZEOF_FP32     SIZEOF_DWORD            ; sizeof(FP32)
%define FP32_BIT        DWORD_BIT               ; sizeof(FP32)*BYTE_BIT

; int64, the width of an MMX register.
%define MMWORD          qword
%define SIZEOF_MMWORD   SIZEOF_QWORD            ; sizeof(MMWORD)
%define MMWORD_BIT      QWORD_BIT               ; sizeof(MMWORD)*BYTE_BIT

; int128, the width of an SSE register.  XMMWORD deliberately expands to
; nothing: NASM is buggy and doesn't properly handle operand sizes for SSE
; instructions, so no size keyword is emitted for now.
%define XMMWORD
%define SIZEOF_XMMWORD  SIZEOF_OWORD            ; sizeof(XMMWORD)
%define XMMWORD_BIT     OWORD_BIT               ; sizeof(XMMWORD)*BYTE_BIT

; Same workaround for loading a dword or an MMWORD into an xmm# register:
; both size markers expand to nothing.
%define XMM_DWORD
%define XMM_MMWORD
176
177; --------------------------------------------------------------------------
178;  External Symbol Name
179;
; Fallback used when the platform section above did not define EXTN.
; EXTN(name) maps a C function name to the symbol name used in this code.
%ifndef EXTN
; Android Modification:
; The unmodified code from upstream prepends an underscore to "name" here
; (foo() -> _foo).  It is unclear why.  Before removing the underscore, the
; code failed to link because the function names in the SIMD code did not
; match the callers (because of the extra underscore).  This fix only
; applies to x86 SIMD code.  x86_64 is handled properly by the code above.
%define EXTN(name)   name          ; foo() -> foo
%endif
189
190; --------------------------------------------------------------------------
191;  Macros for position-independent code (PIC) support
192;
; PIC can only stay enabled when the platform section defined GOT_SYMBOL;
; otherwise a -DPIC given on the command line is cancelled here.
%ifndef GOT_SYMBOL
%undef PIC
%endif

; Interface provided by this section (both PIC and non-PIC builds):
;   GOTOFF(got,sym) - address expression for constant "sym", given the
;                     register "got" that was loaded by get_GOT
;   get_GOT reg     - load the PIC base into "reg"
;   pushpic reg     - push "reg"       (expands to nothing without PIC)
;   poppic  reg     - pop "reg"        (expands to nothing without PIC)
;   movpic  dst,src - mov dst,src      (expands to nothing without PIC)
%ifdef PIC ; -------------------------------------------

%ifidn GOT_SYMBOL,_MACHO_PIC_ ; --------------------

; At present, nasm doesn't seem to support PIC generation for Mach-O.
; The PIC support code below is a little tricky.

; const_base is emitted into the constant section at the point where this
; file is included; it serves as the PIC base for this object.
        SECTION SEG_CONST
const_base:

; "got" holds the run-time address of const_base, so the address of "sym"
; is got + (sym - const_base).
%define GOTOFF(got,sym) (got) + (sym) - const_base

; get_GOT reg : load the run-time address of const_base into reg.
;
; How it works:
;  1. "call %%geteip" pushes the address of the following "add"; the
;     helper copies it from [esp] into ecx and returns.
;  2. "add ecx, byte (%%ref - $)" advances ecx from the address of the
;     "add" itself ($) to the address of %%ref.
;  3. The two "db" bytes are the opcode and ModRM of an "lea" whose SIB and
;     disp32 bytes are supplied by the encoding of "jmp near const_base"
;     (E9 + rel32).  The rel32 of that jump is const_base - %%ref, so the
;     combined instruction is lea r, [ecx+ebp*8+(const_base-%%ref)], which
;     yields const_base because ebp was zeroed beforehand.
; ebp is saved and restored around the sequence.  "add" and "xor" modify
; the flags.
%imacro get_GOT 1
        ; NOTE: this macro destroys ecx register.
        call    %%geteip
        add     ecx, byte (%%ref - $)
        jmp     short %%adjust
%%geteip:
        mov     ecx, POINTER [esp]
        ret
%%adjust:
        push    ebp
        xor     ebp,ebp         ; ebp = 0
%ifidni %1,ebx  ; (%1 == ebx)
        ; db 0x8D,0x9C + jmp near const_base =
        ;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
        db      0x8D,0x9C               ; 8D,9C
        jmp     near const_base         ; E9,(const_base-%%ref)
%%ref:
%else  ; (%1 != ebx)
        ; db 0x8D,0x8C + jmp near const_base =
        ;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
        db      0x8D,0x8C               ; 8D,8C
        jmp     near const_base         ; E9,(const_base-%%ref)
        ; the result is computed in ecx and then copied to the requested
        ; register
%%ref:  mov     %1, ecx
%endif ; (%1 == ebx)
        pop     ebp
%endmacro

%else   ; GOT_SYMBOL != _MACHO_PIC_ ----------------

; "got" holds the GOT address; "sym wrt ..gotoff" is the offset of sym
; from the GOT.
%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff

; get_GOT reg : load the address of GOT_SYMBOL into reg.
; "call %%geteip" pushes the address of the following "add"; the helper
; copies it from [esp] into reg and returns.  The "add" then applies the
; ..gotpc-relative correction.  "add" modifies the flags; no register other
; than reg is written.
%imacro get_GOT 1
        extern  GOT_SYMBOL
        call    %%geteip
        add     %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
        jmp     short %%done
%%geteip:
        mov     %1, POINTER [esp]
        ret
%%done:
%endmacro

%endif  ; GOT_SYMBOL == _MACHO_PIC_ ----------------

; In PIC builds these are plain push/pop/mov.  Callers use them for
; instructions that are only needed when a PIC base register is in use.
%imacro pushpic 1.nolist
        push    %1
%endmacro
%imacro poppic  1.nolist
        pop     %1
%endmacro
%imacro movpic  2.nolist
        mov     %1,%2
%endmacro

%else   ; !PIC -----------------------------------------

; Without PIC a symbol is addressed directly and the "got" argument is
; ignored.
%define GOTOFF(got,sym) (sym)

; All PIC helper macros expand to nothing in non-PIC builds.
%imacro get_GOT 1.nolist
%endmacro
%imacro pushpic 1.nolist
%endmacro
%imacro poppic  1.nolist
%endmacro
%imacro movpic  2.nolist
%endmacro

%endif  ;  PIC -----------------------------------------
277
278; --------------------------------------------------------------------------
279;  Align the next instruction on {2,4,8,16,..}-byte boundary.
280;  ".balign n,,m" in GNU as
281;
; MSKLE(x,y): "mask if less-or-equal".  Both operands are reduced to 16 bits.
;   When x <= y the difference y-x is non-negative and below 0x10000, so
;   its complement shifted right by 16 is a non-zero mask with all low bits
;   set; when x > y the difference is negative, its complement is below
;   0x10000, and the shift yields 0.
; FILLB(b,n): number of bytes needed to advance position b to the next
;   n-byte boundary, measured from the section start ($$); n is expected
;   to be a power of two because the remainder is taken with & (n-1).
%define MSKLE(x,y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
%define FILLB(b,n)  (($$-(b)) & ((n)-1))

; alignx n [,max] : pad the code up to the next n-byte boundary.
;   %1 = alignment in bytes
;   %2 = maximum number of padding bytes to emit (default 0xFFFF).  If the
;        padding required at the start of the macro (%%bs) exceeds %2, every
;        "times" count below is masked to 0 and nothing is emitted.
; Padding strategy (each "times" count is re-evaluated at the current
; position $):
;   - if 16 or more bytes are still needed, all of them are single-byte nops;
;   - otherwise the gap is filled with the longest sequences first: a 7-byte
;     sequence when at least 9 bytes remain, another 7-byte sequence when at
;     least 7 remain, then 6-, 4-, 3- and 2-byte sequences, and finally
;     single-byte nops for whatever is left.
; The multi-byte sequences are emitted as raw bytes; the comment on each
; line gives the instruction they are intended to encode.
; NOTE(review): the commented decodings use 32-bit registers (ebx/ebp) -
; confirm that these fillers are acceptable wherever alignx is used in
; 64-bit code.
%imacro alignx 1-2.nolist 0xFFFF
%%bs:   times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
               db 0x90                               ; nop
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
               db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
               db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
               db 0x8D,0xAD,0x00,0x00,0x00,0x00      ; lea ebp,[ebp+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
               db 0x8D,0x6C,0x25,0x00                ; lea ebp,[ebp+0x00]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
               db 0x8D,0x6D,0x00                     ; lea ebp,[ebp+0x00]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
               db 0x8B,0xED                          ; mov ebp,ebp
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
               db 0x90                               ; nop
%endmacro
303
; alignz n : pad data with zero bytes up to the next n-byte boundary
; (n = 2, 4, 8, 16, ...).
;   %1 = alignment in bytes
%imacro alignz 1.nolist
        align   %1, db 0x00             ; pad with 0x00 bytes
%endmacro
309
; 64-bit argument marshalling.
;   collect_args   - save registers, then copy the incoming arguments into
;                    r10..r15 (r10 = 1st argument ... r15 = 6th argument)
;   uncollect_args - restore everything collect_args saved; must be used
;                    with the stack pointer at the value collect_args left
; Neither macro takes parameters.
%ifdef __x86_64__

%ifdef WIN64

; Windows x64 variant.
;   In:    rcx, rdx, r8, r9 = arguments 1-4
;          rax = base for the stack-passed arguments; the macro reads
;                [rax+48] and [rax+56] and does not set rax itself, so the
;                including code must do so beforehand
;                (presumably rax = rsp right after "push rbp" - TODO confirm)
;   Out:   r10..r15 = arguments 1-6
;   Saves: r12-r15, rsi, rdi, xmm6, xmm7 (pushed in that order)
;   Note:  movaps needs rsp to be 16-byte aligned at the two stores.
%imacro collect_args 0
        mov     r10, rcx                ; argument 1
        mov     r11, rdx                ; argument 2
        push    r12
        mov     r12, r8                 ; argument 3
        push    r13
        mov     r13, r9                 ; argument 4
        push    r14
        mov     r14, [rax+48]           ; argument 5 (from the stack)
        push    r15
        mov     r15, [rax+56]           ; argument 6 (from the stack)
        push    rsi
        push    rdi
        sub     rsp, SIZEOF_XMMWORD
        movaps  XMMWORD [rsp], xmm6
        sub     rsp, SIZEOF_XMMWORD
        movaps  XMMWORD [rsp], xmm7
%endmacro

; Mirror image of the WIN64 collect_args: xmm7, xmm6, rdi, rsi, r15-r12
; are restored in the reverse of the order in which they were saved.
%imacro uncollect_args 0
        movaps  xmm7, XMMWORD [rsp]
        add     rsp, SIZEOF_XMMWORD
        movaps  xmm6, XMMWORD [rsp]
        add     rsp, SIZEOF_XMMWORD
        pop     rdi
        pop     rsi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
%endmacro

%else

; Non-Windows variant.
;   In:    rdi, rsi, rdx, rcx, r8, r9 = arguments 1-6
;   Out:   r10..r15 = arguments 1-6
;   Saves: r10-r15 (pushed in that order, 48 bytes in total)
%imacro collect_args 0
        push    r10
        mov     r10, rdi                ; argument 1
        push    r11
        mov     r11, rsi                ; argument 2
        push    r12
        mov     r12, rdx                ; argument 3
        push    r13
        mov     r13, rcx                ; argument 4
        push    r14
        mov     r14, r8                 ; argument 5
        push    r15
        mov     r15, r9                 ; argument 6
%endmacro

; Restores r15..r10 in the reverse of the order in which they were saved.
%imacro uncollect_args 0
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
%endmacro

%endif

%endif
375
376; --------------------------------------------------------------------------
377;  Defines picked up from the C headers
378;
379%include "jsimdcfg.inc"
380
381; --------------------------------------------------------------------------
382