1;
2; jsimdext.inc - common declarations
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2010, 2016, D. R. Commander.
6; Copyright (C) 2018, Matthieu Darbois.
7;
8; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
9;
10; Copyright (C) 1999-2006, MIYASAKA Masaru.
11;
12; This software is provided 'as-is', without any express or implied
13; warranty.  In no event will the authors be held liable for any damages
14; arising from the use of this software.
15;
16; Permission is granted to anyone to use this software for any purpose,
17; including commercial applications, and to alter it and redistribute it
18; freely, subject to the following restrictions:
19;
20; 1. The origin of this software must not be misrepresented; you must not
21;    claim that you wrote the original software. If you use this software
22;    in a product, an acknowledgment in the product documentation would be
23;    appreciated but is not required.
24; 2. Altered source versions must be plainly marked as such, and must not be
25;    misrepresented as being the original software.
26; 3. This notice may not be removed or altered from any source distribution.
27;
28; [TAB8]
29
30; ==========================================================================
31;  System-dependent configurations
32
; Per-object-format settings, selected by the format symbol passed on the
; assembler command line (-DWIN32, -DWIN64, -DOBJ32, -DELF, -DAOUT or
; -DMACHO; anything else takes the final fallback branch).
;
; Every branch defines SEG_TEXT and SEG_CONST, i.e. the section name plus
; attributes used for code and for constants.  Depending on the format a
; branch may additionally define EXTN (symbol decoration), GOT_SYMBOL and PIC.

%ifdef WIN32    ; ----(nasm -fwin32 -DWIN32 ...)--------
; Toolchains: Microsoft Visual C++, MinGW (Minimalist GNU for Windows),
;             CygWin, LCC-Win32

; -- segment definition --
; NASM gets the full attribute list, YASM only the alignment.
%ifndef __YASM_VER__
%define SEG_CONST  .rdata align=32 public use32 class=CONST
%define SEG_TEXT   .text  align=32 public use32 class=CODE
%else
%define SEG_CONST  .rdata align=32
%define SEG_TEXT   .text  align=32
%endif

%elifdef WIN64  ; ----(nasm -fwin64 -DWIN64 ...)--------
; Toolchains: Microsoft Visual C++

%define EXTN(name)  name                ; foo() -> foo

; -- segment definition --
; NASM gets the full attribute list, YASM only the alignment.
%ifndef __YASM_VER__
%define SEG_CONST   .rdata align=32 public use64 class=CONST
%define SEG_TEXT    .text  align=32 public use64 class=CODE
%else
%define SEG_CONST   .rdata align=32
%define SEG_TEXT    .text  align=32
%endif

%elifdef OBJ32  ; ----(nasm -fobj -DOBJ32 ...)----------
; Toolchains: Borland C++ (Win32)

; -- segment definition --
%define SEG_CONST  _data align=32 public use32 class=DATA
%define SEG_TEXT   _text align=32 public use32 class=CODE

%elifdef ELF    ; ----(nasm -felf[64] -DELF ...)------------
; Targets: Linux, the *BSD family using the ELF format, and Unix System V
;          (including Solaris x86, UnixWare and SCO Unix)

; Emit the note section that marks the stack as non-executable.
section .note.GNU-stack noalloc noexec nowrite progbits

%define EXTN(name)  name                   ; foo() -> foo

; ELF supports PIC; append -DPIC to the command line to request
; position-independent code.
%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_

; -- segment definition --
; The 32-bit build spells out alloc/exec/write flags, the 64-bit one does not.
%ifndef __x86_64__
%define SEG_CONST  .rodata progbits alloc noexec nowrite align=32
%define SEG_TEXT   .text   progbits alloc exec   nowrite align=32
%else
%define SEG_CONST  .rodata progbits align=32
%define SEG_TEXT   .text   progbits align=32
%endif

%elifdef AOUT   ; ----(nasm -faoutb/aout -DAOUT ...)----
; Targets: older Linux using the a.out format       (nasm -f aout  -DAOUT ...)
;          *BSD family Unix using the a.out format  (nasm -f aoutb -DAOUT ...)

; BSD-style a.out supports PIC; append -DPIC to the command line to request
; position-independent code.
%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_

; -- segment definition --
%define SEG_CONST  .data
%define SEG_TEXT   .text

%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
; Targets: NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)

; Position-independent code is the default on Darwin, so PIC is switched on
; here unconditionally and paired with the Mach-O style code-relative
; addressing marker.
%define GOT_SYMBOL  _MACHO_PIC_
%define PIC

; -- segment definition --
; The text section carries no align=32: nasm did not accept it here.
%define SEG_CONST  .rodata align=32
%define SEG_TEXT   .text  ;align=32

%else           ; ----(Other case)----------------------

; -- segment definition --
; Unknown format: plain section names, no attributes, no PIC support.
%define SEG_CONST  .data
%define SEG_TEXT   .text

%endif          ; ----------------------------------------------
128
129; ==========================================================================
130
131; --------------------------------------------------------------------------
132;  Common types
133;
; Storage-unit sizes in bytes.  Every SIZEOF_xxx of a derived type below
; resolves to one of these.
%define SIZEOF_BYTE   1                 ; sizeof(BYTE)
%define SIZEOF_WORD   2                 ; sizeof(WORD)
%define SIZEOF_DWORD  4                 ; sizeof(DWORD)
%define SIZEOF_QWORD  8                 ; sizeof(QWORD)
%define SIZEOF_OWORD  16                ; sizeof(OWORD)
%define SIZEOF_YWORD  32                ; sizeof(YWORD)

; Storage-unit widths in bits.
%define BYTE_BIT      8                 ; CHAR_BIT in C
%define WORD_BIT      16                ; sizeof(WORD)*BYTE_BIT
%define DWORD_BIT     32                ; sizeof(DWORD)*BYTE_BIT
%define QWORD_BIT     64                ; sizeof(QWORD)*BYTE_BIT
%define OWORD_BIT     128               ; sizeof(OWORD)*BYTE_BIT
%define YWORD_BIT     256               ; sizeof(YWORD)*BYTE_BIT

; General pointer type: dword unless assembling for x86-64, where it is qword.
%ifndef __x86_64__
%define POINTER         dword           ; general pointer type
%define SIZEOF_POINTER  SIZEOF_DWORD    ; sizeof(POINTER)
%define POINTER_BIT     DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%else
%define POINTER         qword           ; general pointer type
%define SIZEOF_POINTER  SIZEOF_QWORD    ; sizeof(POINTER)
%define POINTER_BIT     QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%endif

; Signed integer type.
%define INT             dword
%define SIZEOF_INT      SIZEOF_DWORD    ; sizeof(INT)
%define INT_BIT         DWORD_BIT       ; sizeof(INT)*BYTE_BIT

; IEEE754 single-precision value.
%define FP32            dword
%define SIZEOF_FP32     SIZEOF_DWORD    ; sizeof(FP32)
%define FP32_BIT        DWORD_BIT       ; sizeof(FP32)*BYTE_BIT

; int64 held in an MMX register.
%define MMWORD          qword
%define SIZEOF_MMWORD   SIZEOF_QWORD    ; sizeof(MMWORD)
%define MMWORD_BIT      QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT

; int128 held in an SSE register.  The size keyword itself expands to nothing
; because NASM does not handle operand sizes on SSE instructions properly;
; only the SIZEOF/BIT companions carry real values.
%define XMMWORD
%define SIZEOF_XMMWORD  SIZEOF_OWORD    ; sizeof(XMMWORD)
%define XMMWORD_BIT     OWORD_BIT       ; sizeof(XMMWORD)*BYTE_BIT

; int256 held in an AVX register.  The size keyword is likewise blank.
%define YMMWORD
%define SIZEOF_YMMWORD  SIZEOF_YWORD    ; sizeof(YMMWORD)
%define YMMWORD_BIT     YWORD_BIT       ; sizeof(YMMWORD)*BYTE_BIT

; Same blank-keyword workaround for loading a dword or an MMWORD into an
; xmm register.
%define XMM_DWORD
%define XMM_MMWORD
183
184; --------------------------------------------------------------------------
185;  External Symbol Name
186;
; EXTN(name) maps a C-level symbol name to its assembler-level spelling.
; Formats that did not define it earlier get a leading underscore.
%ifndef EXTN
%define EXTN(name)  _ %+ name           ; foo() -> _foo
%endif

; --------------------------------------------------------------------------
;  Hidden symbols
;
; GLOBAL_FUNCTION(name) / GLOBAL_DATA(name) expand to the "global" directive
; exporting EXTN(name).  ELF adds the symbol type and "hidden"; Mach-O adds
; "private_extern", but only when assembling with YASM.
%ifdef ELF      ; ----(nasm -felf[64] -DELF ...)--------
%define GLOBAL_DATA(name)      global EXTN(name):data hidden
%define GLOBAL_FUNCTION(name)  global EXTN(name):function hidden
%else
%ifdef MACHO    ; ----(nasm -fmacho -DMACHO ...)--------
%ifdef __YASM_VER__
%define GLOBAL_DATA(name)      global EXTN(name):private_extern
%define GLOBAL_FUNCTION(name)  global EXTN(name):private_extern
%endif
%endif
%endif

; Whatever is still undefined falls back to a plain "global".
%ifndef GLOBAL_DATA
%define GLOBAL_DATA(name)      global EXTN(name)
%endif
%ifndef GLOBAL_FUNCTION
%define GLOBAL_FUNCTION(name)  global EXTN(name)
%endif
210
211; --------------------------------------------------------------------------
212;  Macros for position-independent code (PIC) support
213;
; Interface offered to the SIMD sources, whether or not PIC is active:
;   GOTOFF(got, sym)  address expression for constant "sym", given the
;                     register "got" that get_GOT filled in
;   get_GOT reg       load the PIC base into reg
;   pushpic reg / poppic reg / movpic dst, src
;                     push / pop / mov that exist only in PIC builds
; Without PIC every macro expands to nothing and GOTOFF is just (sym).
;
; A PIC request is dropped when the object format supplied no GOT_SYMBOL.
%ifndef GOT_SYMBOL
%undef PIC
%endif

%ifdef PIC  ; -------------------------------------------

%ifidn GOT_SYMBOL, _MACHO_PIC_  ; --------------------

; At present, nasm doesn't seem to support PIC generation for Mach-O, so the
; base is the start of the constant section (const_base) and its run-time
; address is derived from the instruction pointer with a hand-encoded lea.

    SECTION     SEG_CONST
const_base:

%define GOTOFF(got, sym)  (got) + (sym) - const_base

%imacro get_GOT 1
    ; NOTE: this macro destroys the ecx register.
    ; The call pushes the address of the add below; the helper copies that
    ; return address into ecx and returns, and the add then advances ecx to
    ; the address of the anchor label.
    call        %%fetch_eip
    add         ecx, byte (%%anchor - $)
    jmp         short %%fixup
%%fetch_eip:
    mov         ecx, POINTER [esp]
    ret
%%fixup:
    push        ebp
    xor         ebp, ebp                ; ebp = 0, so the scaled index adds nothing
%ifidni %1, ebx  ; destination is ebx
    ; The two data bytes are the opcode and ModRM of an lea; the E9 opcode
    ; byte of the near jmp acts as its SIB byte and the jmp displacement
    ; (const_base - anchor) as its disp32.  Taken together:
    ;   lea ebx, [ecx+ebp*8+(const_base - anchor)] ; 8D,9C,E9,(offset32)
    db          0x8D, 0x9C              ; 8D,9C
    jmp         near const_base         ; E9,(const_base - anchor)
%%anchor:
%else  ; destination is some other register
    ; Same encoding trick with ecx as the lea destination:
    ;   lea ecx, [ecx+ebp*8+(const_base - anchor)] ; 8D,8C,E9,(offset32)
    db          0x8D, 0x8C              ; 8D,8C
    jmp         near const_base         ; E9,(const_base - anchor)
%%anchor:
    mov         %1, ecx
%endif  ; destination check
    pop         ebp
%endmacro

%else     ; GOT_SYMBOL != _MACHO_PIC_ ----------------

; Formats with a real global offset table: sym is addressed relative to it.
%define GOTOFF(got, sym)  (got) + (sym) wrt ..gotoff

%imacro get_GOT 1
    extern      GOT_SYMBOL
    ; Fetch the address of the add instruction through a call/ret pair, then
    ; add the ..gotpc-relative expression for GOT_SYMBOL to it.
    call        %%fetch_eip
    add         %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
    jmp         short %%finish
%%fetch_eip:
    mov         %1, POINTER [esp]
    ret
%%finish:
%endmacro

%endif    ; GOT_SYMBOL == _MACHO_PIC_ ----------------

; Register save/restore/copy helpers that only emit code in PIC builds.
%imacro movpic  2.nolist
    mov         %1, %2
%endmacro
%imacro pushpic 1.nolist
    push        %1
%endmacro
%imacro poppic  1.nolist
    pop         %1
%endmacro

%else    ; !PIC -----------------------------------------

; No PIC: constants are addressed directly and the helpers emit nothing.
%define GOTOFF(got, sym)  (sym)

%imacro movpic  2.nolist
%endmacro
%imacro pushpic 1.nolist
%endmacro
%imacro poppic  1.nolist
%endmacro
%imacro get_GOT 1.nolist
%endmacro

%endif   ;  PIC -----------------------------------------
299
300; --------------------------------------------------------------------------
301;  Align the next instruction on {2,4,8,16,..}-byte boundary.
302;  ".balign n,,m" in GNU as
303;
; MSKLE(x, y): mask of set bits when x <= y, zero otherwise (both operands
;              are taken modulo 0x10000).  AND-ed with a count it keeps the
;              count or forces it to zero.
; FILLB(b, n): number of bytes between position b and the next n-byte
;              boundary, measured from the section start; n must be a power
;              of two.
%define MSKLE(x, y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
%define FILLB(b, n)  (($$-(b)) & ((n)-1))

; alignx boundary [, max_skip]
;   Pads the code up to the next "boundary"-byte boundary.  Nothing at all is
;   emitted when the padding needed at the point of use exceeds max_skip
;   (default 0xFFFF).  A gap of 16 bytes or more is filled with one-byte nops
;   only; a shorter gap is covered greedily with the longer filler
;   instructions below (7, 7, 6, 4, 3 and 2 bytes), finishing with single
;   nops.  The fillers are 32-bit encodings that leave their register
;   unchanged.
%imacro alignx 1-2.nolist 0xFFFF
%%start: \
  times MSKLE(FILLB(%%start, %1), %2) & MSKLE(16, FILLB($, %1)) & FILLB($, %1) \
        db 0x90                                      ; nop
  times MSKLE(FILLB(%%start, %1), %2) & FILLB($, %1) / 9 \
        db 0x8D, 0x9C, 0x23, 0x00, 0x00, 0x00, 0x00  ; lea ebx,[ebx+0x00000000]
  times MSKLE(FILLB(%%start, %1), %2) & FILLB($, %1) / 7 \
        db 0x8D, 0xAC, 0x25, 0x00, 0x00, 0x00, 0x00  ; lea ebp,[ebp+0x00000000]
  times MSKLE(FILLB(%%start, %1), %2) & FILLB($, %1) / 6 \
        db 0x8D, 0xAD, 0x00, 0x00, 0x00, 0x00        ; lea ebp,[ebp+0x00000000]
  times MSKLE(FILLB(%%start, %1), %2) & FILLB($, %1) / 4 \
        db 0x8D, 0x6C, 0x25, 0x00                    ; lea ebp,[ebp+0x00]
  times MSKLE(FILLB(%%start, %1), %2) & FILLB($, %1) / 3 \
        db 0x8D, 0x6D, 0x00                          ; lea ebp,[ebp+0x00]
  times MSKLE(FILLB(%%start, %1), %2) & FILLB($, %1) / 2 \
        db 0x8B, 0xED                                ; mov ebp,ebp
  times MSKLE(FILLB(%%start, %1), %2) & FILLB($, %1) / 1 \
        db 0x90                                      ; nop
%endmacro

; alignz boundary
;   Data counterpart of alignx: pads with zero bytes up to the next
;   {2,4,8,16,..}-byte boundary.
%imacro alignz 1.nolist
    align       %1, db 0                ; zero fill
%endmacro
332
333%ifdef __x86_64__
334
335%ifdef WIN64
336
; collect_args N   (WIN64 build)
;
; Copies the first N (1..6) integer arguments into r10..r15 and saves the
; registers that must survive: xmm6 and xmm7 always, r12..r15 only when they
; are about to receive an argument, then rsi and rdi.
;   arg 1..4 : rcx, rdx, r8, r9          -> r10, r11, r12, r13
;   arg 5, 6 : [rax+48], [rax+56]        -> r14, r15
; r10 and r11 are overwritten without being saved.
;
; Preconditions that the macro itself does not establish:
;   * rsp is 16-byte aligned on entry (movaps stores to [rsp]).
;   * rax is set up by the caller before N > 4 is used.
;     NOTE(review): presumably rax holds rsp as it was right after the
;     caller's "push rbp", which makes +48/+56 the 5th and 6th stack
;     arguments - confirm against the callers.
; Undo with "uncollect_args N" using the same N.
%imacro collect_args 1
    sub         rsp, SIZEOF_XMMWORD
    movaps      XMMWORD [rsp], xmm6     ; save xmm6
    sub         rsp, SIZEOF_XMMWORD
    movaps      XMMWORD [rsp], xmm7     ; save xmm7
    mov         r10, rcx                ; argument 1
%if %1 >= 2
    mov         r11, rdx                ; argument 2
%endif
%if %1 >= 3
    push        r12
    mov         r12, r8                 ; argument 3
%endif
%if %1 >= 4
    push        r13
    mov         r13, r9                 ; argument 4
%endif
%if %1 >= 5
    push        r14
    mov         r14, [rax+48]           ; argument 5, read through rax
%endif
%if %1 >= 6
    push        r15
    mov         r15, [rax+56]           ; argument 6, read through rax
%endif
    push        rsi
    push        rdi
%endmacro
365
; uncollect_args N   (WIN64 build)
;
; Mirror image of "collect_args N": restores rdi and rsi, then whichever of
; r15..r12 were pushed for the given N, and finally xmm7 and xmm6, releasing
; the stack space in reverse order of allocation.  N must match the value
; given to collect_args, and rsp must be back where collect_args left it.
%imacro uncollect_args 1
    pop         rdi
    pop         rsi
%if %1 >= 6
    pop         r15
%endif
%if %1 >= 5
    pop         r14
%endif
%if %1 >= 4
    pop         r13
%endif
%if %1 >= 3
    pop         r12
%endif
    movaps      xmm7, XMMWORD [rsp]     ; restore xmm7
    add         rsp, SIZEOF_XMMWORD
    movaps      xmm6, XMMWORD [rsp]     ; restore xmm6
    add         rsp, SIZEOF_XMMWORD
%endmacro
386
; push_xmm N / pop_xmm N   (WIN64 build)
;
; push_xmm reserves N 16-byte slots on the stack and stores xmm8, xmm9,
; xmm10 and xmm11 (as many as N asks for) into them; pop_xmm reloads the same
; registers and releases the slots.  Both must be given the same N, with rsp
; unchanged in between.
;
;   * At most four registers (xmm8..xmm11) are handled.  A larger N still
;     reserves N slots but saves nothing beyond xmm11.
;   * N = 0 expands to nothing.  Without the guard, xmm8 would be stored to
;     [rsp] with no space reserved, overwriting whatever the caller last put
;     on the stack, and pop_xmm would reload xmm8 from that location.
;   * rsp must be 16-byte aligned when either macro is used, because movaps
;     requires an aligned memory operand.
;   * The argument is parenthesized so that an expression such as 1+1 scales
;     correctly in the size computation.
%imacro push_xmm 1
%if (%1) > 0
    sub         rsp, (%1) * SIZEOF_XMMWORD
    movaps      XMMWORD [rsp+0*SIZEOF_XMMWORD], xmm8
%endif
%if (%1) > 1
    movaps      XMMWORD [rsp+1*SIZEOF_XMMWORD], xmm9
%endif
%if (%1) > 2
    movaps      XMMWORD [rsp+2*SIZEOF_XMMWORD], xmm10
%endif
%if (%1) > 3
    movaps      XMMWORD [rsp+3*SIZEOF_XMMWORD], xmm11
%endif
%endmacro

%imacro pop_xmm 1
%if (%1) > 0
    movaps      xmm8, XMMWORD [rsp+0*SIZEOF_XMMWORD]
%endif
%if (%1) > 1
    movaps      xmm9, XMMWORD [rsp+1*SIZEOF_XMMWORD]
%endif
%if (%1) > 2
    movaps      xmm10, XMMWORD [rsp+2*SIZEOF_XMMWORD]
%endif
%if (%1) > 3
    movaps      xmm11, XMMWORD [rsp+3*SIZEOF_XMMWORD]
%endif
%if (%1) > 0
    add         rsp, (%1) * SIZEOF_XMMWORD
%endif
%endmacro
414
415%else
416
; collect_args N   (x86-64, non-WIN64 build)
;
; Copies the first N (1..6) integer arguments into r10..r15 so that the
; function bodies can use the same register names as the WIN64 build.  Each
; destination register is pushed before it is overwritten:
;   arg 1..6 : rdi, rsi, rdx, rcx, r8, r9  ->  r10, r11, r12, r13, r14, r15
; Uses 8 bytes of stack per collected argument.  Undo with
; "uncollect_args N" using the same N.
%imacro collect_args 1
    push        r10
    mov         r10, rdi                ; argument 1
%if %1 >= 2
    push        r11
    mov         r11, rsi                ; argument 2
%endif
%if %1 >= 3
    push        r12
    mov         r12, rdx                ; argument 3
%endif
%if %1 >= 4
    push        r13
    mov         r13, rcx                ; argument 4
%endif
%if %1 >= 5
    push        r14
    mov         r14, r8                 ; argument 5
%endif
%if %1 >= 6
    push        r15
    mov         r15, r9                 ; argument 6
%endif
%endmacro
441
; uncollect_args N   (x86-64, non-WIN64 build)
;
; Mirror image of "collect_args N": pops r15 down to r10 in the reverse of
; the order in which they were pushed, skipping the registers that the given
; N never saved.  N must match the value given to collect_args, and rsp must
; be back where collect_args left it.
%imacro uncollect_args 1
%if %1 >= 6
    pop         r15
%endif
%if %1 >= 5
    pop         r14
%endif
%if %1 >= 4
    pop         r13
%endif
%if %1 >= 3
    pop         r12
%endif
%if %1 >= 2
    pop         r11
%endif
    pop         r10
%endmacro
460
; push_xmm N / pop_xmm N   (x86-64, non-WIN64 build)
;
; Both macros accept one argument and expand to no instructions, so sources
; can invoke them unconditionally; only the WIN64 variants save and restore
; xmm registers.
; NOTE(review): presumably nothing is needed here because the non-Windows
; x86-64 calling convention treats these xmm registers as caller-saved -
; confirm against the ABI in use.
%imacro push_xmm 1
%endmacro

%imacro pop_xmm 1
%endmacro
466
467%endif
468
469%endif
470
471; --------------------------------------------------------------------------
472;  Defines picked up from the C headers
473;
474%include "jsimdcfg.inc"
475
476; --------------------------------------------------------------------------
477