1;
2; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright 2009 D. R. Commander
6;
7; Based on
8; x86 SIMD extension for IJG JPEG library
9; Copyright (C) 1999-2006, MIYASAKA Masaru.
10; For conditions of distribution and use, see copyright notice in jsimdext.inc
11;
12; This file should be assembled with NASM (Netwide Assembler),
13; can *not* be assembled with Microsoft's MASM or any compatible
14; assembler (including Borland's Turbo Assembler).
15; NASM is available from http://nasm.sourceforge.net/ or
16; http://sourceforge.net/project/showfiles.php?group_id=6208
17;
18; [TAB8]
19
20%include "jsimdext.inc"
21%include "jdct.inc"
22
23; --------------------------------------------------------------------------
24        SECTION SEG_TEXT
25        BITS    64
;
; Copy a block of samples into the floating-point workspace, re-centring
; each sample by subtracting 0x80 (unsigned -> signed) on the way.
;
; GLOBAL(void)
; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
;                            FAST_FLOAT * workspace);
;
; Arguments (collect_args, defined in jsimdext.inc and not shown here, is
; expected to leave them in these registers):
;   r10 = JSAMPARRAY sample_data
;   r11 = JDIMENSION start_col    (only the low 32 bits are read)
;   r12 = FAST_FLOAT * workspace  (stored with movaps, so it has to be
;                                  16-byte aligned)
;
; Working registers:
;   rsi = cursor into the row-pointer array
;   rax = start_col, zero-extended
;   rdi = workspace write cursor
;   rcx = number of row pairs still to do
;   rbx, rdx = the two row pointers of the current pair (rbx is saved here)
;   xmm7 = 0x80 in every byte; xmm0-xmm3 = scratch
;

        align   16
        global  EXTN(jsimd_convsamp_float_sse2)

EXTN(jsimd_convsamp_float_sse2):
        push    rbp
        mov     rbp, rsp
        mov     rax, rsp                ; rax = rsp going into collect_args; kept
                                        ; in case the macro addresses the stack
                                        ; through it (see jsimdext.inc)
        collect_args
        push    rbx

        ; Move the arguments into the working registers.
        mov     rdi, r12                ; rdi = workspace
        mov     rsi, r10                ; rsi = sample_data
        mov     eax, r11d               ; rax = start_col (32-bit move clears bits 63..32)
        mov     ecx, DCTSIZE/2          ; rcx = row pairs (32-bit move clears bits 63..32)

        ; Build the bias constant in a register instead of loading it.
        pcmpeqw  xmm7, xmm7             ; every word = 0xFFFF
        psllw    xmm7, 7                ; every word = 0xFF80
        packsswb xmm7, xmm7             ; every byte = 0x80 (PB_CENTERJSAMPLE)

.rowpair:
        ; Fetch the two row pointers, then 8 bytes of samples from each row.
        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *) first row
        mov     rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *) second row
        add     rsi, byte 2*SIZEOF_JSAMPROW             ; step to the next pair

        movq    xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
        movq    xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]

        ; ---- first row: xmm0 -> floats in xmm2 (0123) and xmm0 (4567) ----
        ; The unpacks place each sample byte in the top byte of a dword; the
        ; arithmetic right shift then sign-extends it. The low words that
        ; punpcklwd takes from the stale destination are shifted out
        ; (assuming DWORD_BIT-BYTE_BIT is 24).
        psubb     xmm0, xmm7                    ; xmm0=(01234567)
        punpcklbw xmm0, xmm0                    ; xmm0=(*0*1*2*3*4*5*6*7)
        punpcklwd xmm2, xmm0                    ; xmm2=(***0***1***2***3)
        punpckhwd xmm0, xmm0                    ; xmm0=(***4***5***6***7)
        psrad     xmm2, (DWORD_BIT-BYTE_BIT)    ; xmm2=(0123) as int32
        psrad     xmm0, (DWORD_BIT-BYTE_BIT)    ; xmm0=(4567) as int32
        cvtdq2ps  xmm2, xmm2                    ; xmm2=(0123) as float
        cvtdq2ps  xmm0, xmm0                    ; xmm0=(4567) as float

        ; ---- second row: xmm1 -> floats in xmm3 (89AB) and xmm1 (CDEF) ----
        psubb     xmm1, xmm7                    ; xmm1=(89ABCDEF)
        punpcklbw xmm1, xmm1                    ; xmm1=(*8*9*A*B*C*D*E*F)
        punpcklwd xmm3, xmm1                    ; xmm3=(***8***9***A***B)
        punpckhwd xmm1, xmm1                    ; xmm1=(***C***D***E***F)
        psrad     xmm3, (DWORD_BIT-BYTE_BIT)    ; xmm3=(89AB) as int32
        psrad     xmm1, (DWORD_BIT-BYTE_BIT)    ; xmm1=(CDEF) as int32
        cvtdq2ps  xmm3, xmm3                    ; xmm3=(89AB) as float
        cvtdq2ps  xmm1, xmm1                    ; xmm1=(CDEF) as float

        ; Write both converted rows to the workspace.
        movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
        movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
        movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
        movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1

        add     rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT   ; two workspace rows done
        dec     rcx
        jnz     short .rowpair

        pop     rbx
        uncollect_args
        pop     rbp
        ret
97
98
99; --------------------------------------------------------------------------
;
; Multiply every workspace coefficient by the matching entry of the divisors
; table, convert the product to a 32-bit integer (cvtps2dq, rounding per the
; current MXCSR mode), narrow it to 16 bits with signed saturation and store
; the result into coef_block.
;
; GLOBAL(void)
; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
;                            FAST_FLOAT * workspace);
;
; Arguments (collect_args, defined in jsimdext.inc and not shown here, is
; expected to leave them in these registers):
;   r10 = JCOEFPTR coef_block
;   r11 = FAST_FLOAT * divisors
;   r12 = FAST_FLOAT * workspace
; All three are accessed with aligned SSE operations (movaps, mulps with a
; memory operand, movdqa), so each has to be 16-byte aligned.
;
; Working registers:
;   rsi = workspace read cursor
;   rdx = divisors read cursor
;   rdi = coef_block write cursor
;   rax = number of 16-coefficient groups still to do
;   xmm0-xmm3 = scratch
;

        align   16
        global  EXTN(jsimd_quantize_float_sse2)

EXTN(jsimd_quantize_float_sse2):
        push    rbp
        mov     rbp, rsp
        mov     rax, rsp                ; rax = rsp going into collect_args; kept
                                        ; in case the macro addresses the stack
                                        ; through it (see jsimdext.inc)
        collect_args

        ; Move the arguments into the working registers.
        mov     rdi, r10                ; rdi = coef_block
        mov     rdx, r11                ; rdx = divisors
        mov     rsi, r12                ; rsi = workspace
        mov     eax, DCTSIZE2/16        ; rax = groups (32-bit move clears bits 63..32)

.next_group:
        ; ---- first 8 coefficients of the group -> 8 words in xmm0 ----
        movaps   xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
        movaps   xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
        mulps    xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
        mulps    xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
        cvtps2dq xmm0, xmm0             ; float -> int32
        cvtps2dq xmm1, xmm1
        packssdw xmm0, xmm1             ; 2 x 4 int32 -> 8 int16, saturating

        ; ---- second 8 coefficients of the group -> 8 words in xmm2 ----
        movaps   xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
        movaps   xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
        mulps    xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
        mulps    xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
        cvtps2dq xmm2, xmm2             ; float -> int32
        cvtps2dq xmm3, xmm3
        packssdw xmm2, xmm3             ; 2 x 4 int32 -> 8 int16, saturating

        ; Store both packed rows only after every input has been read.
        movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
        movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2

        ; Advance all three cursors by 16 elements.
        add     rsi, byte 16*SIZEOF_FAST_FLOAT
        add     rdx, byte 16*SIZEOF_FAST_FLOAT
        add     rdi, byte 16*SIZEOF_JCOEF
        dec     rax
        jnz     short .next_group

        uncollect_args
        pop     rbp
        ret
155
156; For some reason, the OS X linker does not honor the request to align the
157; segment unless we do this.
158        align   16
159