1;
2; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2)
3;
4; Copyright (C) 2016, 2018, Matthieu Darbois
5;
6; Based on the x86 SIMD extension for IJG JPEG library
7; Copyright (C) 1999-2006, MIYASAKA Masaru.
8; For conditions of distribution and use, see copyright notice in jsimdext.inc
9;
10; This file should be assembled with NASM (Netwide Assembler),
11; can *not* be assembled with Microsoft's MASM or any compatible
12; assembler (including Borland's Turbo Assembler).
13; NASM is available from http://nasm.sourceforge.net/ or
14; http://sourceforge.net/project/showfiles.php?group_id=6208
15;
16; This file contains an SSE2 implementation of data preparation for progressive
17; Huffman encoding.  See jcphuff.c for more details.
18;
19; [TAB8]
20
21%include "jsimdext.inc"
22
23; --------------------------------------------------------------------------
24    SECTION     SEG_TEXT
25    BITS        32
26
27; --------------------------------------------------------------------------
28; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
29; jsimd_encode_mcu_AC_refine_prepare_sse2()
30
; LOAD16 - gather 16 coefficients through the lookup table.
;
; For k = 0..7 the 16-bit word at BLOCK[LUT[k]] goes to lane k of X0 and the
; word at BLOCK[LUT[k + 8]] goes to lane k of X1.  Every lane of X0 and X1 is
; written, so neither register needs to be cleared first.
;
; Side effects: N0 and N1 are zeroed; T0 and T1 are clobbered.
%macro LOAD16 0
    pxor        N0, N0
    pxor        N1, N1

%assign LOAD_LANE 0
%rep 8
    mov         T0, INT [LUT + (LOAD_LANE + 0) * SIZEOF_INT]
    mov         T1, INT [LUT + (LOAD_LANE + 8) * SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], LOAD_LANE
    pinsrw      X1, word [BLOCK + T1 * 2], LOAD_LANE
%assign LOAD_LANE LOAD_LANE + 1
%endrep
%undef LOAD_LANE
%endmacro
75
; LOAD15 - gather 8 coefficients into X0 and up to 7 more into X1.
;
; X0 lanes 0..7 always receive BLOCK[LUT[0..7]].  X1 lane 0 always receives
; BLOCK[LUT[8]]; X1 lane j (j = 1..6) receives BLOCK[LUT[8 + j]] only while
; LENEND >= j + 1 (signed compare).  Lanes of X1 that are not loaded stay
; zero because X1 is cleared up front; lane 7 of X1 is never loaded.
;
; Side effects: N0 and N1 are zeroed; T0 and T1 are clobbered; flags are
; modified by the LENEND comparisons.
%macro LOAD15 0
    pxor        N0, N0
    pxor        N1, N1
    pxor        X1, X1

    ; Lane 0 of both vectors is unconditional.
    mov         T0, INT [LUT +  0*SIZEOF_INT]
    mov         T1, INT [LUT +  8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    ; Remaining lanes of X0 are unconditional as well.
%assign LOAD_LANE 1
%rep 7
    mov         T0, INT [LUT + LOAD_LANE * SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], LOAD_LANE
%assign LOAD_LANE LOAD_LANE + 1
%endrep

    ; Lanes 1..6 of X1 are loaded only while LENEND says they exist.
%assign LOAD_LANE 1
%rep 6
    cmp         LENEND, LOAD_LANE + 1
    jl          %%.ELOAD15
    mov         T1, INT [LUT + (LOAD_LANE + 8) * SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], LOAD_LANE
%assign LOAD_LANE LOAD_LANE + 1
%endrep
%undef LOAD_LANE
%%.ELOAD15:
%endmacro
138
; LOAD8 - gather exactly 8 coefficients into X0.
;
; Lane k of X0 (k = 0..7) receives the 16-bit word at BLOCK[LUT[k]].
;
; Side effects: N0 is zeroed; T0 is clobbered.
%macro LOAD8 0
    pxor        N0, N0

%assign LOAD_LANE 0
%rep 8
    mov         T0, INT [LUT + LOAD_LANE * SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], LOAD_LANE
%assign LOAD_LANE LOAD_LANE + 1
%endrep
%undef LOAD_LANE
%endmacro
166
; LOAD7 - gather up to 7 coefficients into X0.
;
; Lane 0 of X0 always receives BLOCK[LUT[0]].  Lane j (j = 1..6) receives
; BLOCK[LUT[j]] only while LENEND >= j + 1 (signed compare).  Lanes that are
; not loaded stay zero because X0 is cleared up front; lane 7 is never loaded.
;
; Side effects: N0 is zeroed; T1 is clobbered; flags are modified by the
; LENEND comparisons.
%macro LOAD7 0
    pxor        N0, N0
    pxor        X0, X0

    ; Lane 0 is unconditional.
    mov         T1, INT [LUT +  0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 0

    ; Lanes 1..6 are loaded only while LENEND says they exist.
%assign LOAD_LANE 1
%rep 6
    cmp         LENEND, LOAD_LANE + 1
    jl          %%.ELOAD7
    mov         T1, INT [LUT + LOAD_LANE * SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], LOAD_LANE
%assign LOAD_LANE LOAD_LANE + 1
%endrep
%undef LOAD_LANE
%%.ELOAD7:
%endmacro
205
; REDUCE0 - build the 64-bit "non-zero" bitmap of the 64 words at VALUES.
;
; Each group of 8 words is compared against ZERO, the word masks are narrowed
; to bytes and collected with pmovmskb, and the result is inverted so that
; bit k is set when word k is non-zero.  Words 0..31 end up in eax and words
; 32..63 in edx; both are stored as consecutive INTs at the address held in
; ZEROBITS.
;
; Uses VALUES with movdqa, so VALUES must be 16-byte aligned.
; Clobbers xmm0-xmm7 (including ZERO), eax, ecx, edx, esi, edi and flags.
%macro REDUCE0 0
    ; ---- words 0..31 -> eax ----
    movdqa      xmm0, XMMWORD [VALUES + ( 0*2)]
    movdqa      xmm1, XMMWORD [VALUES + ( 8*2)]
    movdqa      xmm2, XMMWORD [VALUES + (16*2)]
    movdqa      xmm3, XMMWORD [VALUES + (24*2)]
    pcmpeqw     xmm0, ZERO
    pcmpeqw     xmm1, ZERO
    pcmpeqw     xmm2, ZERO
    pcmpeqw     xmm3, ZERO
    packsswb    xmm0, xmm1
    packsswb    xmm2, xmm3
    pmovmskb    eax, xmm0
    pmovmskb    ecx, xmm2
    shl         ecx, 16
    or          eax, ecx
    not         eax                     ; 1 = non-zero word

    ; ---- words 32..63 -> edx ----
    movdqa      xmm4, XMMWORD [VALUES + (32*2)]
    movdqa      xmm5, XMMWORD [VALUES + (40*2)]
    movdqa      xmm6, XMMWORD [VALUES + (48*2)]
    pcmpeqw     xmm4, ZERO
    pcmpeqw     xmm5, ZERO
    pcmpeqw     xmm6, ZERO
    ; The last group is compared in place: this overwrites the zero register,
    ; so it has to come after every other use of ZERO.
    pcmpeqw     xmm7, XMMWORD [VALUES + (56*2)]
    packsswb    xmm4, xmm5
    packsswb    xmm6, xmm7
    pmovmskb    edx, xmm4
    pmovmskb    esi, xmm6
    shl         esi, 16
    or          edx, esi
    not         edx                     ; 1 = non-zero word

    ; ---- store both halves ----
    mov         edi, ZEROBITS           ; VALUES (edi) is no longer needed
    mov         INT [edi], eax
    mov         INT [edi+SIZEOF_INT], edx
%endmacro
248
;
; Prepare data for jsimd_encode_mcu_AC_first().
;
; GLOBAL(void)
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
;                                        const int *jpeg_natural_order_start,
;                                        int Sl, int Al, JCOEF *values,
;                                        size_t *zerobits)
;
; For every k in [0, Sl), with coef = block[jpeg_natural_order_start[k]]:
;   values[k]            = abs(coef) >> Al
;   values[k + DCTSIZE2] = values[k] if coef >= 0, otherwise ~values[k]
; values[Sl .. DCTSIZE2-1] are set to zero.  Finally REDUCE0 stores a 64-bit
; bitmap (two INTs) at zerobits in which bit k is set when values[k] != 0.
;
; values is accessed with movdqa and therefore must be 16-byte aligned.
;
; Stack arguments, relative to eax as set up in the prologue:
; eax + 8 = const JCOEF *block
; eax + 12 = const int *jpeg_natural_order_start
; eax + 16 = int Sl
; eax + 20 = int Al
; eax + 24 = JCOEF *values
; eax + 28 = size_t *zerobits

%define ZERO    xmm7
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define LENEND  eax
%define LUT     ebx
%define T0      ecx
%define T1      edx
%define BLOCK   esi
%define VALUES  edi
%define LEN     ebp

%define ZEROBITS  INT [esp + 5 * 4]

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
    push        ebp
    mov         eax, esp                     ; eax = esp after saving ebp; args at eax + 8
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [esp], eax                   ; keep the unaligned esp for the epilogue
    mov         ebp, esp                     ; ebp = aligned frame base
    sub         esp, 4                       ; one local slot (ZEROBITS)
    push        ebx
    push        ecx
;   push        edx                     ; need not be preserved
    push        esi
    push        edi
    push        ebp                          ; ebp is reused as LEN below

    mov         BLOCK, INT [eax + 8]
    mov         LUT, INT [eax + 12]
    mov         VALUES, INT [eax + 24]
    movd        AL, INT [eax + 20]
    mov         T0, INT [eax + 28]
    mov         ZEROBITS, T0
    mov         LEN, INT [eax + 16]
    pxor        ZERO, ZERO
    mov         K, LEN
    and         K, -16
    shr         K, 4                         ; K = number of full groups of 16
    jz          .ELOOP16
.BLOOP16:
    LOAD16
    pcmpgtw     N0, X0                       ; N = 0xFFFF where coef < 0
    pcmpgtw     N1, X1
    paddw       X0, N0                       ; X = abs(coef)
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL                       ; X = abs(coef) >> Al
    psrlw       X1, AL
    pxor        N0, X0                       ; N = X, or ~X where coef < 0
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    dec         K
    jnz         .BLOOP16
    ; If Sl is an exact multiple of 16 there is no tail.  Without this check
    ; the tail code below would load one coefficient beyond Sl and advance
    ; VALUES by an extra group of 8, which the padding count does not expect.
    test        LEN, 15
    jz          .PADDING
.ELOOP16:
    mov         LENEND, LEN
    and         LENEND, 7                    ; LENEND = Sl mod 8

    test        LEN, 8
    jz          .TRY7                        ; fewer than 8 left
    test        LEN, 7
    jz          .TRY8                        ; exactly 8 left

    ; 9..15 coefficients left.
    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    pxor        N0, X0
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    jmp         .PADDING
.TRY8:
    LOAD8
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
    jmp         .PADDING
.TRY7:
    ; 1..7 coefficients left.
    LOAD7
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
.PADDING:
    ; K = (groups of 8 already written) - (groups of 8 in a block), i.e. a
    ; non-positive count that is incremented up to zero.
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDING
    align       16
.ZEROLOOP:
    movdqa      XMMWORD [VALUES + 0], ZERO
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOP
.EPADDING:
    sub         VALUES, DCTSIZE2*2           ; back to the start of values[]

    REDUCE0

    pop         ebp
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
    pop         ecx
    pop         ebx
    mov         esp, ebp                ; esp <- aligned frame base
    pop         esp                     ; esp <- esp saved in the prologue
    pop         ebp
    ret

%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LENEND
%undef LUT
%undef T0
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN
%undef ZEROBITS
421
;
; Prepare data for jsimd_encode_mcu_AC_refine().
;
; GLOBAL(int)
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
;                                         const int *jpeg_natural_order_start,
;                                         int Sl, int Al, JCOEF *absvalues,
;                                         size_t *bits)
;
; For every k in [0, Sl), with coef = block[jpeg_natural_order_start[k]]:
;   absvalues[k] = abs(coef) >> Al
; absvalues[Sl .. DCTSIZE2-1] are set to zero.
;
; bits receives four INTs:
;   bits[0..1]  bitmap written by REDUCE0: bit k set when absvalues[k] != 0
;   bits[2..3]  sign bitmap, written 16 bits at a time: bit k is CLEAR when
;               coefficient k is negative and set otherwise (both INTs are
;               pre-filled with -1)
;
; Return value (eax): the largest k for which absvalues[k] == 1, or 0 if there
; is none.
;
; absvalues is accessed with movdqa and therefore must be 16-byte aligned.
;
; Stack arguments, relative to eax as set up in the prologue:
; eax + 8 = const JCOEF *block
; eax + 12 = const int *jpeg_natural_order_start
; eax + 16 = int Sl
; eax + 20 = int Al
; eax + 24 = JCOEF *absvalues
; eax + 28 = size_t *bits

%define ZERO    xmm7
%define ONE     xmm5
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define LENEND  eax
%define LUT     ebx
%define T0      ecx
%define T0w      cx
%define T1      edx
%define BLOCK   esi
%define VALUES  edi
%define KK      ebp

%define ZEROBITS  INT [esp + 5 * 4]
%define EOB       INT [esp + 5 * 4 + 4]
%define LEN       INT [esp + 5 * 4 + 8]

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
    push        ebp
    mov         eax, esp                     ; eax = esp after saving ebp; args at eax + 8
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [esp], eax                   ; keep the unaligned esp for the epilogue
    mov         ebp, esp                     ; ebp = aligned frame base
    sub         esp, 16                      ; local slots: ZEROBITS, EOB, LEN
    push        ebx
    push        ecx
;   push        edx                     ; need not be preserved
    push        esi
    push        edi
    push        ebp                          ; ebp is reused as KK below

    pcmpeqw     ONE, ONE
    psrlw       ONE, 15                      ; every word of ONE = 1
    mov         BLOCK, INT [eax + 8]
    mov         LUT, INT [eax + 12]
    mov         VALUES, INT [eax + 24]
    movd        AL, INT [eax + 20]
    mov         T0, INT [eax + 28]
    mov         K,  INT [eax + 16]
    mov         INT [T0 + 2 * SIZEOF_INT], -1    ; sign bitmap defaults to all ones
    mov         INT [T0 + 3 * SIZEOF_INT], -1
    mov         ZEROBITS, T0
    mov         LEN, K
    pxor        ZERO, ZERO
    and         K, -16
    mov         EOB, 0
    xor         KK, KK                       ; KK = byte offset into the sign bitmap
    shr         K, 4                         ; K = number of full groups of 16
    jz          .ELOOPR16
.BLOOPR16:
    LOAD16
    pcmpgtw     N0, X0                       ; N = 0xFFFF where coef < 0
    pcmpgtw     N1, X1
    paddw       X0, N0                       ; X = abs(coef)
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL                       ; X = abs(coef) >> Al
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE                      ; X = 0xFFFF where absvalue == 1
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0, N0                       ; one bit per coefficient: 1 = negative
    mov         T1, ZEROBITS
    not         T0                           ; stored inverted: 0 = negative
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0                       ; one bit per coefficient: 1 = absvalue is 1
    bsr         T1, T1                       ; index of the highest such coefficient
    jz          .CONTINUER16                 ; ZF set by bsr: none in this group
    lea         T1, [T1+KK*8]                ; KK*8 = index of the group's first coef
    mov         EOB, T1
.CONTINUER16:
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    add         KK, 2
    dec         K
    jnz         .BLOOPR16
    ; If Sl is an exact multiple of 16 there is no tail.  Without this check
    ; the tail code below would load one coefficient beyond Sl (possibly
    ; changing EOB) and advance VALUES by an extra group of 8, which the
    ; padding count does not expect.
    test        LEN, 15
    jz          .PADDINGR
.ELOOPR16:
    mov         LENEND, LEN

    test        LENEND, 8
    jz          .TRYR7                       ; fewer than 8 left
    test        LENEND, 7
    jz          .TRYR8                       ; exactly 8 left

    ; 9..15 coefficients left.
    and         LENEND, 7
    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0, N0                       ; one bit per coefficient: 1 = negative
    mov         T1, ZEROBITS
    not         T0
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0                       ; one bit per coefficient: 1 = absvalue is 1
    bsr         T1, T1
    jz          .CONTINUER15
    lea         T1, [T1+KK*8]
    mov         EOB, T1
.CONTINUER15:
    add         VALUES, 16*2
    jmp         .PADDINGR
.TRYR8:
    LOAD8

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO                     ; upper 8 bytes = 0 (no coefficients)
    packsswb    X0, ZERO
    pmovmskb    T0, N0                       ; one bit per coefficient: 1 = negative
    mov         T1, ZEROBITS
    not         T0
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0                       ; one bit per coefficient: 1 = absvalue is 1
    bsr         T1, T1
    jz          .CONTINUER8
    lea         T1, [T1+KK*8]
    mov         EOB, T1
.CONTINUER8:
    add         VALUES, 8*2
    jmp         .PADDINGR
.TRYR7:
    ; 1..7 coefficients left.
    and         LENEND, 7
    LOAD7

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO                     ; upper 8 bytes = 0 (no coefficients)
    packsswb    X0, ZERO
    pmovmskb    T0, N0                       ; one bit per coefficient: 1 = negative
    mov         T1, ZEROBITS
    not         T0
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0                       ; one bit per coefficient: 1 = absvalue is 1
    bsr         T1, T1
    jz          .CONTINUER7
    lea         T1, [T1+KK*8]
    mov         EOB, T1
.CONTINUER7:
    add         VALUES, 8*2
.PADDINGR:
    ; K = (groups of 8 already written) - (groups of 8 in a block), i.e. a
    ; non-positive count that is incremented up to zero.
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDINGR
    align       16
.ZEROLOOPR:
    movdqa      XMMWORD [VALUES + 0], ZERO
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOPR
.EPADDINGR:
    sub         VALUES, DCTSIZE2*2           ; back to the start of absvalues[]

    REDUCE0

    mov         eax, EOB                     ; return value

    pop         ebp
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
    pop         ecx
    pop         ebx
    mov         esp, ebp                ; esp <- aligned frame base
    pop         esp                     ; esp <- esp saved in the prologue
    pop         ebp
    ret

%undef ZERO
%undef ONE
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef KK
%undef EOB
%undef LUT
%undef T0
%undef T0w
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND
%undef ZEROBITS
657
658; For some reason, the OS X linker does not honor the request to align the
659; segment unless we do this.
660    align       32
661