1;
2; jcphuff-sse2.asm - prepare data for progressive Huffman encoding
3; (64-bit SSE2)
4;
5; Copyright (C) 2016, 2018, Matthieu Darbois
6;
7; Based on the x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler),
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; This file contains an SSE2 implementation of data preparation for progressive
18; Huffman encoding.  See jcphuff.c for more details.
19;
20; [TAB8]
21
22%include "jsimdext.inc"
23
24; --------------------------------------------------------------------------
25    SECTION     SEG_TEXT
26    BITS        64
27
28; --------------------------------------------------------------------------
29; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
30; jsimd_encode_mcu_AC_refine_prepare_sse2()
31
; LOAD16: gather 16 coefficients through the index table.
;
;   X0 word i = BLOCK[LUT[i]]       for i = 0..7
;   X1 word i = BLOCK[LUT[8 + i]]   for i = 0..7
;
; N0 and N1 are cleared so that the caller can build sign masks in them.
; All eight words of X0 and X1 are overwritten, so their previous contents
; do not matter.  Clobbers T0 and T1.  Relies on the X0/X1/N0/N1/T0/T1/LUT/
; BLOCK symbols being %define'd at the point of expansion.
%macro LOAD16 0
    pxor        N0, N0
    pxor        N1, N1

  %assign %%lane 0
  %rep 8
    ; One word for each vector per step; the LUT entries are 32-bit indices
    ; and the coefficients are 16 bits wide (hence the * 2 scale).
    mov         T0d, INT [LUT + (%%lane + 0)*SIZEOF_INT]
    mov         T1d, INT [LUT + (%%lane + 8)*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], %%lane
    pinsrw      X1, word [BLOCK + T1 * 2], %%lane
    %assign %%lane %%lane+1
  %endrep
%endmacro
76
; LOAD15: gather between 9 and 15 coefficients through the index table.
;
;   X0 word i = BLOCK[LUT[i]]       for i = 0..7        (always)
;   X1 word i = BLOCK[LUT[8 + i]]   for i = 0..LENEND-1 (remaining words = 0)
;
; Word 0 of X1 is loaded unconditionally, so the caller must guarantee
; LENEND >= 1.  N0 and N1 are cleared for the caller's sign masks.
; Clobbers T0, T1 and the flags.
%macro LOAD15 0
    pxor        N0, N0
    pxor        N1, N1
    pxor        X1, X1                  ; words that are not loaded stay zero

    ; Word 0 of both vectors.
    mov         T0d, INT [LUT +  0*SIZEOF_INT]
    mov         T1d, INT [LUT +  8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    ; Words 1..7 of the (complete) low vector.
  %assign %%lane 1
  %rep 7
    mov         T0d, INT [LUT + %%lane*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], %%lane
    %assign %%lane %%lane+1
  %endrep

    ; Words 1..6 of the high vector, stopping once LENEND words are loaded.
  %assign %%lane 1
  %rep 6
    cmp         LENEND, (%%lane + 1)
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + (%%lane + 8)*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], %%lane
    %assign %%lane %%lane+1
  %endrep
%%.ELOAD15:
%endmacro
139
; LOAD8: gather exactly 8 coefficients through the index table.
;
;   X0 word i = BLOCK[LUT[i]]   for i = 0..7
;
; N0 is cleared for the caller's sign mask.  X0 is fully overwritten.
; Clobbers T0.
%macro LOAD8 0
    pxor        N0, N0

  %assign %%lane 0
  %rep 8
    mov         T0d, INT [LUT + %%lane*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], %%lane
    %assign %%lane %%lane+1
  %endrep
%endmacro
167
; LOAD7: gather between 1 and 7 coefficients through the index table.
;
;   X0 word i = BLOCK[LUT[i]]   for i = 0..LENEND-1 (remaining words = 0)
;
; Word 0 is loaded unconditionally, so the caller must guarantee
; LENEND >= 1.  N0 is cleared for the caller's sign mask.
; Clobbers T1 and the flags.
%macro LOAD7 0
    pxor        N0, N0
    pxor        X0, X0                  ; words that are not loaded stay zero

    mov         T1d, INT [LUT +  0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 0

    ; Words 1..6, stopping once LENEND words are loaded.
  %assign %%lane 1
  %rep 6
    cmp         LENEND, (%%lane + 1)
    jl          %%.ELOAD7
    mov         T1d, INT [LUT + %%lane*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], %%lane
    %assign %%lane %%lane+1
  %endrep
%%.ELOAD7:
%endmacro
206
; REDUCE0: build a 64-bit "non-zero" bitmap of the 64 words at VALUES and
; store it at [r15].  Bit k of the result is set when word k is non-zero.
;
; VALUES must be 16-byte aligned (movdqa loads) and ZERO must hold all-zero
; bits.  Clobbers xmm0-xmm7, rax, rcx, rdx, rsi and the flags.
%macro REDUCE0 0
    ; Words 0..15 -> bits 0..15 (eax).  pcmpeqw flags the *zero* words;
    ; packsswb narrows the word masks to bytes so that pmovmskb yields one
    ; bit per coefficient.
    movdqa      xmm0, XMMWORD [VALUES + ( 0*2)]
    movdqa      xmm1, XMMWORD [VALUES + ( 8*2)]
    pcmpeqw     xmm0, ZERO
    pcmpeqw     xmm1, ZERO
    packsswb    xmm0, xmm1
    pmovmskb    eax, xmm0

    ; Words 16..31 -> bits 16..31.
    movdqa      xmm2, XMMWORD [VALUES + (16*2)]
    movdqa      xmm3, XMMWORD [VALUES + (24*2)]
    pcmpeqw     xmm2, ZERO
    pcmpeqw     xmm3, ZERO
    packsswb    xmm2, xmm3
    pmovmskb    ecx, xmm2
    shl         rcx, 16

    ; Words 32..47 -> bits 32..47.
    movdqa      xmm4, XMMWORD [VALUES + (32*2)]
    movdqa      xmm5, XMMWORD [VALUES + (40*2)]
    pcmpeqw     xmm4, ZERO
    pcmpeqw     xmm5, ZERO
    packsswb    xmm4, xmm5
    pmovmskb    edx, xmm4
    shl         rdx, 32

    ; Words 48..63 -> bits 48..63.
    movdqa      xmm6, XMMWORD [VALUES + (48*2)]
    movdqa      xmm7, XMMWORD [VALUES + (56*2)]
    pcmpeqw     xmm6, ZERO
    pcmpeqw     xmm7, ZERO
    packsswb    xmm6, xmm7
    pmovmskb    esi, xmm6
    shl         rsi, 48

    ; Merge the four 16-bit slices into one 64-bit "is zero" mask ...
    or          rax, rcx
    or          rdx, rsi
    or          rax, rdx

    ; ... and invert it so that set bits mark the non-zero words.
    not         rax

    mov         MMWORD [r15], rax
%endmacro
248
;
; Prepare data for jsimd_encode_mcu_AC_first().
;
; GLOBAL(void)
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
;                                        const int *jpeg_natural_order_start,
;                                        int Sl, int Al, JCOEF *values,
;                                        size_t *zerobits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r12 = int Sl
; r13 = int Al
; r14 = JCOEF *values
; r15 = size_t *zerobits
;
; For every k in [0, Sl), with c = block[jpeg_natural_order_start[k]]:
;
;   values[k]            = abs(c) >> Al
;   values[k + DCTSIZE2] = (c < 0) ? ~values[k] : values[k]
;
; The entries values[Sl] .. values[DCTSIZE2 - 1] are set to zero, and bit k of
; *zerobits is set when values[k] is non-zero (see REDUCE0.)
;
; The coefficients are processed 16 at a time; the remaining Sl % 16
; coefficients are handled by one of three tail paths (9-15, exactly 8, or
; 1-7 coefficients.)  When Sl is a multiple of 16 there is no tail, and the
; tail paths must be skipped: each of them loads at least one coefficient and
; advances VALUES, which would pull in a coefficient beyond the scan and make
; the padding loop (whose trip count is derived from Sl alone) run past the
; first DCTSIZE2 entries of values[].
;
; values[] is accessed with movdqa and therefore must be 16-byte aligned.

%define ZERO    xmm9
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define LUT     r11
%define T0      rcx
%define T0d     ecx
%define T1      rdx
%define T1d     edx
%define BLOCK   r10
%define VALUES  r14
%define LEN     r12d
%define LENEND  r13d

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = rsp after saving rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax                   ; keep the unaligned rsp for exit
    mov         rbp, rsp                     ; rbp = aligned frame base
    lea         rsp, [rbp - 16]              ; 16-byte save slot for ZERO
    collect_args 6

    movdqa      XMMWORD [rbp - 16], ZERO     ; preserve the caller's xmm9

    movd        AL, r13d                ; capture Al before r13d is reused
                                        ; as LENEND below
    pxor        ZERO, ZERO
    mov         K, LEN
    mov         LENEND, LEN
    and         K, -16
    and         LENEND, 7               ; LENEND = Sl % 8
    shr         K, 4                    ; K = number of full groups of 16
    jz          .ELOOP16
.BLOOP16:
    LOAD16
    pcmpgtw     N0, X0                  ; N = (coef < 0) ? 0xFFFF : 0
    pcmpgtw     N1, X1
    paddw       X0, N0                  ; (coef + N) ^ N = abs(coef)
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL                  ; abs(coef) >> Al
    psrlw       X1, AL
    pxor        N0, X0                  ; one's complement if coef < 0
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    dec         K
    jnz         .BLOOP16
.ELOOP16:
    ; No tail at all (Sl % 16 == 0): go straight to the zero padding.  The
    ; tail paths below always consume at least one coefficient.
    test        LEN, 15
    jz          .PADDING
    test        LEN, 8
    jz          .TRY7                   ; tail of 1..7 coefficients
    test        LEN, 7
    jz          .TRY8                   ; tail of exactly 8 coefficients

    ; Tail of 9..15 coefficients.
    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    pxor        N0, X0
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    jmp         .PADDING
.TRY8:
    LOAD8
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
    jmp         .PADDING
.TRY7:
    LOAD7
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
.PADDING:
    ; K = (groups of 8 already written) - (groups of 8 in a block), i.e. a
    ; non-positive count of the groups that still have to be zeroed.
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDING
    align       16
.ZEROLOOP:
    movdqa      XMMWORD [VALUES + 0], ZERO
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOP
.EPADDING:
    sub         VALUES, DCTSIZE2*2      ; rewind VALUES to values[0]

    REDUCE0                             ; *zerobits = non-zero bitmap

    movdqa      ZERO, XMMWORD [rbp - 16]
    uncollect_args 6
    mov         rsp, rbp                ; rsp <- aligned frame base
    pop         rsp                     ; rsp <- rsp as it was after push rbp
    pop         rbp
    ret

%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LUT
%undef T0
%undef T0d
%undef T1
%undef T1d
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND
410
;
; Prepare data for jsimd_encode_mcu_AC_refine().
;
; GLOBAL(int)
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
;                                         const int *jpeg_natural_order_start,
;                                         int Sl, int Al, JCOEF *absvalues,
;                                         size_t *bits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r12 = int Sl
; r13 = int Al
; r14 = JCOEF *values
; r15 = size_t *bits
;
; For every k in [0, Sl), with c = block[jpeg_natural_order_start[k]]:
;
;   absvalues[k] = abs(c) >> Al
;
; The entries absvalues[Sl] .. absvalues[DCTSIZE2 - 1] are set to zero.
;
;   bits[0]: bit k is set when absvalues[k] is non-zero (see REDUCE0.)
;   bits[1]: bit k is clear when c is negative and set otherwise (the
;            collected sign mask is inverted before it is stored.)
;
; The return value (eax) is the largest k for which absvalues[k] == 1, or 0
; if there is no such k.
;
; The coefficients are processed 16 at a time; the remaining Sl % 16
; coefficients are handled by one of three tail paths (9-15, exactly 8, or
; 1-7 coefficients.)  When Sl is a multiple of 16 there is no tail, and the
; tail paths must be skipped: each of them loads at least one coefficient,
; advances VALUES and shifts SIGN, which would pull in a coefficient beyond
; the scan, make the padding loop (whose trip count is derived from Sl alone)
; run past the end of absvalues[], and shift valid sign bits out of SIGN.
;
; absvalues[] is accessed with movdqa and therefore must be 16-byte aligned.

%define ZERO    xmm9
%define ONE     xmm5
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define KK      r9d
%define EOB     r8d
%define SIGN    rdi
%define LUT     r11
%define T0      rcx
%define T0d     ecx
%define T1      rdx
%define T1d     edx
%define BLOCK   r10
%define VALUES  r14
%define LEN     r12d
%define LENEND  r13d

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = rsp after saving rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax                   ; keep the unaligned rsp for exit
    mov         rbp, rsp                     ; rbp = aligned frame base
    lea         rsp, [rbp - 16]              ; 16-byte save slot for ZERO
    collect_args 6

    movdqa      XMMWORD [rbp - 16], ZERO     ; preserve the caller's xmm9

    xor         SIGN, SIGN              ; no sign bits collected yet
    xor         EOB, EOB                ; EOB = 0 until a value of 1 is seen
    xor         KK, KK                  ; KK = index of the current group
    movd        AL, r13d                ; capture Al before r13d is reused
                                        ; as LENEND below
    pxor        ZERO, ZERO
    pcmpeqw     ONE, ONE
    psrlw       ONE, 15                 ; every word of ONE = 1
    mov         K, LEN
    mov         LENEND, LEN
    and         K, -16
    and         LENEND, 7               ; LENEND = Sl % 8
    shr         K, 4                    ; K = number of full groups of 16
    jz          .ELOOPR16
.BLOOPR16:
    LOAD16
    pcmpgtw     N0, X0                  ; N = (coef < 0) ? 0xFFFF : 0
    pcmpgtw     N1, X1
    paddw       X0, N0                  ; (coef + N) ^ N = abs(coef)
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL                  ; abs(coef) >> Al
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE                 ; flag the values equal to 1
    pcmpeqw     X1, ONE
    packsswb    N0, N1                  ; word masks -> byte masks
    packsswb    X0, X1
    pmovmskb    T0d, N0                 ; T0 = 16 sign bits of this group
    pmovmskb    T1d, X0                 ; T1 = 16 "equals 1" bits
    shr         SIGN, 16                ; newest group enters at the top;
    shl         T0, 48                  ; older groups move toward bit 0
    or          SIGN, T0
    bsr         T1d, T1d                ; T1 = position of the last value of 1
    jz          .CONTINUER16            ; (ZF set: no value of 1 in the group)
    mov         EOB, KK
    add         EOB, T1d                ; EOB = group index + position
.CONTINUER16:
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    add         KK, 16
    dec         K
    jnz         .BLOOPR16
.ELOOPR16:
    ; No tail at all (Sl % 16 == 0): go straight to the zero padding.  The
    ; tail paths below always consume at least one coefficient.
    test        LEN, 15
    jz          .PADDINGR
    test        LEN, 8
    jz          .TRYR7                  ; tail of 1..7 coefficients
    test        LEN, 7
    jz          .TRYR8                  ; tail of exactly 8 coefficients

    ; Tail of 9..15 coefficients.
    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0d, N0                 ; T0 = 16 sign bits of this group
    pmovmskb    T1d, X0                 ; T1 = 16 "equals 1" bits
    shr         SIGN, 16
    shl         T0, 48
    or          SIGN, T0
    bsr         T1d, T1d                ; T1 = position of the last value of 1
    jz          .CONTINUER15            ; (ZF set: no value of 1 in the group)
    mov         EOB, KK
    add         EOB, T1d                ; EOB = group index + position
.CONTINUER15:
    add         VALUES, 16*2
    jmp         .PADDINGR
.TRYR8:
    LOAD8

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO                ; only 8 coefficients: upper bytes = 0
    packsswb    X0, ZERO
    pmovmskb    T0d, N0                 ; T0 = 8 sign bits of this group
    pmovmskb    T1d, X0                 ; T1 = 8 "equals 1" bits
    shr         SIGN, 8                 ; a group of 8 only takes 8 bits
    shl         T0, 56
    or          SIGN, T0
    bsr         T1d, T1d                ; T1 = position of the last value of 1
    jz          .CONTINUER8             ; (ZF set: no value of 1 in the group)
    mov         EOB, KK
    add         EOB, T1d                ; EOB = group index + position
.CONTINUER8:
    add         VALUES, 8*2
    jmp         .PADDINGR
.TRYR7:
    LOAD7

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO                ; only 8 lanes: upper bytes = 0
    packsswb    X0, ZERO
    pmovmskb    T0d, N0                 ; T0 = 8 sign bits of this group
    pmovmskb    T1d, X0                 ; T1 = 8 "equals 1" bits
    shr         SIGN, 8                 ; a group of 8 only takes 8 bits
    shl         T0, 56
    or          SIGN, T0
    bsr         T1d, T1d                ; T1 = position of the last value of 1
    jz          .CONTINUER7             ; (ZF set: no value of 1 in the group)
    mov         EOB, KK
    add         EOB, T1d                ; EOB = group index + position
.CONTINUER7:
    add         VALUES, 8*2
.PADDINGR:
    ; K = (groups of 8 already written) - (groups of 8 in a block), i.e. a
    ; non-positive count of the groups that still have to be zeroed.
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDINGR
    align       16
.ZEROLOOPR:
    movdqa      XMMWORD [VALUES + 0], ZERO
    shr         SIGN, 8                 ; padded group: 8 zero sign bits, so
                                        ; that bit 0 ends up at coefficient 0
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOPR
.EPADDINGR:
    not         SIGN                    ; stored inverted: 0 = negative
    sub         VALUES, DCTSIZE2*2      ; rewind VALUES to absvalues[0]
    mov         MMWORD [r15+SIZEOF_MMWORD], SIGN  ; bits[1]

    REDUCE0                             ; bits[0] = non-zero bitmap

    mov         eax, EOB                ; return value
    movdqa      ZERO, XMMWORD [rbp - 16]
    uncollect_args 6
    mov         rsp, rbp                ; rsp <- aligned frame base
    pop         rsp                     ; rsp <- rsp as it was after push rbp
    pop         rbp
    ret

%undef ZERO
%undef ONE
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef KK
%undef EOB
%undef SIGN
%undef LUT
%undef T0
%undef T0d
%undef T1
%undef T1d
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND
634
635; For some reason, the OS X linker does not honor the request to align the
636; segment unless we do this.
637    align       32
638