1;
2; jdsample.asm - upsampling (64-bit SSE2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright 2009 D. R. Commander
6;
7; Based on
8; x86 SIMD extension for IJG JPEG library
9; Copyright (C) 1999-2006, MIYASAKA Masaru.
10; For conditions of distribution and use, see copyright notice in jsimdext.inc
11;
12; This file should be assembled with NASM (Netwide Assembler),
13; can *not* be assembled with Microsoft's MASM or any compatible
14; assembler (including Borland's Turbo Assembler).
15; NASM is available from http://nasm.sourceforge.net/ or
16; http://sourceforge.net/project/showfiles.php?group_id=6208
17;
18; [TAB8]
19
20%include "jsimdext.inc"
21
22; --------------------------------------------------------------------------
23        SECTION SEG_CONST
24
25        alignz  16
26        global  EXTN(jconst_fancy_upsample_sse2)
27
28EXTN(jconst_fancy_upsample_sse2):
29
30PW_ONE          times 8 dw  1
31PW_TWO          times 8 dw  2
32PW_THREE        times 8 dw  3
33PW_SEVEN        times 8 dw  7
34PW_EIGHT        times 8 dw  8
35
36        alignz  16
37
38; --------------------------------------------------------------------------
39        SECTION SEG_TEXT
40        BITS    64
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; For every input sample in[i] the routine emits two output samples:
;     out[2*i]   = (3 * in[i] + in[i-1] + 1) >> 2
;     out[2*i+1] = (3 * in[i] + in[i+1] + 2) >> 2
; For the very first sample of a row, in[-1] is replaced by in[0].  For the
; last 16-sample block of a row, the neighbour to the right of the block is
; replaced by the block's own last byte.  When the column count is not a
; multiple of SIZEOF_XMMWORD, a copy of the last real sample is first written
; just past the end of the input row (the "dummy sample").
;
; Rows are read and written with movdqa in whole SIZEOF_XMMWORD blocks, so
; the row buffers must be suitably aligned and padded by the caller.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
;                                 JDIMENSION downsampled_width,
;                                 JSAMPARRAY input_data,
;                                 JSAMPARRAY * output_data_ptr);
;

; Register map after collect_args (macro from jsimdext.inc):
; r10 = int max_v_samp_factor
; r11 = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v1_fancy_upsample_sse2)

EXTN(jsimd_h2v1_fancy_upsample_sse2):
        push    rbp
        mov     rax, rsp                ; NOTE(review): presumably consumed by
                                        ; collect_args - confirm in jsimdext.inc
        mov     rbp, rsp
        collect_args

        mov     eax, r11d               ; colctr (32-bit move zero-extends)
        test    rax, rax
        jz      near .return

        ; max_v_samp_factor is a 32-bit int: only the low half of r10 is
        ; meaningful, so sign-extend it instead of copying all 64 bits.
        movsxd  rcx, r10d               ; rowctr
        test    rcx, rcx
        jz      near .return

        mov     rsi, r12                ; input_data
        mov     rdi, r13
        mov     rdi, JSAMPARRAY [rdi]   ; output_data = *output_data_ptr
.rowloop:
        push    rax                     ; colctr
        push    rdi
        push    rsi

        mov     rsi, JSAMPROW [rsi]     ; inptr
        mov     rdi, JSAMPROW [rdi]     ; outptr

        test    rax, SIZEOF_XMMWORD-1   ; partial last block?
        jz      short .skip
        mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
.skip:
        pxor    xmm0, xmm0              ; xmm0=(all 0's)
        pcmpeqb xmm7, xmm7
        psrldq  xmm7, (SIZEOF_XMMWORD-1)        ; mask: lowest byte only
        pand    xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; xmm7=( 0 -- ... --)
                                        ; left neighbour of the first sample

        add     rax, byte SIZEOF_XMMWORD-1
        and     rax, byte -SIZEOF_XMMWORD       ; round colctr up to a block
        cmp     rax, byte SIZEOF_XMMWORD
        ja      short .columnloop

.columnloop_last:
        ; Last block of the row: the right-hand neighbour is the block's
        ; own top byte.
        pcmpeqb xmm6, xmm6
        pslldq  xmm6, (SIZEOF_XMMWORD-1)        ; mask: highest byte only
        pand    xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        jmp     short .upsample

.columnloop:
        ; Not the last block: the right-hand neighbour is byte 0 of the
        ; next block, moved into the top byte position.
        movdqa  xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        pslldq  xmm6, (SIZEOF_XMMWORD-1)

.upsample:
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm2, xmm1
        movdqa  xmm3, xmm1              ; xmm1=( 0  1  2 ... 13 14 15)
        pslldq  xmm2, 1                 ; xmm2=(--  0  1 ... 12 13 14)
        psrldq  xmm3, 1                 ; xmm3=( 1  2  3 ... 14 15 --)

        por     xmm2, xmm7              ; xmm2=(-1  0  1 ... 12 13 14)
        por     xmm3, xmm6              ; xmm3=( 1  2  3 ... 14 15 16)

        ; Keep this block's last byte as the left neighbour for the next one.
        movdqa  xmm7, xmm1
        psrldq  xmm7, (SIZEOF_XMMWORD-1)        ; xmm7=(15 -- -- ... -- -- --)

        ; Widen all three vectors from bytes to words.
        movdqa    xmm4, xmm1
        punpcklbw xmm1, xmm0            ; xmm1=( 0  1  2  3  4  5  6  7)
        punpckhbw xmm4, xmm0            ; xmm4=( 8  9 10 11 12 13 14 15)
        movdqa    xmm5, xmm2
        punpcklbw xmm2, xmm0            ; xmm2=(-1  0  1  2  3  4  5  6)
        punpckhbw xmm5, xmm0            ; xmm5=( 7  8  9 10 11 12 13 14)
        movdqa    xmm6, xmm3
        punpcklbw xmm3, xmm0            ; xmm3=( 1  2  3  4  5  6  7  8)
        punpckhbw xmm6, xmm0            ; xmm6=( 9 10 11 12 13 14 15 16)

        pmullw  xmm1, [rel PW_THREE]    ; 3 * centre sample
        pmullw  xmm4, [rel PW_THREE]
        paddw   xmm2, [rel PW_ONE]      ; left neighbour + 1
        paddw   xmm5, [rel PW_ONE]
        paddw   xmm3, [rel PW_TWO]      ; right neighbour + 2
        paddw   xmm6, [rel PW_TWO]

        paddw   xmm2, xmm1
        paddw   xmm5, xmm4
        psrlw   xmm2, 2                 ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
        psrlw   xmm5, 2                 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
        paddw   xmm3, xmm1
        paddw   xmm6, xmm4
        psrlw   xmm3, 2                 ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
        psrlw   xmm6, 2                 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)

        ; Interleave: odd results go to the high byte of each word.
        psllw   xmm3, BYTE_BIT
        psllw   xmm6, BYTE_BIT
        por     xmm2, xmm3              ; xmm2=OutL=( 0  1  2 ... 13 14 15)
        por     xmm5, xmm6              ; xmm5=OutH=(16 17 18 ... 29 30 31)

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5

        sub     rax, byte SIZEOF_XMMWORD
        add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr
        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
        cmp     rax, byte SIZEOF_XMMWORD
        ja      near .columnloop        ; more than one block left
        test    eax, eax
        jnz     near .columnloop_last   ; exactly one block left

        pop     rsi
        pop     rdi
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
        dec     rcx                             ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret
182
183; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; Each input row produces two output rows.  The work is done in two passes
; per 16-sample block:
;   1. Vertical pass (16-bit intermediates, stored temporarily in the
;      output rows):
;         Int0[i] = 3 * row[0][i] + row[-1][i]     (upper output row)
;         Int1[i] = 3 * row[0][i] + row[+1][i]     (lower output row)
;   2. Horizontal pass on each intermediate row:
;         out[2*i]   = (3 * Int[i] + Int[i-1] + 8) >> 4
;         out[2*i+1] = (3 * Int[i] + Int[i+1] + 7) >> 4
; Edge handling mirrors the h2v1 routine: the first sample uses itself as
; its left neighbour, and the last block uses its own last word as the
; right neighbour.
;
; The routine reads the input rows at input_data[-1], input_data[0] and
; input_data[+1], so the caller must provide valid row pointers on both
; sides of each row.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
;                                 JDIMENSION downsampled_width,
;                                 JSAMPARRAY input_data,
;                                 JSAMPARRAY * output_data_ptr);
;

; Register map after collect_args (macro from jsimdext.inc):
; r10 = int max_v_samp_factor
; r11 = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY * output_data_ptr

; Aligned scratch area just below rbp:
;   wk(0) / wk(1): last intermediate word of the previous block
;                  (upper / lower row), held in word position 0
;   wk(2) / wk(3): right-hand neighbour word for the current block
;                  (upper / lower row), held in word position 7
%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM          4

        align   16
        global  EXTN(jsimd_h2v2_fancy_upsample_sse2)

EXTN(jsimd_h2v2_fancy_upsample_sse2):
        push    rbp
        mov     rax, rsp                        ; rax = rsp after pushing rbp
        sub     rsp, byte 4
        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [rsp], rax                      ; remember the unaligned rsp
        mov     rbp, rsp                        ; rbp = aligned frame base
        lea     rsp, [wk(0)]                    ; reserve wk[0..WK_NUM-1]
        collect_args
        push    rbx

        mov     eax, r11d               ; colctr (32-bit move zero-extends)
        test    rax, rax
        jz      near .return

        ; max_v_samp_factor is a 32-bit int: only the low half of r10 is
        ; meaningful, so sign-extend it instead of copying all 64 bits.
        movsxd  rcx, r10d               ; rowctr
        test    rcx, rcx
        jz      near .return

        mov     rsi, r12                ; input_data
        mov     rdi, r13
        mov     rdi, JSAMPARRAY [rdi]   ; output_data = *output_data_ptr
.rowloop:
        push    rax                                     ; colctr
        push    rcx                                     ; rowctr (rcx is reused)
        push    rdi
        push    rsi

        mov     rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
        mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
        mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1

        test    rax, SIZEOF_XMMWORD-1   ; partial last block?
        jz      short .skip
        push    rdx                     ; dl is needed as a scratch byte
        mov     dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
        mov     dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
        mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
        pop     rdx
.skip:
        ; -- process the first column block

        movdqa  xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]    ; xmm0=row[ 0][0]
        movdqa  xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]    ; xmm1=row[-1][0]
        movdqa  xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; xmm2=row[+1][0]

        pxor      xmm3, xmm3            ; xmm3=(all 0's)
        movdqa    xmm4, xmm0
        punpcklbw xmm0, xmm3            ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm4, xmm3            ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
        movdqa    xmm5, xmm1
        punpcklbw xmm1, xmm3            ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm5, xmm3            ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
        movdqa    xmm6, xmm2
        punpcklbw xmm2, xmm3            ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm6, xmm3            ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

        pmullw  xmm0, [rel PW_THREE]
        pmullw  xmm4, [rel PW_THREE]

        pcmpeqb xmm7, xmm7
        psrldq  xmm7, (SIZEOF_XMMWORD-2)        ; mask: lowest word only

        paddw   xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
        paddw   xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
        paddw   xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
        paddw   xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

        movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1    ; temporarily save
        movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5    ; the intermediate data
        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6

        pand    xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
        pand    xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)

        ; The first sample acts as its own left neighbour.
        movdqa  XMMWORD [wk(0)], xmm1
        movdqa  XMMWORD [wk(1)], xmm2

        add     rax, byte SIZEOF_XMMWORD-1
        and     rax, byte -SIZEOF_XMMWORD       ; round colctr up to a block
        cmp     rax, byte SIZEOF_XMMWORD
        ja      short .columnloop

.columnloop_last:
        ; -- process the last column block
        ; The right-hand neighbour is this block's own last intermediate word.

        pcmpeqb xmm1, xmm1
        pslldq  xmm1, (SIZEOF_XMMWORD-2)        ; mask: highest word only
        movdqa  xmm2, xmm1

        pand    xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
        pand    xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]

        movdqa  XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
        movdqa  XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)

        jmp     near .upsample

.columnloop:
        ; -- process the next column block
        ; Run the vertical pass one block ahead so that its first word can
        ; serve as the right-hand neighbour of the current block.

        movdqa  xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]    ; xmm0=row[ 0][1]
        movdqa  xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]    ; xmm1=row[-1][1]
        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]    ; xmm2=row[+1][1]

        pxor      xmm3, xmm3            ; xmm3=(all 0's)
        movdqa    xmm4, xmm0
        punpcklbw xmm0, xmm3            ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm4, xmm3            ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
        movdqa    xmm5, xmm1
        punpcklbw xmm1, xmm3            ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm5, xmm3            ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
        movdqa    xmm6, xmm2
        punpcklbw xmm2, xmm3            ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm6, xmm3            ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

        pmullw  xmm0, [rel PW_THREE]
        pmullw  xmm4, [rel PW_THREE]

        paddw   xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
        paddw   xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
        paddw   xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
        paddw   xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

        movdqa  XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1    ; temporarily save
        movdqa  XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5    ; the intermediate data
        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6

        pslldq  xmm1, (SIZEOF_XMMWORD-2)        ; xmm1=(-- -- -- -- -- -- --  0)
        pslldq  xmm2, (SIZEOF_XMMWORD-2)        ; xmm2=(-- -- -- -- -- -- --  0)

        movdqa  XMMWORD [wk(2)], xmm1
        movdqa  XMMWORD [wk(3)], xmm2

.upsample:
        ; -- process the upper row

        movdqa  xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa  xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]

        movdqa  xmm0, xmm7              ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
        movdqa  xmm4, xmm3              ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
        psrldq  xmm0, 2                 ; xmm0=( 1  2  3  4  5  6  7 --)
        pslldq  xmm4, (SIZEOF_XMMWORD-2)        ; xmm4=(-- -- -- -- -- -- --  8)
        movdqa  xmm5, xmm7
        movdqa  xmm6, xmm3
        psrldq  xmm5, (SIZEOF_XMMWORD-2)        ; xmm5=( 7 -- -- -- -- -- -- --)
        pslldq  xmm6, 2                 ; xmm6=(--  8  9 10 11 12 13 14)

        por     xmm0, xmm4              ; xmm0=( 1  2  3  4  5  6  7  8)
        por     xmm5, xmm6              ; xmm5=( 7  8  9 10 11 12 13 14)

        movdqa  xmm1, xmm7
        movdqa  xmm2, xmm3
        pslldq  xmm1, 2                 ; xmm1=(--  0  1  2  3  4  5  6)
        psrldq  xmm2, 2                 ; xmm2=( 9 10 11 12 13 14 15 --)
        movdqa  xmm4, xmm3
        psrldq  xmm4, (SIZEOF_XMMWORD-2)        ; xmm4=(15 -- -- -- -- -- -- --)

        por     xmm1, XMMWORD [wk(0)]   ; xmm1=(-1  0  1  2  3  4  5  6)
        por     xmm2, XMMWORD [wk(2)]   ; xmm2=( 9 10 11 12 13 14 15 16)

        movdqa  XMMWORD [wk(0)], xmm4   ; left neighbour for the next block

        pmullw  xmm7, [rel PW_THREE]
        pmullw  xmm3, [rel PW_THREE]
        paddw   xmm1, [rel PW_EIGHT]    ; left neighbours + 8
        paddw   xmm5, [rel PW_EIGHT]
        paddw   xmm0, [rel PW_SEVEN]    ; right neighbours + 7
        paddw   xmm2, [rel PW_SEVEN]

        paddw   xmm1, xmm7
        paddw   xmm5, xmm3
        psrlw   xmm1, 4                 ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
        psrlw   xmm5, 4                 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
        paddw   xmm0, xmm7
        paddw   xmm2, xmm3
        psrlw   xmm0, 4                 ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
        psrlw   xmm2, 4                 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)

        ; Interleave: odd results go to the high byte of each word.
        psllw   xmm0, BYTE_BIT
        psllw   xmm2, BYTE_BIT
        por     xmm1, xmm0              ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
        por     xmm5, xmm2              ; xmm5=Out0H=(16 17 18 ... 29 30 31)

        movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
        movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5

        ; -- process the lower row

        movdqa  xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
        movdqa  xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]

        movdqa  xmm7, xmm6              ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
        movdqa  xmm3, xmm4              ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
        psrldq  xmm7, 2                 ; xmm7=( 1  2  3  4  5  6  7 --)
        pslldq  xmm3, (SIZEOF_XMMWORD-2)        ; xmm3=(-- -- -- -- -- -- --  8)
        movdqa  xmm0, xmm6
        movdqa  xmm2, xmm4
        psrldq  xmm0, (SIZEOF_XMMWORD-2)        ; xmm0=( 7 -- -- -- -- -- -- --)
        pslldq  xmm2, 2                 ; xmm2=(--  8  9 10 11 12 13 14)

        por     xmm7, xmm3              ; xmm7=( 1  2  3  4  5  6  7  8)
        por     xmm0, xmm2              ; xmm0=( 7  8  9 10 11 12 13 14)

        movdqa  xmm1, xmm6
        movdqa  xmm5, xmm4
        pslldq  xmm1, 2                 ; xmm1=(--  0  1  2  3  4  5  6)
        psrldq  xmm5, 2                 ; xmm5=( 9 10 11 12 13 14 15 --)
        movdqa  xmm3, xmm4
        psrldq  xmm3, (SIZEOF_XMMWORD-2)        ; xmm3=(15 -- -- -- -- -- -- --)

        por     xmm1, XMMWORD [wk(1)]   ; xmm1=(-1  0  1  2  3  4  5  6)
        por     xmm5, XMMWORD [wk(3)]   ; xmm5=( 9 10 11 12 13 14 15 16)

        movdqa  XMMWORD [wk(1)], xmm3   ; left neighbour for the next block

        pmullw  xmm6, [rel PW_THREE]
        pmullw  xmm4, [rel PW_THREE]
        paddw   xmm1, [rel PW_EIGHT]    ; left neighbours + 8
        paddw   xmm0, [rel PW_EIGHT]
        paddw   xmm7, [rel PW_SEVEN]    ; right neighbours + 7
        paddw   xmm5, [rel PW_SEVEN]

        paddw   xmm1, xmm6
        paddw   xmm0, xmm4
        psrlw   xmm1, 4                 ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
        psrlw   xmm0, 4                 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
        paddw   xmm7, xmm6
        paddw   xmm5, xmm4
        psrlw   xmm7, 4                 ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
        psrlw   xmm5, 4                 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)

        psllw   xmm7, BYTE_BIT
        psllw   xmm5, BYTE_BIT
        por     xmm1, xmm7              ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
        por     xmm0, xmm5              ; xmm0=Out1H=(16 17 18 ... 29 30 31)

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0

        sub     rax, byte SIZEOF_XMMWORD
        add     rcx, byte 1*SIZEOF_XMMWORD      ; inptr1(above)
        add     rbx, byte 1*SIZEOF_XMMWORD      ; inptr0
        add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr1(below)
        add     rdx, byte 2*SIZEOF_XMMWORD      ; outptr0
        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr1
        cmp     rax, byte SIZEOF_XMMWORD
        ja      near .columnloop        ; more than one block left
        test    rax, rax
        jnz     near .columnloop_last   ; exactly one block left

        pop     rsi
        pop     rdi
        pop     rcx
        pop     rax

        add     rsi, byte 1*SIZEOF_JSAMPROW     ; input_data
        add     rdi, byte 2*SIZEOF_JSAMPROW     ; output_data
        sub     rcx, byte 2                     ; rowctr (two output rows done)
        jg      near .rowloop

.return:
        pop     rbx
        uncollect_args
        mov     rsp, rbp        ; rsp <- aligned frame base
        pop     rsp             ; rsp <- unaligned rsp saved in the prologue
        pop     rbp
        ret
482
483; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; Every input sample is written twice in a row to the output
; (out[2*i] = out[2*i+1] = in[i]).  The column counter is output_width
; rounded up to a multiple of 2*SIZEOF_XMMWORD; each step turns one
; SIZEOF_XMMWORD block of input into two blocks of output, and the loop body
; is unrolled to handle two such steps per iteration.  Rows are accessed
; with movdqa, so the row buffers must be suitably aligned and padded by the
; caller.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
;                           JDIMENSION output_width,
;                           JSAMPARRAY input_data,
;                           JSAMPARRAY * output_data_ptr);
;

; Register map after collect_args (macro from jsimdext.inc):
; r10 = int max_v_samp_factor
; r11 = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v1_upsample_sse2)

EXTN(jsimd_h2v1_upsample_sse2):
        push    rbp
        mov     rax, rsp                ; NOTE(review): presumably consumed by
                                        ; collect_args - confirm in jsimdext.inc
        mov     rbp, rsp
        collect_args

        mov     edx, r11d               ; output_width (zero-extended)
        add     rdx, byte (2*SIZEOF_XMMWORD)-1
        and     rdx, byte -(2*SIZEOF_XMMWORD)   ; round up to an output pair
        jz      near .return

        ; max_v_samp_factor is a 32-bit int: only the low half of r10 is
        ; meaningful, so sign-extend it instead of copying all 64 bits.
        movsxd  rcx, r10d               ; rowctr
        test    rcx, rcx
        jz      short .return

        mov     rsi, r12                ; input_data
        mov     rdi, r13
        mov     rdi, JSAMPARRAY [rdi]   ; output_data = *output_data_ptr
.rowloop:
        push    rdi
        push    rsi

        mov     rsi, JSAMPROW [rsi]             ; inptr
        mov     rdi, JSAMPROW [rdi]             ; outptr
        mov     rax, rdx                        ; colctr
.columnloop:

        ; First half of the unrolled body: input block 0.
        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

        movdqa    xmm1, xmm0
        punpcklbw xmm0, xmm0            ; duplicate each of the low 8 bytes
        punpckhbw xmm1, xmm1            ; duplicate each of the high 8 bytes

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

        sub     rax, byte 2*SIZEOF_XMMWORD
        jz      short .nextrow

        ; Second half of the unrolled body: input block 1.
        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

        movdqa    xmm3, xmm2
        punpcklbw xmm2, xmm2
        punpckhbw xmm3, xmm3

        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

        sub     rax, byte 2*SIZEOF_XMMWORD
        jz      short .nextrow

        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
        add     rdi, byte 4*SIZEOF_XMMWORD      ; outptr
        jmp     short .columnloop

.nextrow:
        pop     rsi
        pop     rdi

        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
        dec     rcx                             ; rowctr
        jg      short .rowloop

.return:
        uncollect_args
        pop     rbp
        ret
571
572; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; Every input sample is written twice horizontally, and the resulting row
; is stored to two consecutive output rows.  The column counter is
; output_width rounded up to a multiple of 2*SIZEOF_XMMWORD; the loop body
; is unrolled to handle two input blocks per iteration.  The row counter
; advances by two per input row because each input row yields two output
; rows.  Rows are accessed with movdqa, so the row buffers must be suitably
; aligned and padded by the caller.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
;                           JDIMENSION output_width,
;                           JSAMPARRAY input_data,
;                           JSAMPARRAY * output_data_ptr);
;

; Register map after collect_args (macro from jsimdext.inc):
; r10 = int max_v_samp_factor
; r11 = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v2_upsample_sse2)

EXTN(jsimd_h2v2_upsample_sse2):
        push    rbp
        mov     rax, rsp                ; NOTE(review): presumably consumed by
                                        ; collect_args - confirm in jsimdext.inc
        mov     rbp, rsp
        collect_args
        push    rbx                     ; rbx is used for outptr0 below

        mov     edx, r11d               ; output_width (zero-extended)
        add     rdx, byte (2*SIZEOF_XMMWORD)-1
        and     rdx, byte -(2*SIZEOF_XMMWORD)   ; round up to an output pair
        jz      near .return

        ; max_v_samp_factor is a 32-bit int: only the low half of r10 is
        ; meaningful, so sign-extend it instead of copying all 64 bits.
        movsxd  rcx, r10d               ; rowctr
        test    rcx, rcx
        jz      near .return

        mov     rsi, r12                ; input_data
        mov     rdi, r13
        mov     rdi, JSAMPARRAY [rdi]   ; output_data = *output_data_ptr
.rowloop:
        push    rdi
        push    rsi

        mov     rsi, JSAMPROW [rsi]                     ; inptr
        mov     rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
        mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1
        mov     rax, rdx                                ; colctr
.columnloop:

        ; First half of the unrolled body: input block 0.
        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

        movdqa    xmm1, xmm0
        punpcklbw xmm0, xmm0            ; duplicate each of the low 8 bytes
        punpckhbw xmm1, xmm1            ; duplicate each of the high 8 bytes

        ; The same doubled data goes to both output rows.
        movdqa  XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
        movdqa  XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

        sub     rax, byte 2*SIZEOF_XMMWORD
        jz      short .nextrow

        ; Second half of the unrolled body: input block 1.
        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

        movdqa    xmm3, xmm2
        punpcklbw xmm2, xmm2
        punpckhbw xmm3, xmm3

        movdqa  XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

        sub     rax, byte 2*SIZEOF_XMMWORD
        jz      short .nextrow

        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
        add     rbx, byte 4*SIZEOF_XMMWORD      ; outptr0
        add     rdi, byte 4*SIZEOF_XMMWORD      ; outptr1
        jmp     short .columnloop

.nextrow:
        pop     rsi
        pop     rdi

        add     rsi, byte 1*SIZEOF_JSAMPROW     ; input_data
        add     rdi, byte 2*SIZEOF_JSAMPROW     ; output_data
        sub     rcx, byte 2                     ; rowctr (two output rows done)
        jg      near .rowloop

.return:
        pop     rbx
        uncollect_args
        pop     rbp
        ret
668
669; For some reason, the OS X linker does not honor the request to align the
670; segment unless we do this.
671        align   16
672