;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; FIRST_2_ROWS -- first half of the deblock filter used by
; vp8_post_proc_down_and_across_mb_row_sse2.
;
; In:    xmm0   = 16 centre pixels
;        xmm1   = first neighbour vector
;        xmm3   = second neighbour vector
;        flimit = memory operand holding 16 per-pixel thresholds; it is
;                 read with movdqa, so it must be 16-byte aligned
; Out:   xmm5   = pavgb(xmm1, xmm3) of the incoming neighbours
;        xmm7   = byte mask, 0xff where |xmm0 - xmm1| >= flimit or
;                 |xmm0 - xmm3| >= flimit
; Keeps: xmm0
; Clobb: xmm1 (left as zero), xmm2, xmm3, xmm4, xmm6
;-----------------------------------------------------------------------
%macro FIRST_2_ROWS 0
        movdqa      xmm4,       xmm0
        movdqa      xmm6,       xmm0
        movdqa      xmm5,       xmm1
        pavgb       xmm5,       xmm3             ; xmm5 = avg of the two neighbours

        ; absolute differences: the two saturating subtractions give
        ; max(a-b,0) and max(b-a,0); their sum is |a-b|
        psubusb     xmm4,       xmm1
        psubusb     xmm1,       xmm0
        psubusb     xmm6,       xmm3
        psubusb     xmm3,       xmm0
        paddusb     xmm4,       xmm1             ; xmm4 = |xmm0 - neighbour 1|
        paddusb     xmm6,       xmm3             ; xmm6 = |xmm0 - neighbour 2|

        ; get threshold
        movdqa      xmm2,       flimit
        pxor        xmm1,       xmm1
        movdqa      xmm7,       xmm2

        ; get mask: saturating (flimit - diff) is zero exactly when
        ; diff >= flimit, so the compare against zero flags those bytes
        psubusb     xmm2,       xmm4
        psubusb     xmm7,       xmm6
        pcmpeqb     xmm2,       xmm1
        pcmpeqb     xmm7,       xmm1
        por         xmm7,       xmm2
%endmacro

;-----------------------------------------------------------------------
; SECOND_2_ROWS -- second half of the deblock filter; must follow
; FIRST_2_ROWS with xmm0, xmm5 and xmm7 still intact.
;
; In:    xmm0   = 16 centre pixels
;        xmm1   = third neighbour vector
;        xmm3   = fourth neighbour vector
;        xmm5   = neighbour average produced by FIRST_2_ROWS
;        xmm7   = mask produced by FIRST_2_ROWS
;        flimit = 16-byte-aligned memory operand with the thresholds
; Out:   xmm0   = per byte: the original pixel where any of the four
;                 absolute differences is >= flimit, otherwise the
;                 filtered value pavgb(pavgb(xmm5, pavgb(xmm1, xmm3)), xmm0)
; Clobb: xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
;-----------------------------------------------------------------------
%macro SECOND_2_ROWS 0
        movdqa      xmm6,       xmm0
        movdqa      xmm4,       xmm0
        movdqa      xmm2,       xmm1             ; keep neighbour 3 for the difference
        pavgb       xmm1,       xmm3             ; xmm1 = avg of neighbours 3 and 4

        ; absolute differences, same trick as in FIRST_2_ROWS
        psubusb     xmm6,       xmm2
        psubusb     xmm2,       xmm0
        psubusb     xmm4,       xmm3
        psubusb     xmm3,       xmm0
        paddusb     xmm6,       xmm2             ; xmm6 = |xmm0 - neighbour 3|
        paddusb     xmm4,       xmm3             ; xmm4 = |xmm0 - neighbour 4|

        pavgb       xmm5,       xmm1             ; combine both neighbour averages

        ; get threshold
        movdqa      xmm2,       flimit
        pxor        xmm1,       xmm1
        movdqa      xmm3,       xmm2

        ; get mask (0xff where diff >= flimit)
        psubusb     xmm2,       xmm6
        psubusb     xmm3,       xmm4
        pcmpeqb     xmm2,       xmm1
        pcmpeqb     xmm3,       xmm1

        por         xmm7,       xmm2
        por         xmm7,       xmm3

        pavgb       xmm5,       xmm0             ; xmm5 = filtered pixels

        ; decide whether to use the filtered value: the two operands of
        ; the final add are disjoint, so the add acts as a byte select
        pand        xmm0,       xmm7             ; original where mask set
        pandn       xmm7,       xmm5             ; filtered where mask clear
        paddusb     xmm0,       xmm7
%endmacro

;-----------------------------------------------------------------------
; UPDATE_FLIMIT -- copy the next 16 threshold bytes to the stack slot.
;
; In:    rbx = pointer to the next 16 thresholds; read with movdqa, so
;              it must be 16-byte aligned
;        rsp = 16-byte-aligned scratch slot (written with movdqa)
; Out:   [rsp] = the 16 bytes from [rbx]; rbx advanced by 16
; Clobb: xmm2, flags
;-----------------------------------------------------------------------
%macro UPDATE_FLIMIT 0
        movdqa      xmm2,       XMMWORD PTR [rbx]
        movdqa      [rsp],      xmm2
        add         rbx,        16
%endmacro

;-----------------------------------------------------------------------
;void vp8_post_proc_down_and_across_mb_row_sse2
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int cols,
;    int *flimits,
;    int size
;)
;
; For each of `size` rows:
;   1. "down" pass:   filter vertically, reading src rows -2..+2 and
;                     writing the result to dst, 16 columns at a time.
;   2. "across" pass: filter the dst row horizontally in place, reading
;                     dst columns -2..+2.
;
; Register roles:
;   rsi = src row, rdi = dst row, rax = src stride (negated briefly),
;   rbx = cursor into flimits, rcx = rows remaining, rdx = column offset.
;
; Memory touched besides the row itself:
;   - 8 bytes are written at dst-8 (left border) and 8 bytes at dst+cols
;     (right border) on every row.
;   - columns are processed in groups of 16, so loads and stores run up
;     to the next multiple of 16 past `cols`.
;   - flimits is read with movdqa, 16 bytes per 16 columns, so it must be
;     16-byte aligned.
;
; Uses mm0/mm1 and never executes emms.
; NOTE(review): presumably the caller clears the MMX state -- confirm.
;-----------------------------------------------------------------------
global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE
sym(vp8_post_proc_down_and_across_mb_row_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog
    ALIGN_STACK 16, rax
    sub         rsp, 16                          ; 16-byte slot for the current flimit vector

        ; put flimit on stack
        mov         rbx,        arg(5)           ;flimits ptr
        UPDATE_FLIMIT

%define flimit [rsp]

        mov         rsi,        arg(0)           ;src_ptr
        mov         rdi,        arg(1)           ;dst_ptr

        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line
        movsxd      rcx,        DWORD PTR arg(6) ;rows in a macroblock
.nextrow:
        xor         rdx,        rdx              ;col
.nextcol:
        ;load current and next 2 rows
        movdqu      xmm0,       XMMWORD PTR [rsi]
        movdqu      xmm1,       XMMWORD PTR [rsi + rax]
        movdqu      xmm3,       XMMWORD PTR [rsi + 2*rax]

        FIRST_2_ROWS

        ;load above 2 rows
        neg         rax                          ; rax = -stride
        movdqu      xmm1,       XMMWORD PTR [rsi + 2*rax]
        movdqu      xmm3,       XMMWORD PTR [rsi + rax]

        SECOND_2_ROWS

        movdqu      XMMWORD PTR [rdi], xmm0

        neg         rax                          ; positive stride
        add         rsi,        16
        add         rdi,        16

        add         rdx,        16
        cmp         edx,        dword arg(4)     ;cols
        jge         .downdone
        UPDATE_FLIMIT
        jmp         .nextcol

.downdone:
        ; done with all the cols, start the across filtering in place
        sub         rsi,        rdx              ; rewind both pointers to the row start
        sub         rdi,        rdx

        mov         rbx,        arg(5) ; flimits
        UPDATE_FLIMIT

        ; dup the first byte into the left border 8 times
        movq        mm1,   [rdi]
        punpcklbw   mm1,   mm1
        punpcklwd   mm1,   mm1
        punpckldq   mm1,   mm1
        mov         rdx,    -8
        movq        [rdi+rdx], mm1

        ; dup the last byte into the right border
        movsxd      rdx,    dword arg(4)
        movq        mm1,   [rdi + rdx + -1]
        punpcklbw   mm1,   mm1
        punpcklwd   mm1,   mm1
        punpckldq   mm1,   mm1
        movq        [rdi+rdx], mm1

        ; Stores in the across pass lag 16 bytes behind the loads so that
        ; each group still reads its unfiltered left neighbours. Prime
        ; mm0/mm1 with what is already at dst-16/dst-8 so the first
        ; delayed store rewrites those bytes unchanged.
        xor         rdx,        rdx
        movq        mm0,        QWORD PTR [rdi-16];
        movq        mm1,        QWORD PTR [rdi-8];

.acrossnextcol:
        movdqu      xmm0,       XMMWORD PTR [rdi + rdx]
        movdqu      xmm1,       XMMWORD PTR [rdi + rdx -2]
        movdqu      xmm3,       XMMWORD PTR [rdi + rdx -1]

        FIRST_2_ROWS

        movdqu      xmm1,       XMMWORD PTR [rdi + rdx +1]
        movdqu      xmm3,       XMMWORD PTR [rdi + rdx +2]

        SECOND_2_ROWS

        movq        QWORD PTR [rdi+rdx-16], mm0  ; store previous 8 bytes
        movq        QWORD PTR [rdi+rdx-8], mm1   ; store previous 8 bytes
        movdq2q     mm0,        xmm0             ; hold back the low 8 results
        psrldq      xmm0,       8
        movdq2q     mm1,        xmm0             ; hold back the high 8 results

        add         rdx,        16
        cmp         edx,        dword arg(4)     ;cols
        jge         .acrossdone
        UPDATE_FLIMIT
        jmp         .acrossnextcol

.acrossdone:
        ; flush the last 16 pixels that are still held in mm0/mm1
        movq        QWORD PTR [rdi+rdx-16], mm0

        ; the upper 8 bytes are only stored when the 16-column stepping
        ; ended exactly on cols
        cmp         edx,        dword arg(4)
        jne         .throw_last_8
        movq        QWORD PTR [rdi+rdx-8], mm1
.throw_last_8:
        ; done with this row
        add         rsi,rax                      ;next src line
        ; sign-extend the strides, matching the initial load of
        ; src_pixels_per_line above; a plain 32-bit mov would zero-extend
        ; on x86-64 and mis-step on a negative stride
        movsxd      rax, dword arg(3)            ;dst_pixels_per_line
        add         rdi,rax                      ;next destination
        movsxd      rax, dword arg(2)            ;src_pixels_per_line

        mov         rbx,        arg(5)           ;flimits
        UPDATE_FLIMIT

        dec         rcx                          ;decrement count
        jnz         .nextrow                     ;next row

    add rsp, 16
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    pop rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit

;-----------------------------------------------------------------------
;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
;                            int pitch, int rows, int cols,int flimit)
;
; Vertical smoothing, done in place on 8-column-wide strips.
; For every pixel a 15-row window (rows r-7 .. r+7) is kept as a running
; sum and sum of squares. Where 15*sumsq - sum*sum < flimit the pixel is
; replaced by (pixel + sum + vp8_rv[...]) >> 4; otherwise it is kept.
;
; Register roles inside a strip:
;   rsi = row leaving the window (r-8); also the row written back
;   rdi = row entering the window (r+7)
;   rax = pitch, rdx = r, rcx = scratch index
;   xmm5 = 8 x 16-bit sums, xmm6/xmm7 = 8 x 32-bit sums of squares
;   xmm0 = zero
;
; Memory touched besides the plane itself:
;   - 8 rows above the first row and 8 rows below the last row are
;     overwritten with copies of the first/last row.
;   - results are staged in a 16-entry ring d[16][8] on the stack and
;     written back 8 rows late, so the window still reads unfiltered data.
;
; arg(0), arg(2) and arg(3) (the shadowed copies) are modified.
; Uses mm0 and never executes emms.
; NOTE(review): presumably the caller clears the MMX state -- confirm.
;-----------------------------------------------------------------------
extern sym(vp8_rv)
global sym(vp8_mbpost_proc_down_xmm) PRIVATE
sym(vp8_mbpost_proc_down_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 128+16

    ; unsigned char d[16][8] at [rsp]
    ; create flimit4 (flimit replicated into 4 dwords) at [rsp+128]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
    mov         [rsp+128+8], eax
    mov         [rsp+128+12], eax
%define flimit4 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8,       [GLOBAL(sym(vp8_rv))]
%endif

    ;rows +=8;
    add         dword arg(2), 8

    ;for(c=0; c<cols; c+=8)
.loop_col:
            mov         rsi,        arg(0) ; s
            pxor        xmm0,       xmm0        ;

            movsxd      rax,        dword ptr arg(1) ;pitch       ;

            ; this copies the last row down into the border 8 rows
            mov         rdi,        rsi
            ; rows is an int: load it as a sign-extended dword rather than
            ; reading the whole 64-bit argument slot
            movsxd      rdx,        dword arg(2)
            sub         rdx,        9                           ; (rows+8)-9 = last row index
            imul        rdx,        rax
            lea         rdi,        [rdi+rdx]
            movq        xmm1,       QWORD ptr[rdi]              ; last row
            mov         rcx,        8
.init_borderd:                                                  ; initialize borders
            lea         rdi,        [rdi + rax]
            movq        [rdi],      xmm1

            dec         rcx
            jne         .init_borderd

            neg         rax                                     ; rax = -pitch

            ; this copies the first row up into the border 8 rows
            mov         rdi,        rsi
            movq        xmm1,       QWORD ptr[rdi]              ; first row
            mov         rcx,        8
.init_border:                                                   ; initialize borders
            lea         rdi,        [rdi + rax]
            movq        [rdi],      xmm1

            dec         rcx
            jne         .init_border



            lea         rsi,        [rsi + rax*8];              ; rsi = s[-pitch*8]
            neg         rax                                     ; rax = +pitch again

            pxor        xmm5,       xmm5
            pxor        xmm6,       xmm6        ;

            pxor        xmm7,       xmm7        ;
            mov         rdi,        rsi

            mov         rcx,        15          ;

            ; accumulate sum and sum of squares over rows -8 .. +6
.loop_initvar:
            movq        xmm1,       QWORD PTR [rdi];
            punpcklbw   xmm1,       xmm0        ;

            paddw       xmm5,       xmm1        ;
            pmullw      xmm1,       xmm1        ;

            movdqa      xmm2,       xmm1        ;
            punpcklwd   xmm1,       xmm0        ;

            punpckhwd   xmm2,       xmm0        ;
            paddd       xmm6,       xmm1        ;

            paddd       xmm7,       xmm2        ;
            lea         rdi,        [rdi+rax]   ;

            dec         rcx
            jne         .loop_initvar
            ;save the var and sum
            xor         rdx,        rdx
.loop_row:
            movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]
            movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]

            punpcklbw   xmm1,       xmm0
            punpcklbw   xmm2,       xmm0

            ; slide the window: add the entering row, drop the leaving row
            paddw       xmm5,       xmm2
            psubw       xmm5,       xmm1

            pmullw      xmm2,       xmm2
            movdqa      xmm4,       xmm2

            punpcklwd   xmm2,       xmm0
            punpckhwd   xmm4,       xmm0

            paddd       xmm6,       xmm2
            paddd       xmm7,       xmm4

            pmullw      xmm1,       xmm1
            movdqa      xmm2,       xmm1

            punpcklwd   xmm1,       xmm0
            psubd       xmm6,       xmm1

            punpckhwd   xmm2,       xmm0
            psubd       xmm7,       xmm2


            ; xmm3/xmm4 = 15*sumsq (16*x - x), low/high four columns
            movdqa      xmm3,       xmm6
            pslld       xmm3,       4

            psubd       xmm3,       xmm6
            movdqa      xmm1,       xmm5

            ; xmm1/xmm2 = sum*sum as dwords (low and high product halves
            ; interleaved)
            movdqa      xmm4,       xmm5
            pmullw      xmm1,       xmm1

            pmulhw      xmm4,       xmm4
            movdqa      xmm2,       xmm1

            punpcklwd   xmm1,       xmm4
            punpckhwd   xmm2,       xmm4

            movdqa      xmm4,       xmm7
            pslld       xmm4,       4

            psubd       xmm4,       xmm7

            psubd       xmm3,       xmm1
            psubd       xmm4,       xmm2

            psubd       xmm3,       flimit4
            psubd       xmm4,       flimit4

            ; sign bit -> all-ones mask where 15*sumsq - sum*sum < flimit
            psrad       xmm3,       31
            psrad       xmm4,       31

            packssdw    xmm3,       xmm4
            packsswb    xmm3,       xmm0

            movq        xmm1,       QWORD PTR [rsi+rax*8]   ; current row r

            movq        xmm2,       xmm1                    ; keep the unfiltered pixels
            punpcklbw   xmm1,       xmm0

            paddw       xmm1,       xmm5
            mov         rcx,        rdx

            and         rcx,        127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
            push        rax
            lea         rax,        [GLOBAL(sym(vp8_rv))]
            movdqu      xmm4,       [rax + rcx*2] ;vp8_rv[rcx*2]
            pop         rax
%elif ABI_IS_32BIT=0
            movdqu      xmm4,       [r8 + rcx*2] ;vp8_rv[rcx*2]
%else
            movdqu      xmm4,       [sym(vp8_rv) + rcx*2]
%endif

            paddw       xmm1,       xmm4
            ;paddw     xmm1,       eight8s
            psraw       xmm1,       4

            packuswb    xmm1,       xmm0
            pand        xmm1,       xmm3                    ; filtered where mask set

            pandn       xmm3,       xmm2                    ; original elsewhere
            por         xmm1,       xmm3

            and         rcx,        15
            movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[rcx*8]

            ; nothing to write back until the ring holds 8 rows
            cmp         edx,        8
            jl          .skip_assignment

            mov         rcx,        rdx
            sub         rcx,        8
            and         rcx,        15
            movq        mm0,        [rsp + rcx*8] ;d[rcx*8]
            movq        [rsi],      mm0                     ; write back row r-8

.skip_assignment:
            lea         rsi,        [rsi+rax]

            lea         rdi,        [rdi+rax]
            add         rdx,        1

            cmp         edx,        dword arg(2) ;rows
            jl          .loop_row

        ; s += 8, done at full pointer width: a dword add on the argument
        ; slot would only update the low half of a 64-bit pointer.
        ; rsi is free here; it is reloaded at .loop_col.
        mov         rsi,        arg(0)
        add         rsi,        8
        mov         arg(0),     rsi
        sub         dword arg(3), 8 ; cols -= 8
        cmp         dword arg(3), 0
        jg          .loop_col

    add         rsp, 128+16
    pop         rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4


;-----------------------------------------------------------------------
;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
;                                int pitch, int rows, int cols,int flimit)
;
; Horizontal smoothing, done in place one row at a time.
; For every pixel a 15-pixel window (columns c-7 .. c+7) is kept as a
; running sum and sum of squares. Where 15*sumsq - sum*sum < flimit the
; pixel is replaced by (pixel + sum + 8) >> 4; otherwise it is kept.
;
; Four pixels are handled per iteration; xmm6/xmm7 carry the running
; sum / sum of squares in lane 0 between iterations.
;
; Memory touched besides the row itself:
;   - 8 bytes at s-8 (left border) and 8 bytes at s+cols (right border)
;     are overwritten on every row; the loop reads up to s+cols+14.
;   - results are written 8 columns late through mm0/mm1, so s-8 .. s-1
;     end up zeroed by the first two delayed stores.
;
; arg(0) and arg(2) (the shadowed copies) are modified.
; Uses mm0/mm1 and never executes emms.
; NOTE(review): presumably the caller clears the MMX state -- confirm.
;-----------------------------------------------------------------------
global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE
sym(vp8_mbpost_proc_across_ip_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16

    ; create flimit4 at [rsp]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp], eax
    mov         [rsp+4], eax
    mov         [rsp+8], eax
    mov         [rsp+12], eax
%define flimit4 [rsp]


    ;for(r=0;r<rows;r++)
.ip_row_loop:

        xor         rdx,    rdx ;sumsq=0;
        xor         rcx,    rcx ;sum=0;
        mov         rsi,    arg(0); s


        ; dup the first byte into the left border 8 times
        movq        mm1,   [rsi]
        punpcklbw   mm1,   mm1
        punpcklwd   mm1,   mm1
        punpckldq   mm1,   mm1

        mov         rdi,    -8                  ; also the start index of .ip_var_loop
        movq        [rsi+rdi], mm1

        ; dup the last byte into the right border
        ; cols goes in rax, not rdx: rdx is the sumsq accumulator and
        ; must still be zero when .ip_var_loop starts
        movsxd      rax,    dword arg(3)
        movq        mm1,   [rsi + rax + -1]
        punpcklbw   mm1,   mm1
        punpcklwd   mm1,   mm1
        punpckldq   mm1,   mm1
        movq        [rsi+rax], mm1

.ip_var_loop:
        ;for(i=-8;i<=6;i++)
        ;{
        ;    sumsq += s[i]*s[i];
        ;    sum   += s[i];
        ;}
        movzx       eax, byte [rsi+rdi]
        add         ecx, eax
        mul         al                          ; ax = al*al; upper half of eax is already 0
        add         edx, eax
        add         rdi, 1
        cmp         rdi, 6
        jle         .ip_var_loop


            ;mov         rax,    sumsq
            ;movd        xmm7,   rax
            movd        xmm7,   edx

            ;mov         rax,    sum
            ;movd        xmm6,   rax
            movd        xmm6,   ecx

            mov         rsi,    arg(0) ;s
            xor         rcx,    rcx

            movsxd      rdx,    dword arg(3) ;cols
            add         rdx,    8               ; 8 extra columns flush the delayed stores
            pxor        mm0,    mm0
            pxor        mm1,    mm1

            pxor        xmm0,   xmm0
.nextcol4:

            movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5  (leaving the window)
            movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10 (entering the window)

            punpcklbw   xmm1,   xmm0                    ; expanding
            punpcklbw   xmm2,   xmm0                    ; expanding

            punpcklwd   xmm1,   xmm0                    ; expanding to dwords
            punpcklwd   xmm2,   xmm0                    ; expanding to dwords

            psubd       xmm2,   xmm1                    ; entering - leaving   (sum delta)
            paddd       xmm1,   xmm1                    ; 2 * leaving

            paddd       xmm1,   xmm2                    ; entering + leaving
            pmaddwd     xmm1,   xmm2                    ; entering^2 - leaving^2 (sumsq delta)

            paddd       xmm6,   xmm2                    ; lane 0: totals for pixel 0
            paddd       xmm7,   xmm1

            pshufd      xmm6,   xmm6,   0               ; broadcast lane 0 to all four lanes
            pshufd      xmm7,   xmm7,   0

            ; prefix-sum the remaining three deltas so that lane k holds
            ; the totals for pixel k
            psrldq      xmm1,       4                   ; lanes: d1 d2 d3 0
            psrldq      xmm2,       4                   ; lanes: d1 d2 d3 0

            pshufd      xmm3,   xmm1,   3               ; lanes: 0 d1 d1 d1
            pshufd      xmm4,   xmm2,   3               ; lanes: 0 d1 d1 d1

            paddd       xmm6,   xmm4
            paddd       xmm7,   xmm3

            pshufd      xmm3,   xmm1,   01011111b       ; lanes: 0 0 d2 d2
            pshufd      xmm4,   xmm2,   01011111b       ; lanes: 0 0 d2 d2

            paddd       xmm7,   xmm3
            paddd       xmm6,   xmm4

            pshufd      xmm3,   xmm1,   10111111b       ; lanes: 0 0 0 d3
            pshufd      xmm4,   xmm2,   10111111b       ; lanes: 0 0 0 d3

            paddd       xmm7,   xmm3
            paddd       xmm6,   xmm4

            movdqa      xmm3,   xmm6
            pmaddwd     xmm3,   xmm3                    ; sum*sum

            movdqa      xmm5,   xmm7
            pslld       xmm5,   4

            psubd       xmm5,   xmm7                    ; 15*sumsq
            psubd       xmm5,   xmm3

            psubd       xmm5,   flimit4
            psrad       xmm5,   31                      ; all ones where 15*sumsq - sum*sum < flimit

            packssdw    xmm5,   xmm0
            packsswb    xmm5,   xmm0

            movd        xmm1,   DWORD PTR [rsi+rcx]     ; the four pixels being filtered
            movq        xmm2,   xmm1                    ; keep the unfiltered copy

            punpcklbw   xmm1,   xmm0
            punpcklwd   xmm1,   xmm0

            paddd       xmm1,   xmm6
            paddd       xmm1,   [GLOBAL(four8s)]        ; +8 rounds the >>4 below

            psrad       xmm1,   4
            packssdw    xmm1,   xmm0

            packuswb    xmm1,   xmm0
            pand        xmm1,   xmm5                    ; filtered where mask set

            pandn       xmm5,   xmm2                    ; original elsewhere
            por         xmm5,   xmm1

            ; two-stage delay: store the result from two iterations ago
            movd        [rsi+rcx-8],  mm0
            movq        mm0,    mm1

            movdq2q     mm1,    xmm5
            psrldq      xmm7,   12                      ; lane 3 totals become lane 0 for the next group

            psrldq      xmm6,   12
            add         rcx,    4

            cmp         rcx,    rdx
            jl          .nextcol4

        ;s+=pitch;
        movsxd rax, dword arg(1)
        add    arg(0), rax

        sub dword arg(2), 1 ;rows-=1
        cmp dword arg(2), 0
        jg .ip_row_loop

    add         rsp, 16
    pop         rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4


;-----------------------------------------------------------------------
;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
;                            unsigned char blackclamp[16],
;                            unsigned char whiteclamp[16],
;                            unsigned char bothclamp[16],
;                            unsigned int Width, unsigned int Height, int Pitch)
;
; For each of Height rows: pick an offset (LIBVPX_RAND() & 0xff) into
; `noise`, clamp every pixel with the three clamp vectors, then add the
; noise bytes with a wrapping paddb and store the result in place.
;
; Requirements visible in the code:
;   - only arg(2) is loaded; whiteclamp and bothclamp are read at
;     blackclamp+16 and blackclamp+32, so the three vectors must be
;     contiguous in black/white/both order.
;   - the clamp vectors are used as memory operands of psubusb/paddusb,
;     so blackclamp must be 16-byte aligned.
;   - pixels are processed 16 at a time until the offset reaches Width,
;     so up to 15 bytes past Width are read and written per row, and up
;     to 255 + that many noise bytes are read.
;
; arg(0) and arg(6) (the shadowed copies) are modified.
; Clobb: rax, rcx, rdx, xmm1, xmm2, plus whatever LIBVPX_RAND clobbers.
;-----------------------------------------------------------------------
global sym(vp8_plane_add_noise_wmt) PRIVATE
sym(vp8_plane_add_noise_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

.addnoise_loop:
    call sym(LIBVPX_RAND) WRT_PLT
    mov     rcx, arg(1) ;noise
    and     rax, 0xff               ; random start offset 0..255 into noise
    add     rcx, rax

    ; we rely on the fact that the clamping vectors are stored contiguously
    ; in black/white/both order. Note that we have to reload this here because
    ; rdx could be trashed by rand()
    mov     rdx, arg(2) ; blackclamp


            mov     rdi, rcx               ; rdi = noise for this row
            movsxd  rcx, dword arg(5) ;[Width]
            mov     rsi, arg(0) ;Pos
            xor         rax,rax

.addnoise_nextset:
            movdqu      xmm1,[rsi+rax]         ; get the source

            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
            paddusb     xmm1, [rdx+32] ;bothclamp
            psubusb     xmm1, [rdx+16] ;whiteclamp

            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
            paddb       xmm1,xmm2              ; add it in
            movdqu      [rsi+rax],xmm1         ; store the result

            add         rax,16                 ; move to the next 16 pixels of this line

            cmp         rax, rcx
            jl          .addnoise_nextset

    movsxd  rax, dword arg(7) ; Pitch
    add     arg(0), rax ; Start += Pitch
    sub     dword arg(6), 1   ; Height -= 1
    jg      .addnoise_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
align 16
; four dwords of 8: the rounding term added before the >>4 in
; vp8_mbpost_proc_across_ip_xmm. It is used as a paddd memory operand,
; hence the 16-byte alignment.
four8s:
    times 4 dd 8
724