;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


; ABI helper macros used throughout this file (sym, arg, GLOBAL, GET_GOT,
; SHADOW_ARGS_TO_STACK, ...) come from this include.
%include "vpx_ports/x86_abi_support.asm"

; Fixed-point scale of the blur filter: 128 == 1 << 7.
; VP9_FILTER_SHIFT is the right shift applied to the weighted tap sum.
; VP9_FILTER_WEIGHT is not referenced by the code visible in this file.
%define VP9_FILTER_WEIGHT 128
%define VP9_FILTER_SHIFT  7

;void vp9_post_proc_down_and_across_mmx
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int rows,
;    int cols,
;    int flimit
;)
;
; Thresholded 5-tap blur, four pixels per iteration, done in two passes per
; row:
;   "down"   - vertical filter over source rows r-2..r+2, written to dst.
;   "across" - horizontal filter over the dst row just written, in place.
; Taps are read from the Blur table (16, 16, 64, 16, 16), the weighted sum is
; rounded with rd and shifted right by VP9_FILTER_SHIFT.  A pixel keeps its
; centre value instead of the blurred one if any of the four neighbours
; differs from the centre by more than flimit.
;
; Register roles inside the loops:
;   rsi = source pointer          rdi = destination pointer
;   rax = src_pixels_per_line     rbx = address of Blur
;   rcx = rows remaining          rdx = column offset (steps of 4)
;   mm0 = zero                    mm2 = flimit replicated into 4 words
;
; Both column loops test the count after the body, so at least four columns
; are processed and cols is effectively rounded up to a multiple of four.
; The code reads 8 bytes per load, reads source rows at -2..+2 pitches, and
; in the across pass reads/writes up to 4 bytes left of the row start.
; NOTE(review): this presumably relies on the caller providing a border
; around the processed area - confirm against callers.
global sym(vp9_post_proc_down_and_across_mmx) PRIVATE
sym(vp9_post_proc_down_and_across_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; rbx is overwritten with the Blur table address below, so save it now.
    ; This must happen BEFORE the RD scratch slot is carved out: RD is
    ; addressed as [rsp], so nothing may be pushed after the slot is created.
    push        rbx

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    ; move the global rd onto the stack, since we don't have enough registers
    ; to do PIC addressing
    movq        mm0, [GLOBAL(rd)]
    sub         rsp, 8
    movq        [rsp], mm0
%define RD [rsp]
%else
%define RD [GLOBAL(rd)]
%endif

        lea         rbx, [GLOBAL(Blur)]
        movd        mm2, dword ptr arg(6) ;flimit
        punpcklwd   mm2, mm2
        punpckldq   mm2, mm2              ; mm2 = flimit in all four words

        mov         rsi,        arg(0) ;src_ptr
        mov         rdi,        arg(1) ;dst_ptr

        movsxd      rcx, DWORD PTR arg(4) ;rows
        movsxd      rax, DWORD PTR arg(2) ;src_pixels_per_line (source pitch)
        pxor        mm0, mm0              ; mm0 = 00000000

.nextrow:

        xor         rdx,        rdx       ; clear out rdx for use as loop counter
.nextcol:

        pxor        mm7, mm7              ; mm7 = 00000000
        movq        mm6, [rbx + 32 ]      ; mm6 = kernel 2 taps (centre)
        movq        mm3, [rsi]            ; mm3 = r0 p0..p7
        punpcklbw   mm3, mm0              ; mm3 = r0 p0..p3 as words
        movq        mm1, mm3              ; mm1 = r0 p0..p3 (kept for thresholds)
        pmullw      mm3, mm6              ; mm3 = r0 * kernel 2 taps

        movq        mm6, [rbx + 48]       ; mm6 = kernel 3 taps
        movq        mm5, [rsi + rax]      ; mm5 = r1 p0..p7
        punpcklbw   mm5, mm0              ; mm5 = r1 p0..p3
        pmullw      mm6, mm5              ; mm6 = r1 * kernel 3 taps
        paddusw     mm3, mm6              ; mm3 += mm6

        ; thresholding
        movq        mm7, mm1              ; mm7 = r0 p0..p3
        psubusw     mm7, mm5              ; mm7 = sat(r0 - r1)
        psubusw     mm5, mm1              ; mm5 = sat(r1 - r0)
        paddusw     mm7, mm5              ; mm7 = abs(r0 - r1)
        pcmpgtw     mm7, mm2              ; mm7 = mask: abs diff > flimit

        movq        mm6, [rbx + 64 ]      ; mm6 = kernel 4 taps
        movq        mm5, [rsi + 2*rax]    ; mm5 = r2 p0..p7
        punpcklbw   mm5, mm0              ; mm5 = r2 p0..p3
        pmullw      mm6, mm5              ; mm6 = r2 * kernel 4 taps
        paddusw     mm3, mm6              ; mm3 += mm6

        ; thresholding
        movq        mm6, mm1              ; mm6 = r0 p0..p3
        psubusw     mm6, mm5              ; mm6 = sat(r0 - r2)
        psubusw     mm5, mm1              ; mm5 = sat(r2 - r0)
        paddusw     mm6, mm5              ; mm6 = abs(r0 - r2)
        pcmpgtw     mm6, mm2
        por         mm7, mm6              ; accumulate thresholds


        neg         rax                   ; rax = -pitch, to reach rows above
        movq        mm6, [rbx ]           ; kernel 0 taps
        movq        mm5, [rsi+2*rax]      ; mm5 = r-2 p0..p7
        punpcklbw   mm5, mm0              ; mm5 = r-2 p0..p3
        pmullw      mm6, mm5              ; mm6 = r-2 * kernel 0 taps
        paddusw     mm3, mm6              ; mm3 += mm6

        ; thresholding
        movq        mm6, mm1              ; mm6 = r0 p0..p3
        psubusw     mm6, mm5              ; mm6 = sat(r0 - r-2)
        psubusw     mm5, mm1              ; mm5 = sat(r-2 - r0)
        paddusw     mm6, mm5              ; mm6 = abs(r0 - r-2)
        pcmpgtw     mm6, mm2
        por         mm7, mm6              ; accumulate thresholds

        movq        mm6, [rbx + 16]       ; kernel 1 taps
        movq        mm4, [rsi+rax]        ; mm4 = r-1 p0..p7
        punpcklbw   mm4, mm0              ; mm4 = r-1 p0..p3
        pmullw      mm6, mm4              ; mm6 = r-1 * kernel 1 taps
        paddusw     mm3, mm6              ; mm3 += mm6

        ; thresholding
        movq        mm6, mm1              ; mm6 = r0 p0..p3
        psubusw     mm6, mm4              ; mm6 = sat(r0 - r-1)
        psubusw     mm4, mm1              ; mm4 = sat(r-1 - r0)
        paddusw     mm6, mm4              ; mm6 = abs(r0 - r-1)
        pcmpgtw     mm6, mm2
        por         mm7, mm6              ; accumulate thresholds


        paddusw     mm3, RD               ; mm3 += round value
        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128

        pand        mm1, mm7              ; mm1 = source where a threshold was exceeded
        pandn       mm7, mm3              ; mm7 = blurred result everywhere else
        paddusw     mm1, mm7              ; combination

        packuswb    mm1, mm0              ; pack to bytes

        movd        [rdi], mm1            ; store four filtered pixels
        neg         rax                   ; pitch is positive


        add         rsi, 4
        add         rdi, 4
        add         rdx, 4

        cmp         edx, dword ptr arg(5) ;cols
        jl          .nextcol
        ; done with the all cols, start the across filtering in place
        sub         rsi, rdx              ; rewind to the start of the row
        sub         rdi, rdx


        ; rax is used as scratch (eax) in the across loop.  The pitch is
        ; reloaded from arg(2) afterwards instead of being pushed, so that
        ; rsp - and with it RD - does not move.
        xor         rdx,    rdx
        ; eax = the four bytes left of the row; the first delayed store below
        ; writes them back unchanged.
        mov         eax,    DWORD PTR [rdi-4]

.acrossnextcol:
        pxor        mm7, mm7              ; mm7 = 00000000
        movq        mm6, [rbx + 32 ]      ; mm6 = kernel 2 taps (centre)
        movq        mm4, [rdi+rdx]        ; mm4 = p0..p7
        movq        mm3, mm4              ; mm3 = p0..p7
        punpcklbw   mm3, mm0              ; mm3 = p0..p3
        movq        mm1, mm3              ; mm1 = p0..p3 (kept for thresholds)
        pmullw      mm3, mm6              ; mm3 = p0..p3 * kernel 2 taps

        movq        mm6, [rbx + 48]       ; mm6 = kernel 3 taps
        psrlq       mm4, 8                ; mm4 = p1..p7
        movq        mm5, mm4              ; mm5 = p1..p7
        punpcklbw   mm5, mm0              ; mm5 = p1..p4
        pmullw      mm6, mm5              ; mm6 = p1..p4 * kernel 3 taps
        paddusw     mm3, mm6              ; mm3 += mm6

        ; thresholding
        movq        mm7, mm1              ; mm7 = p0..p3
        psubusw     mm7, mm5              ; mm7 = sat(p0..p3 - p1..p4)
        psubusw     mm5, mm1              ; mm5 = sat(p1..p4 - p0..p3)
        paddusw     mm7, mm5              ; mm7 = abs(p0..p3 - p1..p4)
        pcmpgtw     mm7, mm2

        movq        mm6, [rbx + 64 ]      ; mm6 = kernel 4 taps
        psrlq       mm4, 8                ; mm4 = p2..p7
        movq        mm5, mm4              ; mm5 = p2..p7
        punpcklbw   mm5, mm0              ; mm5 = p2..p5
        pmullw      mm6, mm5              ; mm6 = p2..p5 * kernel 4 taps
        paddusw     mm3, mm6              ; mm3 += mm6

        ; thresholding
        movq        mm6, mm1              ; mm6 = p0..p3
        psubusw     mm6, mm5              ; mm6 = sat(p0..p3 - p2..p5)
        psubusw     mm5, mm1              ; mm5 = sat(p2..p5 - p0..p3)
        paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p2..p5)
        pcmpgtw     mm6, mm2
        por         mm7, mm6              ; accumulate thresholds


        movq        mm6, [rbx ]           ; mm6 = kernel 0 taps
        ; The previous group's result is still only in eax (not yet stored),
        ; so this load still sees the unfiltered p-2, p-1.
        movq        mm4, [rdi+rdx-2]      ; mm4 = p-2..p5
        movq        mm5, mm4              ; mm5 = p-2..p5
        punpcklbw   mm5, mm0              ; mm5 = p-2..p1
        pmullw      mm6, mm5              ; mm6 = p-2..p1 * kernel 0 taps
        paddusw     mm3, mm6              ; mm3 += mm6

        ; thresholding
        movq        mm6, mm1              ; mm6 = p0..p3
        psubusw     mm6, mm5              ; mm6 = sat(p0..p3 - p-2..p1)
        psubusw     mm5, mm1              ; mm5 = sat(p-2..p1 - p0..p3)
        paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p-2..p1)
        pcmpgtw     mm6, mm2
        por         mm7, mm6              ; accumulate thresholds

        movq        mm6, [rbx + 16]       ; mm6 = kernel 1 taps
        psrlq       mm4, 8                ; mm4 = p-1..p5
        punpcklbw   mm4, mm0              ; mm4 = p-1..p2
        pmullw      mm6, mm4              ; mm6 = p-1..p2 * kernel 1 taps
        paddusw     mm3, mm6              ; mm3 += mm6

        ; thresholding
        movq        mm6, mm1              ; mm6 = p0..p3
        psubusw     mm6, mm4              ; mm6 = sat(p0..p3 - p-1..p2)
        psubusw     mm4, mm1              ; mm4 = sat(p-1..p2 - p0..p3)
        paddusw     mm6, mm4              ; mm6 = abs(p0..p3 - p-1..p2)
        pcmpgtw     mm6, mm2
        por         mm7, mm6              ; accumulate thresholds

        paddusw     mm3, RD               ; mm3 += round value
        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128

        pand        mm1, mm7              ; mm1 = centre where a threshold was exceeded
        pandn       mm7, mm3              ; mm7 = blurred result everywhere else
        paddusw     mm1, mm7              ; combination

        packuswb    mm1, mm0              ; pack to bytes
        mov         DWORD PTR [rdi+rdx-4],  eax   ; store previous four bytes
        movd        eax,    mm1           ; hold this result until the next pass

        add         rdx, 4
        cmp         edx, dword ptr arg(5) ;cols
        jl          .acrossnextcol;

        mov         DWORD PTR [rdi+rdx-4],  eax   ; flush the last four bytes

        ; done with this row
        movsxd      rax, dword ptr arg(2) ;src_pixels_per_line
        add         rsi,rax               ; next source line
        movsxd      rax, dword ptr arg(3) ;dst_pixels_per_line
        add         rdi,rax               ; next destination line
        movsxd      rax, dword ptr arg(2) ;src_pixels_per_line, as .nextrow expects

        dec         rcx                   ; decrement count
        jnz         .nextrow               ; next row

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
        add         rsp, 8                ; release the RD scratch slot
%endif
        pop         rbx

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef RD


;void vp9_mbpost_proc_down_mmx(unsigned char *dst,
;                             int pitch, int rows, int cols,int flimit)
;
; Vertical smoothing, in place, over strips four pixels wide.  For every
; strip a running sum (mm5, four words) and running sum of squares (mm6/mm7,
; two dwords each) are kept over a 15-row window.  For each row the current
; pixel is replaced by (pixel + sum + vp9_rv[row & 127]) >> 4 when
; 15 * sum_of_squares - sum * sum is below flimit; otherwise it is kept.
;
; Results are parked in a 16-entry ring on the stack and written back eight
; rows late, so the rows still needed by the trailing edge of the window are
; read before they are overwritten.  That is why rows is bumped by 8.
; The code reads from 8 rows above dst and stores to those rows during the
; first eight iterations.
; NOTE(review): presumably callers guarantee a border of at least 8 rows
; above and below - confirm against callers.
;
; arg(0), arg(2) and arg(3) are modified in place as loop state.
extern sym(vp9_rv)
global sym(vp9_mbpost_proc_down_mmx) PRIVATE
sym(vp9_mbpost_proc_down_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 136

    ; scratch ring d[] at [rsp]: 16 entries, indexed as [rsp + (row & 15)*4]
    ; create flimit2 at [rsp+128]: flimit in both dwords
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
%define flimit2 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8,       [GLOBAL(sym(vp9_rv))]
%endif

    ;rows +=8;
    add         dword ptr arg(2), 8

    ;for(c=0; c<cols; c+=4)
.loop_col:
            mov         rsi,        arg(0)  ;s
            pxor        mm0,        mm0     ; mm0 = 0, used for unpacking

            movsxd      rax,        dword ptr arg(1) ;pitch
            neg         rax                                     ; rax = -pitch

            lea         rsi,        [rsi + rax*8]               ; rsi = &s[-pitch*8]
            neg         rax                                     ; rax = pitch again


            pxor        mm5,        mm5     ; running sum
            pxor        mm6,        mm6     ; running sum of squares, pixels 0-1

            pxor        mm7,        mm7     ; running sum of squares, pixels 2-3
            mov         rdi,        rsi

            mov         rcx,        15      ; prime the window with 15 rows

.loop_initvar:
            movd        mm1,        DWORD PTR [rdi]
            punpcklbw   mm1,        mm0     ; four pixels as words

            paddw       mm5,        mm1     ; sum += x
            pmullw      mm1,        mm1     ; x*x

            movq        mm2,        mm1
            punpcklwd   mm1,        mm0     ; squares of pixels 0-1 as dwords

            punpckhwd   mm2,        mm0     ; squares of pixels 2-3 as dwords
            paddd       mm6,        mm1

            paddd       mm7,        mm2
            lea         rdi,        [rdi+rax]   ; next row; ends at s + pitch*7

            dec         rcx
            jne         .loop_initvar
            ;save the var and sum
            xor         rdx,        rdx     ; rdx = row counter
.loop_row:
            movd        mm1,        DWORD PTR [rsi]     ; [s-pitch*8], leaves the window
            movd        mm2,        DWORD PTR [rdi]     ; [s+pitch*7], enters the window

            punpcklbw   mm1,        mm0
            punpcklbw   mm2,        mm0

            paddw       mm5,        mm2     ; sum += entering
            psubw       mm5,        mm1     ; sum -= leaving

            pmullw      mm2,        mm2
            movq        mm4,        mm2

            punpcklwd   mm2,        mm0
            punpckhwd   mm4,        mm0

            paddd       mm6,        mm2     ; sumsq += entering^2
            paddd       mm7,        mm4

            pmullw      mm1,        mm1
            movq        mm2,        mm1

            punpcklwd   mm1,        mm0
            psubd       mm6,        mm1     ; sumsq -= leaving^2

            punpckhwd   mm2,        mm0
            psubd       mm7,        mm2


            movq        mm3,        mm6
            pslld       mm3,        4

            psubd       mm3,        mm6     ; mm3 = 15 * sumsq (pixels 0-1)
            movq        mm1,        mm5

            movq        mm4,        mm5
            pmullw      mm1,        mm1     ; low  16 bits of sum*sum

            pmulhw      mm4,        mm4     ; high 16 bits of sum*sum
            movq        mm2,        mm1

            punpcklwd   mm1,        mm4     ; sum*sum as dwords, pixels 0-1
            punpckhwd   mm2,        mm4     ; sum*sum as dwords, pixels 2-3

            movq        mm4,        mm7
            pslld       mm4,        4

            psubd       mm4,        mm7     ; mm4 = 15 * sumsq (pixels 2-3)

            psubd       mm3,        mm1
            psubd       mm4,        mm2

            psubd       mm3,        flimit2
            psubd       mm4,        flimit2

            psrad       mm3,        31      ; all ones where the result is negative
            psrad       mm4,        31

            packssdw    mm3,        mm4
            packsswb    mm3,        mm0     ; mm3 = per-byte mask: filter this pixel

            movd        mm1,        DWORD PTR [rsi+rax*8]   ; current row

            movq        mm2,        mm1     ; mm2 = unfiltered pixels
            punpcklbw   mm1,        mm0

            paddw       mm1,        mm5     ; pixel + window sum
            mov         rcx,        rdx

            and         rcx,        127     ; vp9_rv index wraps at 128
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
            push        rax
            lea         rax,        [GLOBAL(sym(vp9_rv))]
            movq        mm4,        [rax + rcx*2] ;vp9_rv[rcx*2]
            pop         rax
%elif ABI_IS_32BIT=0
            movq        mm4,        [r8 + rcx*2] ;vp9_rv[rcx*2]
%else
            movq        mm4,        [sym(vp9_rv) + rcx*2]
%endif
            paddw       mm1,        mm4
            psraw       mm1,        4

            packuswb    mm1,        mm0
            pand        mm1,        mm3     ; filtered value where the mask is set

            pandn       mm3,        mm2     ; original value elsewhere
            por         mm1,        mm3

            and         rcx,        15
            movd        DWORD PTR   [rsp+rcx*4], mm1 ;d[rcx*4]

            mov         rcx,        rdx
            sub         rcx,        8

            and         rcx,        15
            movd        mm1,        DWORD PTR [rsp+rcx*4] ;d[rcx*4], result from 8 rows ago

            movd        [rsi],      mm1     ; write it to the row leaving the window
            lea         rsi,        [rsi+rax]

            lea         rdi,        [rdi+rax]
            add         rdx,        1

            cmp         edx,        dword arg(2) ;rows
            jl          .loop_row


        ; s += 4.  arg(0) holds a pointer, so it is advanced at full register
        ; width; a dword-sized add would lose the carry into the upper half
        ; on 64-bit builds.  rsi is free here (reloaded at .loop_col).
        mov         rsi,        arg(0)
        add         rsi,        4
        mov         arg(0),     rsi
        sub         dword arg(3), 4 ; cols -= 4
        cmp         dword arg(3), 0
        jg          .loop_col

    add         rsp, 136
    pop         rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit2


;void vp9_plane_add_noise_mmx (unsigned char *start, unsigned char *noise,
;                            unsigned char blackclamp[16],
;                            unsigned char whiteclamp[16],
;                            unsigned char bothclamp[16],
;                            unsigned int width, unsigned int height, int pitch)
;
; Adds noise to a plane in place, one row at a time.  For each row a starting
; offset of rand() & 0xff into the noise buffer is chosen, then the row is
; processed eight bytes per step: the pixels are clamped with the three clamp
; vectors and the noise bytes are added with wrapping byte adds.
;
; The set loop tests its count after the body, so width is effectively
; rounded up to a multiple of eight and at least one set is processed per row.
; The row loop likewise runs at least once.  arg(0) and arg(6) are modified
; in place as loop state.
; NOTE(review): whiteclamp and bothclamp are never read through their own
; arguments; they are addressed as blackclamp+16 and blackclamp+32.
extern sym(rand)
global sym(vp9_plane_add_noise_mmx) PRIVATE
sym(vp9_plane_add_noise_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

.addnoise_loop:
    call sym(rand) WRT_PLT
    mov     rcx, arg(1) ;noise
    and     rax, 0xff               ; random start offset, 0..255
    add     rcx, rax

    ; we rely on the fact that the clamping vectors are stored contiguously
    ; in black/white/both order. Note that we have to reload this here because
    ; rdx could be trashed by rand()
    mov     rdx, arg(2) ; blackclamp


            mov     rdi, rcx                ; rdi = noise + random offset
            ; width is unsigned: zero-extend it and compare unsigned below
            mov     ecx, dword arg(5) ;[Width]
            mov     rsi, arg(0) ;Pos
            xor         rax,rax             ; rax = byte offset within the row

.addnoise_nextset:
            movq        mm1,[rsi+rax]         ; get the source

            psubusb     mm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
            paddusb     mm1, [rdx+32] ;bothclamp
            psubusb     mm1, [rdx+16] ;whiteclamp

            movq        mm2,[rdi+rax]         ; get the noise for this set
            paddb       mm1,mm2              ; add it in
            movq        [rsi+rax],mm1         ; store the result

            add         rax,8                 ; move to the next eight pixels

            cmp         rax, rcx
            jb          .addnoise_nextset

    movsxd  rax, dword arg(7) ; Pitch
    add     arg(0), rax ; Start += Pitch
    sub     dword arg(6), 1   ; Height -= 1
    jg      .addnoise_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
align 16
; Blur filter taps for vp9_post_proc_down_and_across_mmx.  Each tap is a
; 16-byte entry of identical words; the MMX code loads the first 8 bytes of
; the entries at offsets 0, 16, 32, 48 and 64, i.e. weights 16, 16, 64, 16,
; 16 (sum 128).  The trailing zero entry is not read by the code visible in
; this file.
Blur:
    times 16 dw 16
    times  8 dw 64
    times 16 dw 16
    times  8 dw  0

; Rounding term added to the weighted sum before the shift by
; VP9_FILTER_SHIFT: 0x40 in each of four words.
rd:
    times 4 dw 0x40
535