;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


; ABI abstraction macros used throughout this file (sym, arg,
; SHADOW_ARGS_TO_STACK, GET_GOT, GLOBAL, ...) come from this include.
%include "vpx_ports/x86_abi_support.asm"

; NOTE(review): BLOCK_HEIGHT_WIDTH and vp8_filter_weight are not referenced
; by the code visible in this file; kept in case other code relies on them.
%define BLOCK_HEIGHT_WIDTH 4
%define vp8_filter_weight 128
; Arithmetic right shift applied to the rounded tap sum (i.e. divide by 128).
%define VP8_FILTER_SHIFT  7

SECTION .text

;void vp8_filter_block1d_h6_mmx
;(
;    unsigned char   *src_ptr,
;    unsigned short  *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           * vp8_filter
;)
;
; Horizontal six-tap filter, first pass. For every row it reads the bytes
; src_ptr[-2 .. 6], produces four filtered pixels and stores them as four
; 16-bit words at output_ptr.
;
;   out[x] = clamp_0_255((sum(t = 0..5) tap[t] * p[x + t - 2] + rd) >> 7)
;
; Notes on the arguments as this routine actually uses them:
;   src_pixels_per_line  byte stride added to src_ptr after each row
;   pixel_step           not read by this routine
;   output_height        number of rows; the loop tests its counter at the
;                        bottom, so the value must be non-zero
;   output_width         added to output_ptr after each row as a BYTE offset
;   vp8_filter           six taps, 16 bytes apart; the first four words of
;                        each tap are used
;
; Register roles inside the loop:
;   rsi = source row          rdi = destination row
;   rcx = rows remaining      rax = destination step (bytes)
;   rdx = vp8_filter          r8  = source stride (non-32-bit builds only)
;   mm0 = 0                   mm1/mm2/mm6/mm7 = taps 1/2/3/4
;   mm3 = accumulator         mm4/mm5 = scratch
; Taps 0 and 5 and the rounding constant are read from memory each row
; because all eight MMX registers are already in use.
global sym(vp8_filter_block1d_h6_mmx) PRIVATE
sym(vp8_filter_block1d_h6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdx,    arg(6) ;vp8_filter

        movq        mm1,    [rdx + 16]       ; mm1 = tap 1
        movq        mm2,    [rdx + 32]       ; mm2 = tap 2
        movq        mm6,    [rdx + 48]       ; mm6 = tap 3
        movq        mm7,    [rdx + 64]       ; mm7 = tap 4

        mov         rdi,    arg(1) ;output_ptr
        mov         rsi,    arg(0) ;src_ptr
        movsxd      rcx,    dword ptr arg(4) ;output_height
        movsxd      rax,    dword ptr arg(5) ;output_width, used as the destination step in bytes
%if ABI_IS_32BIT
        ; r8 does not exist here; the source stride is added straight from
        ; its argument slot at the bottom of the loop instead.
%else
        movsxd      r8,     dword ptr arg(2) ;src_pixels_per_line (loop invariant, loaded once)
%endif
        pxor        mm0,    mm0              ; mm0 = 0, used to widen bytes to words

.nextrow:
        movq        mm3,    [rsi-2]          ; mm3 = bytes p-2..p5
        movq        mm4,    mm3              ; mm4 = bytes p-2..p5
        psrlq       mm3,    8                ; drop p-2: mm3 starts at p-1
        punpcklbw   mm3,    mm0              ; mm3 = words p-1..p2
        pmullw      mm3,    mm1              ; mm3 *= tap 1

        movq        mm5,    mm4              ; mm5 = bytes p-2..p5 (saved copy)
        punpckhbw   mm4,    mm0              ; mm4 = words p2..p5
        pmullw      mm4,    mm7              ; mm4 *= tap 4
        paddsw      mm3,    mm4              ; mm3 += mm4 (signed saturating)

        movq        mm4,    mm5              ; mm4 = bytes p-2..p5 (saved copy)
        psrlq       mm5,    16               ; drop p-2,p-1: mm5 starts at p0
        punpcklbw   mm5,    mm0              ; mm5 = words p0..p3
        pmullw      mm5,    mm2              ; mm5 *= tap 2
        paddsw      mm3,    mm5              ; mm3 += mm5

        movq        mm5,    mm4              ; mm5 = bytes p-2..p5 (saved copy)
        psrlq       mm4,    24               ; drop p-2..p0: mm4 starts at p1
        punpcklbw   mm4,    mm0              ; mm4 = words p1..p4
        pmullw      mm4,    mm6              ; mm4 *= tap 3
        paddsw      mm3,    mm4              ; mm3 += mm4

        ; outer taps (0 and 5), multiplied straight from the filter table
        movd        mm4,    [rsi+3]          ; mm4 = bytes p3..p6
        punpcklbw   mm4,    mm0              ; mm4 = words p3..p6
        pmullw      mm4,    [rdx+80]         ; mm4 *= tap 5
        paddsw      mm3,    mm4              ; mm3 += mm4

        punpcklbw   mm5,    mm0              ; mm5 = words p-2..p1
        pmullw      mm5,    [rdx]            ; mm5 *= tap 0
        paddsw      mm3,    mm5              ; mm3 += mm5

        paddsw      mm3,    [GLOBAL(rd)]     ; mm3 += rounding constant
        psraw       mm3,    VP8_FILTER_SHIFT ; mm3 >>= 7 (arithmetic)
        packuswb    mm3,    mm0              ; clamp each word to 0..255 ...
        punpcklbw   mm3,    mm0              ; ... and widen back to four words

        movq        [rdi],  mm3              ; store four 16-bit results

%if ABI_IS_32BIT
        add         rsi,    dword ptr arg(2) ;src_pixels_per_line ; next source row
%else
        add         rsi,    r8               ; next source row
%endif
        add         rdi,    rax              ; next destination row

        dec         rcx                      ; one row done
        jnz         .nextrow                 ; loop while rows remain

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1dc_v6_mmx
;(
;   short *src_ptr,
;   unsigned char *output_ptr,
;    int output_pitch,
;   unsigned int pixels_per_line,
;   unsigned int pixel_step,
;   unsigned int output_height,
;   unsigned int output_width,
;   short * vp8_filter
;)
;
; Vertical six-tap filter, second pass. Each iteration combines six source
; rows of four 16-bit values (rows -2 .. +3 relative to the current row) and
; stores four clamped 8-bit pixels at output_ptr.
;
;   out[x] = clamp_0_255((sum(t = 0..5) tap[t] * row[t - 2][x] + rd) >> 7)
;
; Notes on the arguments as this routine actually uses them:
;   pixels_per_line  added to src_ptr as a BYTE offset between rows
;                    (NOTE(review): despite the name, callers therefore have
;                    to pass the row size in bytes - confirm against callers)
;   output_pitch     byte step added to output_ptr after each row
;   output_height    number of rows; the loop tests its counter at the
;                    bottom, so the value must be non-zero
;   pixel_step, output_width   not read by this routine
;   vp8_filter       six taps, 16 bytes apart; the first four words of each
;                    tap are used
;
; Register roles inside the loop:
;   rsi = source row -2       rdi = destination row
;   rdx = source row step     rax = destination step
;   rcx = rows remaining      rbx = vp8_filter
;   mm0 = 0                   mm1/mm2/mm6/mm7 = taps 1/2/3/4
;   mm5 = rounding constant   mm3 = accumulator, mm4 = scratch
; Taps 0 and 5 are read from memory on every row.
global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
sym(vp8_filter_block1dc_v6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        ; Fetch the rounding constant through GLOBAL() first: rbx is
        ; overwritten with the filter pointer right after, so this load has
        ; to happen before that. The push/pop pair preserves whatever rbx
        ; held at this point for the epilog.
        movq      mm5, [GLOBAL(rd)]
        push        rbx
        mov         rbx, arg(7) ;vp8_filter
        movq      mm1, [rbx + 16]         ; mm1 = tap 1
        movq      mm2, [rbx + 32]         ; mm2 = tap 2
        movq      mm6, [rbx + 48]         ; mm6 = tap 3
        movq      mm7, [rbx + 64]         ; mm7 = tap 4

        movsxd      rdx, dword ptr arg(3) ;pixels_per_line
        mov         rdi, arg(1) ;output_ptr
        mov         rsi, arg(0) ;src_ptr
        sub         rsi, rdx
        sub         rsi, rdx              ; rsi = src_ptr - 2 rows (row -2)
        movsxd      rcx, DWORD PTR arg(5) ;output_height
        movsxd      rax, DWORD PTR arg(2) ;output_pitch
        pxor        mm0, mm0              ; mm0 = 0, upper half for packuswb


.nextrow_cv:
        movq        mm3, [rsi+rdx]        ; mm3 = four words of row -1
        pmullw      mm3, mm1              ; mm3 *= tap 1


        movq        mm4, [rsi + 4*rdx]    ; mm4 = four words of row 2
        pmullw      mm4, mm7              ; mm4 *= tap 4
        paddsw      mm3, mm4              ; mm3 += mm4 (signed saturating)

        movq        mm4, [rsi + 2*rdx]    ; mm4 = four words of row 0
        pmullw      mm4, mm2              ; mm4 *= tap 2
        paddsw      mm3, mm4              ; mm3 += mm4

        movq        mm4, [rsi]            ; mm4 = four words of row -2
        pmullw      mm4, [rbx]            ; mm4 *= tap 0
        paddsw      mm3, mm4              ; mm3 += mm4


        add         rsi, rdx              ; step one row now so rows 1 and 3 need no 3*/5* scale
        movq        mm4, [rsi + 2*rdx]    ; mm4 = four words of row 1
        pmullw      mm4, mm6              ; mm4 *= tap 3
        paddsw      mm3, mm4              ; mm3 += mm4

        movq        mm4, [rsi + 4*rdx]    ; mm4 = four words of row 3
        pmullw      mm4, [rbx +80]        ; mm4 *= tap 5
        paddsw      mm3, mm4              ; mm3 += mm4


        paddsw      mm3, mm5              ; mm3 += rounding constant
        psraw       mm3, VP8_FILTER_SHIFT ; mm3 >>= 7 (arithmetic)
        packuswb    mm3, mm0              ; clamp each word to 0..255, pack to bytes

        movd        [rdi],mm3             ; store four 8-bit pixels
        ; Each iteration re-reads five of the six source rows used by the
        ; previous one. The data should still be in cache, so this is cheap,
        ; but it is avoidable.
        lea         rdi,  [rdi+rax]       ; next destination row
        dec         rcx                   ; one row done
        jnz         .nextrow_cv           ; loop while rows remain

        pop         rbx

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
align 16
; Rounding constant added to the tap sum before the VP8_FILTER_SHIFT (7)
; right shift: four words of 0x40 (64, half of 128).
rd:
    times 4 dw 0x40

align 16
; Six-tap filter coefficient table: 8 filters x 6 taps.
; Every tap value is replicated into 8 words (16 bytes), so one filter
; occupies 96 bytes and its taps sit at byte offsets 0, 16, 32, 48, 64, 80 -
; the same spacing the filter routines in this file use to index vp8_filter.
; The six taps of every filter sum to 128.
; NOTE(review): presumably callers pass a pointer to one 96-byte filter of
; this table as the vp8_filter argument - confirm against the C callers.
global HIDDEN_DATA(sym(vp8_six_tap_x86))
sym(vp8_six_tap_x86):
    ; filter 0: single tap of 128 at position 2
    times 8 dw 0
    times 8 dw 0
    times 8 dw 128
    times 8 dw 0
    times 8 dw 0
    times 8 dw 0

    ; filter 1
    times 8 dw 0
    times 8 dw -6
    times 8 dw 123
    times 8 dw 12
    times 8 dw -1
    times 8 dw 0

    ; filter 2
    times 8 dw 2
    times 8 dw -11
    times 8 dw 108
    times 8 dw 36
    times 8 dw -8
    times 8 dw 1

    ; filter 3
    times 8 dw 0
    times 8 dw -9
    times 8 dw 93
    times 8 dw 50
    times 8 dw -6
    times 8 dw 0

    ; filter 4: symmetric
    times 8 dw 3
    times 8 dw -16
    times 8 dw 77
    times 8 dw 77
    times 8 dw -16
    times 8 dw 3

    ; filter 5: filter 3 with the tap order reversed
    times 8 dw 0
    times 8 dw -6
    times 8 dw 50
    times 8 dw 93
    times 8 dw -9
    times 8 dw 0

    ; filter 6: filter 2 with the tap order reversed
    times 8 dw 1
    times 8 dw -8
    times 8 dw 36
    times 8 dw 108
    times 8 dw -11
    times 8 dw 2

    ; filter 7: filter 1 with the tap order reversed
    times 8 dw 0
    times 8 dw -1
    times 8 dw 12
    times 8 dw 123
    times 8 dw -6
    times 8 dw 0


