1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
;------------------------------------------------------------------------------
; void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
;                              short *diff, unsigned char *Predictor,
;                              int pitch);
;
; 4x4 block residual: for each of four rows, loads 4 bytes from z and 4 bytes
; from Predictor, widens both to 16-bit words and stores the four word
; differences (z - Predictor) to diff.
;
; Row spacing:
;   z          src_stride bytes
;   Predictor  pitch bytes
;   diff       pitch shorts (2 * pitch bytes)
;
; Scratch: rax, rcx, rdx, mm0, mm1, mm7 (rsi/rdi are saved and restored).
; MMX state is left as-is; no emms is executed here.
;------------------------------------------------------------------------------
global sym(vp8_subtract_b_mmx_impl) PRIVATE
sym(vp8_subtract_b_mmx_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; rsi -> current z row
    movsxd      rdx, dword ptr arg(1)       ; rdx  = src_stride
    mov         rdi, arg(2)                 ; rdi -> current diff row
    mov         rax, arg(3)                 ; rax -> current Predictor row
    movsxd      rcx, dword ptr arg(4)       ; rcx  = pitch
    pxor        mm7, mm7                    ; all-zero, supplies the high bytes
                                            ; when widening byte -> word

    ; ---- row 0 ----
    movd        mm0, [rsi]                  ; 4 source bytes
    movd        mm1, [rax]                  ; 4 predictor bytes
    punpcklbw   mm0, mm7
    punpcklbw   mm1, mm7
    psubw       mm0, mm1                    ; source - predictor, 4 words
    movq        [rdi], mm0

    add         rsi, rdx                    ; z         += src_stride
    add         rax, rcx                    ; Predictor += pitch
    lea         rdi, [rdi+rcx*2]            ; diff      += pitch shorts

    ; ---- row 1 ----
    movd        mm0, [rsi]
    movd        mm1, [rax]
    punpcklbw   mm0, mm7
    punpcklbw   mm1, mm7
    psubw       mm0, mm1
    movq        [rdi], mm0

    add         rsi, rdx
    add         rax, rcx
    lea         rdi, [rdi+rcx*2]

    ; ---- row 2 ----
    movd        mm0, [rsi]
    movd        mm1, [rax]
    punpcklbw   mm0, mm7
    punpcklbw   mm1, mm7
    psubw       mm0, mm1
    movq        [rdi], mm0

    add         rsi, rdx
    add         rax, rcx
    lea         rdi, [rdi+rcx*2]

    ; ---- row 3 (last row, no pointer step needed) ----
    movd        mm0, [rsi]
    movd        mm1, [rax]
    punpcklbw   mm0, mm7
    punpcklbw   mm1, mm7
    psubw       mm0, mm1
    movq        [rdi], mm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
75
;------------------------------------------------------------------------------
; void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride,
;                           unsigned char *pred, int pred_stride);
;
; 16x16 residual: runs 16 rows; each row reads 16 bytes from src and 16 bytes
; from pred, widens them to 16-bit words and stores the 16 word differences
; (src - pred) contiguously to diff (32 bytes per row, rows back to back).
;
; Row spacing: src advances by src_stride bytes, pred by pred_stride bytes.
;
; Scratch: rax, rcx, rdx, mm0-mm3, mm7. rsi, rdi and rbx are saved and
; restored. No emms is executed here.
;------------------------------------------------------------------------------
global sym(vp8_subtract_mby_mmx) PRIVATE
sym(vp8_subtract_mby_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    mov         rdi, arg(0)                 ; rdi -> diff output
    mov         rsi, arg(1)                 ; rsi -> current src row
    movsxd      rdx, dword ptr arg(2)       ; rdx  = src_stride
    mov         rax, arg(3)                 ; rax -> current pred row
    movsxd      rbx, dword ptr arg(4)       ; rbx  = pred_stride

    pxor        mm7, mm7                    ; zero register for byte -> word
    mov         rcx, 16                     ; rows remaining

.next_row:
    ; columns 0..7
    movq        mm0, [rsi]
    movq        mm2, [rax]
    movq        mm1, mm0                    ; keep a copy for the high half
    movq        mm3, mm2
    punpcklbw   mm0, mm7                    ; src  cols 0..3 as words
    punpckhbw   mm1, mm7                    ; src  cols 4..7 as words
    punpcklbw   mm2, mm7                    ; pred cols 0..3 as words
    punpckhbw   mm3, mm7                    ; pred cols 4..7 as words
    psubw       mm0, mm2
    psubw       mm1, mm3
    movq        [rdi], mm0
    movq        [rdi+8], mm1

    ; columns 8..15
    movq        mm0, [rsi+8]
    movq        mm2, [rax+8]
    movq        mm1, mm0
    movq        mm3, mm2
    punpcklbw   mm0, mm7
    punpckhbw   mm1, mm7
    punpcklbw   mm2, mm7
    punpckhbw   mm3, mm7
    psubw       mm0, mm2
    psubw       mm1, mm3
    movq        [rdi+16], mm0
    movq        [rdi+24], mm1

    ; advance to the next row
    add         rsi, rdx                    ; src  += src_stride
    add         rax, rbx                    ; pred += pred_stride
    add         rdi, 32                     ; diff += 16 shorts
    dec         rcx
    jnz         .next_row

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
147
148
;------------------------------------------------------------------------------
; void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc,
;                            unsigned char *vsrc, int src_stride,
;                            unsigned char *upred, unsigned char *vpred,
;                            int pred_stride);
;
; Two 8x8 residuals written back to back, starting 256 shorts into diff:
;   first  8 rows: usrc - upred
;   next   8 rows: vsrc - vpred
; Each row reads 8 bytes from the source plane and 8 bytes from the matching
; prediction, widens them to 16-bit words and stores the 8 word differences
; (16 bytes) contiguously.
;
; Row spacing: both source planes advance by src_stride bytes, both
; prediction planes by pred_stride bytes.
;
; Scratch: rax, rcx, rdx, mm0-mm3, mm7. rsi, rdi and rbx are saved and
; restored. No emms is executed here.
;------------------------------------------------------------------------------
global sym(vp8_subtract_mbuv_mmx) PRIVATE
sym(vp8_subtract_mbuv_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    mov         rdi, arg(0)                 ; rdi -> diff
    add         rdi, 256*2                  ; skip the first 256 shorts of diff
    movsxd      rdx, dword ptr arg(3)       ; rdx  = src_stride
    movsxd      rbx, dword ptr arg(6)       ; rbx  = pred_stride
    pxor        mm7, mm7                    ; zero register for byte -> word

    ; ---- U plane ----
    mov         rsi, arg(1)                 ; rsi -> current usrc row
    mov         rax, arg(4)                 ; rax -> current upred row
    mov         rcx, 8                      ; rows remaining

.u_row:
    movq        mm0, [rsi]
    movq        mm2, [rax]
    movq        mm1, mm0                    ; copy for the high four bytes
    movq        mm3, mm2
    punpcklbw   mm0, mm7                    ; src  cols 0..3 as words
    punpckhbw   mm1, mm7                    ; src  cols 4..7 as words
    punpcklbw   mm2, mm7                    ; pred cols 0..3 as words
    punpckhbw   mm3, mm7                    ; pred cols 4..7 as words
    psubw       mm0, mm2
    psubw       mm1, mm3
    movq        [rdi], mm0
    movq        [rdi+8], mm1

    add         rsi, rdx                    ; usrc  += src_stride
    add         rax, rbx                    ; upred += pred_stride
    add         rdi, 16                     ; diff  += 8 shorts
    dec         rcx
    jnz         .u_row

    ; ---- V plane (rdi keeps running, so V follows U in diff) ----
    mov         rsi, arg(2)                 ; rsi -> current vsrc row
    mov         rax, arg(5)                 ; rax -> current vpred row
    mov         rcx, 8                      ; rows remaining

.v_row:
    movq        mm0, [rsi]
    movq        mm2, [rax]
    movq        mm1, mm0
    movq        mm3, mm2
    punpcklbw   mm0, mm7
    punpckhbw   mm1, mm7
    punpcklbw   mm2, mm7
    punpckhbw   mm3, mm7
    psubw       mm0, mm2
    psubw       mm1, mm3
    movq        [rdi], mm0
    movq        [rdi+8], mm1

    add         rsi, rdx                    ; vsrc  += src_stride
    add         rax, rbx                    ; vpred += pred_stride
    add         rdi, 16                     ; diff  += 8 shorts
    dec         rcx
    jnz         .v_row

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
224