1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
14SECTION .text
15
;int vp8_block_error_sse2(short *coeff_ptr,  short *dcoef_ptr)
;
; Sum of squared differences between two blocks of 16 shorts.
;   arg(0) coeff_ptr - first block,  read with movdqa (must be 16-byte aligned)
;   arg(1) dcoef_ptr - second block, read with movdqa/psubw (16-byte aligned)
;   returns (rax)    - sum of (coeff[i] - dcoef[i])^2 for i = 0..15; the
;                      differences wrap at 16 bits (psubw) and the total is
;                      accumulated in 32-bit lanes, then zero-extended.
; Scratch: xmm0-xmm2. rsi/rdi are saved and restored.
global sym(vp8_block_error_sse2) PRIVATE
sym(vp8_block_error_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ; coeff_ptr
    mov         rdi, arg(1)             ; dcoef_ptr

    ; coefficients 0..7: difference, then square and add adjacent pairs
    movdqa      xmm0, [rsi]
    psubw       xmm0, [rdi]
    pmaddwd     xmm0, xmm0

    ; coefficients 8..15: same treatment
    movdqa      xmm1, [rsi+16]
    psubw       xmm1, [rdi+16]
    pmaddwd     xmm1, xmm1

    paddd       xmm0, xmm1              ; four 32-bit partial sums

    ; interleave the partial sums with zero so the upper dword of each
    ; 64-bit half is clear, then fold 4 -> 2 lanes
    pxor        xmm2, xmm2
    movdqa      xmm1, xmm0
    punpckldq   xmm0, xmm2
    punpckhdq   xmm1, xmm2
    paddd       xmm0, xmm1

    ; fold 2 -> 1: add the high quadword onto the low quadword
    movdqa      xmm1, xmm0
    psrldq      xmm1, 8
    paddd       xmm0, xmm1

    movq        rax, xmm0               ; result

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
63
;int vp8_mbblock_error_sse2_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
;
; Sum of squared differences between coeff_ptr[] and dcoef_ptr[] over 16
; consecutive groups of 16 shorts (32 bytes per group, 256 shorts in total).
;   arg(0) coeff_ptr - first buffer,  read with movdqa (16-byte aligned)
;   arg(1) dcoef_ptr - second buffer, read with movdqa (16-byte aligned)
;   arg(2) dc        - when the low 16 bits are non-zero, the first
;                      coefficient of every group is excluded from the sum;
;                      when they are zero it is included.
;                      NOTE(review): bits 16..31 of dc gate the second
;                      coefficient in the same way, so callers are presumably
;                      expected to pass only 0 or 1 - confirm against caller.
;   returns (rax)    - the sum, accumulated in 32-bit lanes and zero-extended.
;
; Only volatile registers are used as scratch (xmm0-xmm5, rcx), so no
; SAVE_XMM/RESTORE_XMM pair is required. rsi/rdi are saved and restored.
global sym(vp8_mbblock_error_sse2_impl) PRIVATE
sym(vp8_mbblock_error_sse2_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    push rsi
    push rdi
    ; end prolog


        mov         rsi,        arg(0) ;coeff_ptr
        mov         rdi,        arg(1) ;dcoef_ptr

        pxor        xmm4,       xmm4   ;four 32-bit running sums
        pxor        xmm0,       xmm0   ;temporary zero for the compare below

        ; Build the per-word keep mask: movd clears every word above the
        ; low dword, so those compare equal to zero and become 0xFFFF (keep).
        ; Word 0 becomes 0xFFFF only if the low word of dc is zero.
        movd        xmm5,       dword ptr arg(2) ;dc
        pcmpeqw     xmm5,       xmm0

        mov         rcx,        16     ;number of 16-coefficient groups

.mberror_loop:
        movdqa      xmm0,       [rsi]
        movdqa      xmm1,       [rdi]

        movdqa      xmm2,       [rsi+16]
        movdqa      xmm3,       [rdi+16]

        ; coefficients 8..15 of the group: never masked
        psubw       xmm2,       xmm3
        pmaddwd     xmm2,       xmm2

        ; coefficients 0..7 of the group: apply the dc mask before squaring
        psubw       xmm0,       xmm1
        pand        xmm0,       xmm5
        pmaddwd     xmm0,       xmm0

        paddd       xmm4,       xmm2
        paddd       xmm4,       xmm0

        add         rsi,        32
        add         rdi,        32

        sub         rcx,        1
        jnz         .mberror_loop

        ; Horizontal reduction: interleave with zero so the upper dword of
        ; each 64-bit half is clear, fold 4 -> 2 lanes, then 2 -> 1.
        pxor        xmm1,       xmm1
        movdqa      xmm0,       xmm4
        punpckldq   xmm0,       xmm1
        punpckhdq   xmm4,       xmm1
        paddd       xmm0,       xmm4

        movdqa      xmm1,       xmm0
        psrldq      xmm0,       8
        paddd       xmm0,       xmm1

        movq        rax,        xmm0

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
132
133
;int vp8_mbuverror_sse2_impl(short *s_ptr, short *d_ptr);
;
; Sum of squared differences between s_ptr[] and d_ptr[] over 16 iterations
; of 8 shorts each (128 shorts, 256 bytes per buffer).
;   arg(0) s_ptr  - first buffer,  read with movdqa (16-byte aligned)
;   arg(1) d_ptr  - second buffer, read via psubw memory operand (16-byte aligned)
;   returns (rax) - the sum, accumulated in 32-bit lanes and zero-extended.
; Scratch: xmm0-xmm2, rcx. rsi/rdi are saved and restored.
global sym(vp8_mbuverror_sse2_impl) PRIVATE
sym(vp8_mbuverror_sse2_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ; s_ptr
    mov         rdi, arg(1)             ; d_ptr

    pxor        xmm0, xmm0              ; four 32-bit running sums
    mov         rcx, 16                 ; 16 chunks of 8 shorts

.uv_next_chunk:
    movdqa      xmm1, [rsi]
    psubw       xmm1, [rdi]             ; 16-bit differences
    pmaddwd     xmm1, xmm1              ; square, add adjacent pairs
    paddd       xmm0, xmm1

    add         rsi, 16
    add         rdi, 16

    sub         rcx, 1
    jnz         .uv_next_chunk

    ; interleave the sums with zero so the upper dword of each 64-bit half
    ; is clear, then fold 4 -> 2 lanes
    pxor        xmm2, xmm2
    movdqa      xmm1, xmm0
    punpckldq   xmm0, xmm2
    punpckhdq   xmm1, xmm2
    paddd       xmm0, xmm1

    ; fold 2 -> 1: add the high quadword onto the low quadword
    movdqa      xmm1, xmm0
    psrldq      xmm1, 8
    paddd       xmm0, xmm1

    movq        rax, xmm0               ; result

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
189