1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
14global sym(vp9_sad16x16_mmx) PRIVATE
15global sym(vp9_sad8x16_mmx) PRIVATE
16global sym(vp9_sad8x8_mmx) PRIVATE
17global sym(vp9_sad4x4_mmx) PRIVATE
18global sym(vp9_sad16x8_mmx) PRIVATE
19
;-----------------------------------------------------------------------
; unsigned int vp9_sad16x16_mmx(
;     unsigned char *src_ptr,
;     int  src_stride,
;     unsigned char *ref_ptr,
;     int  ref_stride)
;
; Sum of absolute differences between 16 bytes per row of src_ptr and
; ref_ptr, over 16 rows (for a non-zero src_stride).
;
; Register roles:
;   rsi = current source row         rax = src_stride (sign-extended)
;   rdi = current reference row      rdx = ref_stride (sign-extended)
;   rcx = src_ptr + 16 * src_stride  (loop ends when rsi reaches it)
;   mm6 = zero (unpack helper)       mm7 = four 16-bit partial sums
;
; The total is returned in the low 32 bits of rax.  No emms is executed
; in this routine.
;-----------------------------------------------------------------------
sym(vp9_sad16x16_mmx):
    push    rbp
    mov     rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push    rsi
    push    rdi

    ; ---- fetch arguments ------------------------------------------------
    mov     rsi, arg(0)                     ; src_ptr
    mov     rdi, arg(2)                     ; ref_ptr
    movsxd  rax, dword ptr arg(1)           ; src_stride
    movsxd  rdx, dword ptr arg(3)           ; ref_stride

    ; ---- clear helpers, compute end-of-block sentinel -------------------
    pxor    mm6, mm6                        ; constant zero
    pxor    mm7, mm7                        ; running sums
    lea     rcx, [rsi+rax*8]                ; src + 8 * stride
    lea     rcx, [rcx+rax*8]                ; src + 16 * stride

.sad16x16_next_row:
    ; left 8 bytes: |src - ref| = sat(src - ref) | sat(ref - src)
    movq    mm0, [rsi]
    movq    mm1, [rdi]
    movq    mm4, mm0                        ; keep a copy of src
    psubusb mm0, mm1
    psubusb mm1, mm4
    por     mm0, mm1
    movq    mm1, mm0
    punpcklbw mm0, mm6                      ; low 4 diffs  -> words
    punpckhbw mm1, mm6                      ; high 4 diffs -> words

    ; right 8 bytes, same recipe
    movq    mm2, [rsi+8]
    movq    mm3, [rdi+8]
    movq    mm5, mm2
    psubusb mm2, mm3
    psubusb mm3, mm5
    por     mm2, mm3
    movq    mm3, mm2
    punpcklbw mm2, mm6
    punpckhbw mm3, mm6

    ; fold the four word vectors into the accumulator
    paddw   mm0, mm2
    paddw   mm1, mm3
    paddw   mm7, mm0
    paddw   mm7, mm1

    ; step to the next row of both blocks
    add     rsi, rax
    add     rdi, rdx
    cmp     rsi, rcx
    jne     .sad16x16_next_row

    ; ---- horizontal reduction of the four word lanes --------------------
    movq    mm0, mm7
    punpcklwd mm0, mm6                      ; lanes 0,1 -> dwords
    punpckhwd mm7, mm6                      ; lanes 2,3 -> dwords
    paddw   mm0, mm7                        ; two pairwise sums
    movq    mm7, mm0
    psrlq   mm0, 32                         ; bring the upper pair sum down
    paddw   mm7, mm0                        ; low dword = full total
    movq    rax, mm7                        ; caller reads the low 32 bits

    ; ---- restore registers and return -----------------------------------
    pop     rdi
    pop     rsi
    mov     rsp, rbp
    UNSHADOW_ARGS
    pop     rbp
    ret
110
111
;-----------------------------------------------------------------------
; unsigned int vp9_sad8x16_mmx(
;     unsigned char *src_ptr,
;     int  src_stride,
;     unsigned char *ref_ptr,
;     int  ref_stride)
;
; Sum of absolute differences between 8 bytes per row of src_ptr and
; ref_ptr, over 16 rows (for a non-zero src_stride).
;
; Register roles:
;   rsi = current source row         rax = src_stride (sign-extended)
;   rdi = current reference row      rdx = ref_stride (sign-extended)
;   rcx = src_ptr + 16 * src_stride  (loop ends when rsi reaches it)
;   mm6 = zero (unpack helper)       mm7 = four 16-bit partial sums
;
; The total is returned in the low 32 bits of rax.  No emms is executed
; in this routine.
;-----------------------------------------------------------------------
sym(vp9_sad8x16_mmx):
    push    rbp
    mov     rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push    rsi
    push    rdi

    ; ---- fetch arguments ------------------------------------------------
    mov     rsi, arg(0)                     ; src_ptr
    mov     rdi, arg(2)                     ; ref_ptr
    movsxd  rax, dword ptr arg(1)           ; src_stride
    movsxd  rdx, dword ptr arg(3)           ; ref_stride

    ; ---- clear helpers, compute end-of-block sentinel -------------------
    pxor    mm6, mm6                        ; constant zero
    pxor    mm7, mm7                        ; running sums
    lea     rcx, [rsi+rax*8]                ; src + 8 * stride
    lea     rcx, [rcx+rax*8]                ; src + 16 * stride

.sad8x16_next_row:
    ; |src - ref| = sat(src - ref) | sat(ref - src), 8 bytes at a time
    movq    mm0, [rsi]
    movq    mm1, [rdi]
    movq    mm2, mm0                        ; keep a copy of src
    psubusb mm0, mm1
    psubusb mm1, mm2
    por     mm0, mm1

    ; widen the byte differences to words and accumulate
    movq    mm2, mm0
    punpcklbw mm0, mm6
    punpckhbw mm2, mm6
    paddw   mm7, mm0
    paddw   mm7, mm2

    ; step to the next row of both blocks
    add     rsi, rax
    add     rdi, rdx
    cmp     rsi, rcx
    jne     .sad8x16_next_row

    ; ---- horizontal reduction of the four word lanes --------------------
    movq    mm0, mm7
    punpcklwd mm0, mm6                      ; lanes 0,1 -> dwords
    punpckhwd mm7, mm6                      ; lanes 2,3 -> dwords
    paddw   mm0, mm7                        ; two pairwise sums
    movq    mm7, mm0
    psrlq   mm0, 32                         ; bring the upper pair sum down
    paddw   mm7, mm0                        ; low dword = full total
    movq    rax, mm7                        ; caller reads the low 32 bits

    ; ---- restore registers and return -----------------------------------
    pop     rdi
    pop     rsi
    mov     rsp, rbp
    UNSHADOW_ARGS
    pop     rbp
    ret
182
183
;-----------------------------------------------------------------------
; unsigned int vp9_sad8x8_mmx(
;     unsigned char *src_ptr,
;     int  src_stride,
;     unsigned char *ref_ptr,
;     int  ref_stride)
;
; Sum of absolute differences between 8 bytes per row of src_ptr and
; ref_ptr, over 8 rows (for a non-zero src_stride).
;
; Register roles:
;   rsi = current source row         rax = src_stride (sign-extended)
;   rdi = current reference row      rdx = ref_stride (sign-extended)
;   rcx = src_ptr + 8 * src_stride   (loop ends when rsi reaches it)
;   mm6 = zero (unpack helper)       mm7 = four 16-bit partial sums
;
; The total is returned in the low 32 bits of rax.  No emms is executed
; in this routine.
;-----------------------------------------------------------------------
sym(vp9_sad8x8_mmx):
    push    rbp
    mov     rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push    rsi
    push    rdi

    ; ---- fetch arguments ------------------------------------------------
    mov     rsi, arg(0)                     ; src_ptr
    mov     rdi, arg(2)                     ; ref_ptr
    movsxd  rax, dword ptr arg(1)           ; src_stride
    movsxd  rdx, dword ptr arg(3)           ; ref_stride

    ; ---- clear helpers, compute end-of-block sentinel -------------------
    pxor    mm6, mm6                        ; constant zero
    pxor    mm7, mm7                        ; running sums
    lea     rcx, [rsi+rax*8]                ; src + 8 * stride

.sad8x8_next_row:
    ; |src - ref| = sat(src - ref) | sat(ref - src), 8 bytes at a time
    movq    mm0, [rsi]
    movq    mm1, [rdi]
    movq    mm2, mm0                        ; keep a copy of src
    psubusb mm0, mm1
    psubusb mm1, mm2
    por     mm0, mm1

    ; widen to words, merge both halves, then accumulate
    movq    mm2, mm0
    punpcklbw mm0, mm6
    punpckhbw mm2, mm6
    paddw   mm0, mm2
    paddw   mm7, mm0

    ; step to the next row of both blocks
    add     rsi, rax
    add     rdi, rdx
    cmp     rsi, rcx
    jne     .sad8x8_next_row

    ; ---- horizontal reduction of the four word lanes --------------------
    movq    mm0, mm7
    punpcklwd mm0, mm6                      ; lanes 0,1 -> dwords
    punpckhwd mm7, mm6                      ; lanes 2,3 -> dwords
    paddw   mm0, mm7                        ; two pairwise sums
    movq    mm7, mm0
    psrlq   mm0, 32                         ; bring the upper pair sum down
    paddw   mm7, mm0                        ; low dword = full total
    movq    rax, mm7                        ; caller reads the low 32 bits

    ; ---- restore registers and return -----------------------------------
    pop     rdi
    pop     rsi
    mov     rsp, rbp
    UNSHADOW_ARGS
    pop     rbp
    ret
252
253
;-----------------------------------------------------------------------
; unsigned int vp9_sad4x4_mmx(
;     unsigned char *src_ptr,
;     int  src_stride,
;     unsigned char *ref_ptr,
;     int  ref_stride)
;
; Sum of absolute differences between 4 bytes per row of src_ptr and
; ref_ptr, over 4 rows.  Fully unrolled: two rows are interleaved into
; one 8-byte register so each pair of rows is handled in a single pass.
;
; Register roles:
;   rsi = source row pointer         rax = src_stride (sign-extended)
;   rdi = reference row pointer      rdx = ref_stride (sign-extended)
;   mm3 = zero once rows 0/1 have been packed
;   mm0 = word sums for rows 0/1, later the grand total
;   mm4 = word sums for rows 2/3
;
; The total is returned in the low 32 bits of rax.  No emms is executed
; in this routine.
;-----------------------------------------------------------------------
sym(vp9_sad4x4_mmx):
    push    rbp
    mov     rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push    rsi
    push    rdi

    ; ---- fetch arguments ------------------------------------------------
    mov     rsi, arg(0)                     ; src_ptr
    mov     rdi, arg(2)                     ; ref_ptr
    movsxd  rax, dword ptr arg(1)           ; src_stride
    movsxd  rdx, dword ptr arg(3)           ; ref_stride

    ; ---- rows 0 and 1 ---------------------------------------------------
    movd    mm0, DWORD PTR [rsi]            ; src row 0
    movd    mm2, DWORD PTR [rsi+rax]        ; src row 1
    punpcklbw mm0, mm2                      ; interleave src rows 0/1
    movd    mm1, DWORD PTR [rdi]            ; ref row 0
    movd    mm3, DWORD PTR [rdi+rdx]        ; ref row 1
    punpcklbw mm1, mm3                      ; interleave ref rows 0/1 alike

    movq    mm2, mm0                        ; keep a copy of src
    psubusb mm0, mm1
    psubusb mm1, mm2
    por     mm0, mm1                        ; |src - ref| per byte

    pxor    mm3, mm3                        ; zero for the unpacks below
    movq    mm2, mm0
    punpcklbw mm0, mm3
    punpckhbw mm2, mm3
    paddw   mm0, mm2                        ; word sums, rows 0/1

    ; ---- rows 2 and 3 ---------------------------------------------------
    lea     rsi, [rsi+rax*2]
    lea     rdi, [rdi+rdx*2]

    movd    mm4, DWORD PTR [rsi]            ; src row 2
    movd    mm6, DWORD PTR [rsi+rax]        ; src row 3
    punpcklbw mm4, mm6                      ; interleave src rows 2/3
    movd    mm5, DWORD PTR [rdi]            ; ref row 2
    movd    mm7, DWORD PTR [rdi+rdx]        ; ref row 3
    punpcklbw mm5, mm7                      ; interleave ref rows 2/3 alike

    movq    mm6, mm4                        ; keep a copy of src
    psubusb mm4, mm5
    psubusb mm5, mm6
    por     mm4, mm5                        ; |src - ref| per byte

    movq    mm5, mm4
    punpcklbw mm4, mm3
    punpckhbw mm5, mm3
    paddw   mm4, mm5                        ; word sums, rows 2/3

    ; ---- combine and reduce the four word lanes -------------------------
    paddw   mm0, mm4
    movq    mm1, mm0
    punpcklwd mm0, mm3                      ; lanes 0,1 -> dwords
    punpckhwd mm1, mm3                      ; lanes 2,3 -> dwords
    paddw   mm0, mm1                        ; two pairwise sums
    movq    mm1, mm0
    psrlq   mm0, 32                         ; bring the upper pair sum down
    paddw   mm0, mm1                        ; low dword = full total
    movq    rax, mm0                        ; caller reads the low 32 bits

    ; ---- restore registers and return -----------------------------------
    pop     rdi
    pop     rsi
    mov     rsp, rbp
    UNSHADOW_ARGS
    pop     rbp
    ret
341
342
;-----------------------------------------------------------------------
; unsigned int vp9_sad16x8_mmx(
;     unsigned char *src_ptr,
;     int  src_stride,
;     unsigned char *ref_ptr,
;     int  ref_stride)
;
; Sum of absolute differences between 16 bytes per row of src_ptr and
; ref_ptr, over 8 rows (for a non-zero src_stride).
;
; Register roles:
;   rsi = current source row         rax = src_stride (sign-extended)
;   rdi = current reference row      rdx = ref_stride (sign-extended)
;   rcx = src_ptr + 8 * src_stride   (loop ends when rsi reaches it)
;   mm6 = zero (unpack helper)       mm7 = four 16-bit partial sums
;
; The total is returned in the low 32 bits of rax.  No emms is executed
; in this routine.
;-----------------------------------------------------------------------
sym(vp9_sad16x8_mmx):
    push    rbp
    mov     rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push    rsi
    push    rdi

    ; ---- fetch arguments ------------------------------------------------
    mov     rsi, arg(0)                     ; src_ptr
    mov     rdi, arg(2)                     ; ref_ptr
    movsxd  rax, dword ptr arg(1)           ; src_stride
    movsxd  rdx, dword ptr arg(3)           ; ref_stride

    ; ---- clear helpers, compute end-of-block sentinel -------------------
    pxor    mm6, mm6                        ; constant zero
    pxor    mm7, mm7                        ; running sums
    lea     rcx, [rsi+rax*8]                ; src + 8 * stride

.sad16x8_next_row:
    ; left 8 bytes: |src - ref| = sat(src - ref) | sat(ref - src)
    movq    mm0, [rsi]
    movq    mm1, [rdi]
    movq    mm4, mm0                        ; keep a copy of src
    psubusb mm0, mm1
    psubusb mm1, mm4
    por     mm0, mm1
    movq    mm1, mm0
    punpcklbw mm0, mm6                      ; low 4 diffs  -> words
    punpckhbw mm1, mm6                      ; high 4 diffs -> words

    ; right 8 bytes, same recipe
    movq    mm2, [rsi+8]
    movq    mm3, [rdi+8]
    movq    mm5, mm2
    psubusb mm2, mm3
    psubusb mm3, mm5
    por     mm2, mm3
    movq    mm3, mm2
    punpcklbw mm2, mm6
    punpckhbw mm3, mm6

    ; collapse the four word vectors into one, then accumulate
    paddw   mm0, mm2
    paddw   mm1, mm3
    paddw   mm0, mm1
    paddw   mm7, mm0

    ; step to the next row of both blocks
    add     rsi, rax
    add     rdi, rdx
    cmp     rsi, rcx
    jne     .sad16x8_next_row

    ; ---- horizontal reduction of the four word lanes --------------------
    movq    mm0, mm7
    punpcklwd mm0, mm6                      ; lanes 0,1 -> dwords
    punpckhwd mm7, mm6                      ; lanes 2,3 -> dwords
    paddw   mm0, mm7                        ; two pairwise sums
    movq    mm7, mm0
    psrlq   mm0, 32                         ; bring the upper pair sum down
    paddw   mm7, mm0                        ; low dword = full total
    movq    rax, mm7                        ; caller reads the low 32 bits

    ; ---- restore registers and return -----------------------------------
    pop     rdi
    pop     rsi
    mov     rsp, rbp
    UNSHADOW_ARGS
    pop     rbp
    ret
428