1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
14SECTION .text
15
;-----------------------------------------------------------------------
; void vp8_copy_mem16x16_sse2(
;     unsigned char *src,
;     int            src_stride,
;     unsigned char *dst,
;     int            dst_stride
;     )
;
; Copy 16 rows of 16 bytes each from src to dst.
;
; Source rows are read with movdqu, so src needs no particular
; alignment.  Destination rows are written with movdqa, so every
; destination row address (dst + n*dst_stride) has to be 16-byte
; aligned.
;
; Register roles:
;   rsi = current source row        rax = src_stride (via movsxd)
;   rdi = current destination row   rcx = dst_stride (via movsxd)
;   xmm0-xmm2 = the three rows currently in flight
;
; rsi and rdi are saved in the prolog and restored in the epilog.
; Arguments are fetched through the arg() / SHADOW_ARGS_TO_STACK
; helpers (expected to come from the x86_abi_support.asm include).
; Clobbers: rax, rcx, xmm0-xmm2, flags.
;-----------------------------------------------------------------------
global sym(vp8_copy_mem16x16_sse2) PRIVATE
sym(vp8_copy_mem16x16_sse2):
    push        rbp
    mov         rbp,            rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    ; Pick up all four arguments before touching any pixel data.
    mov         rsi,            arg(0)              ; src
    mov         rdi,            arg(2)              ; dst
    movsxd      rax,            dword ptr arg(1)    ; src_stride
    movsxd      rcx,            dword ptr arg(3)    ; dst_stride

    ; Rows 0..11: four batches of three rows.  Each batch reads its
    ; three source rows first and only then writes them out, after
    ; which both row pointers move down by three strides.
%rep 4
    movdqu      xmm0,           [rsi]
    movdqu      xmm1,           [rsi+rax]
    movdqu      xmm2,           [rsi+rax*2]
    lea         rsi,            [rsi+rax*2]
    add         rsi,            rax                 ; rsi += 3 * src_stride

    movdqa      [rdi],          xmm0
    movdqa      [rdi+rcx],      xmm1
    movdqa      [rdi+rcx*2],    xmm2
    lea         rdi,            [rdi+rcx*2]
    add         rdi,            rcx                 ; rdi += 3 * dst_stride
%endrep

    ; Rows 12..14: same read-then-write pattern, but the pointers are
    ; only moved on by two strides because a single row remains.
    movdqu      xmm0,           [rsi]
    movdqu      xmm1,           [rsi+rax]
    movdqu      xmm2,           [rsi+rax*2]
    lea         rsi,            [rsi+rax*2]         ; rsi -> row 14

    movdqa      [rdi],          xmm0
    movdqa      [rdi+rcx],      xmm1
    movdqa      [rdi+rcx*2],    xmm2
    lea         rdi,            [rdi+rcx*2]         ; rdi -> row 14

    ; Row 15 sits one stride past the current pointers.
    movdqu      xmm0,           [rsi+rax]
    movdqa      [rdi+rcx],      xmm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
119