;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
14
;void vp8_copy32xn_sse2(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *dst_ptr,
;    int  dst_stride,
;    int height);
;
; Copies `height` rows of 32 bytes each from src_ptr to dst_ptr, advancing
; the source by src_stride and the destination by dst_stride after each row.
; Rows are handled four at a time while at least four remain; any leftover
; rows (0..3) are copied one at a time. A height <= 0 copies nothing.
;
; Alignment: source rows are loaded with movdqu (no alignment needed);
; destination rows are stored with movdqa, so every destination row address
; (dst_ptr + k * dst_stride) must be 16-byte aligned.
;
; Register roles inside the body:
;   rsi = current source row        rax = src_stride (sign-extended)
;   rdi = current destination row   rdx = dst_stride (sign-extended)
;   rcx = rows still to copy
;   xmm0-xmm7 = row data in flight
; rsi and rdi are pushed/popped here. Argument access and xmm save/restore
; go through the SHADOW_ARGS_TO_STACK / SAVE_XMM / RESTORE_XMM /
; UNSHADOW_ARGS macros from x86_abi_support.asm.
global sym(vp8_copy32xn_sse2) PRIVATE
sym(vp8_copy32xn_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;dst_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;dst_stride
        movsxd          rcx,        dword ptr arg(4) ;height

        ; The unrolled loop always moves 4 rows, so it must only be entered
        ; when at least 4 rows remain; otherwise it would overrun both
        ; buffers. Signed compare: height is a signed int.
        cmp             rcx,        4
        jl              .block_copy_sse2_tail

.block_copy_sse2_loopx4:
        ; invariant: rcx >= 4 here
        ; load rows 0 and 1
        movdqu          xmm0,       XMMWORD PTR [rsi]
        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
        movdqu          xmm2,       XMMWORD PTR [rsi + rax]
        movdqu          xmm3,       XMMWORD PTR [rsi + rax + 16]

        lea             rsi,        [rsi+rax*2]

        ; load rows 2 and 3
        movdqu          xmm4,       XMMWORD PTR [rsi]
        movdqu          xmm5,       XMMWORD PTR [rsi + 16]
        movdqu          xmm6,       XMMWORD PTR [rsi + rax]
        movdqu          xmm7,       XMMWORD PTR [rsi + rax + 16]

        lea             rsi,        [rsi+rax*2]

        ; store rows 0 and 1 (aligned stores)
        movdqa          XMMWORD PTR [rdi], xmm0
        movdqa          XMMWORD PTR [rdi + 16], xmm1
        movdqa          XMMWORD PTR [rdi + rdx], xmm2
        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm3

        lea             rdi,        [rdi+rdx*2]

        ; store rows 2 and 3 (aligned stores)
        movdqa          XMMWORD PTR [rdi], xmm4
        movdqa          XMMWORD PTR [rdi + 16], xmm5
        movdqa          XMMWORD PTR [rdi + rdx], xmm6
        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm7

        lea             rdi,        [rdi+rdx*2]

        sub             rcx,        4
        cmp             rcx,        4
        jge             .block_copy_sse2_loopx4

.block_copy_sse2_tail:
        ; rcx < 4 here; nothing left to do when it is zero or negative
        test            rcx,        rcx
        jle             .copy_is_done

.block_copy_sse2_loop:
        ; invariant: rcx > 0 here; copy one 32-byte row per iteration
        movdqu          xmm0,       XMMWORD PTR [rsi]
        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
        lea             rsi,        [rsi+rax]

        movdqa          XMMWORD PTR [rdi], xmm0
        movdqa          XMMWORD PTR [rdi + 16], xmm1
        lea             rdi,        [rdi+rdx]

        sub             rcx,        1
        jne             .block_copy_sse2_loop

.copy_is_done:
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
94