1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
14SECTION .text
15
;void vp8_copy32xn_sse2(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *dst_ptr,
;    int  dst_stride,
;    int height);
;
; Copies `height` rows of 32 bytes each from src_ptr to dst_ptr. After each
; row the source advances by src_stride bytes and the destination by
; dst_stride bytes.
;
; Alignment: source rows are read with movdqu, so they may be unaligned.
; Destination rows are written with movdqa, so every destination row address
; (dst_ptr + n * dst_stride) must be 16-byte aligned.
;
; Rows are copied four at a time while at least four remain, then one at a
; time. A height of zero or less copies nothing.
;
; Register roles after the prolog:
;   rsi = current source row          rax = src_stride (sign-extended)
;   rdi = current destination row     rdx = dst_stride (sign-extended)
;   rcx = rows still to copy          xmm0-xmm7 = row data in flight
;
; rsi and rdi are pushed/popped around the body. SAVE_XMM / RESTORE_XMM and
; SHADOW_ARGS_TO_STACK / UNSHADOW_ARGS come from x86_abi_support.asm
; (presumably the per-ABI register/argument handling -- see that file).
global sym(vp8_copy32xn_sse2) PRIVATE
sym(vp8_copy32xn_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;dst_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;dst_stride
        movsxd          rcx,        dword ptr arg(4) ;height

        ; Fewer than four rows requested: the unrolled loop below always
        ; moves four rows per pass, so it must not be entered at all.
        cmp             rcx,     4
        jl              .block_copy_sse2_tail

        ; Invariant on entry to each pass: rcx >= 4.
.block_copy_sse2_loopx4:
        ; Load four 32-byte source rows (two 16-byte halves each).
        movdqu          xmm0,       XMMWORD PTR [rsi]
        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
        movdqu          xmm2,       XMMWORD PTR [rsi + rax]
        movdqu          xmm3,       XMMWORD PTR [rsi + rax + 16]

        lea             rsi,        [rsi+rax*2]

        movdqu          xmm4,       XMMWORD PTR [rsi]
        movdqu          xmm5,       XMMWORD PTR [rsi + 16]
        movdqu          xmm6,       XMMWORD PTR [rsi + rax]
        movdqu          xmm7,       XMMWORD PTR [rsi + rax + 16]

        lea             rsi,    [rsi+rax*2]

        ; Store the four rows; movdqa requires 16-byte aligned addresses.
        movdqa          XMMWORD PTR [rdi], xmm0
        movdqa          XMMWORD PTR [rdi + 16], xmm1
        movdqa          XMMWORD PTR [rdi + rdx], xmm2
        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm3

        lea             rdi,    [rdi+rdx*2]

        movdqa          XMMWORD PTR [rdi], xmm4
        movdqa          XMMWORD PTR [rdi + 16], xmm5
        movdqa          XMMWORD PTR [rdi + rdx], xmm6
        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm7

        lea             rdi,    [rdi+rdx*2]

        sub             rcx,     4
        cmp             rcx,     4
        jge             .block_copy_sse2_loopx4

.block_copy_sse2_tail:
        ; rcx is now the leftover row count: 0..3 after the unrolled loop,
        ; or the original height (possibly <= 0) when that loop was skipped.
        test            rcx,     rcx
        jle             .copy_is_done

        ; Invariant on entry to each pass: rcx > 0.
.block_copy_sse2_loop:
        movdqu          xmm0,       XMMWORD PTR [rsi]
        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
        lea             rsi,    [rsi+rax]

        movdqa          XMMWORD PTR [rdi], xmm0
        movdqa          XMMWORD PTR [rdi + 16], xmm1
        lea             rdi,    [rdi+rdx]

        sub             rcx,     1
        jne             .block_copy_sse2_loop

.copy_is_done:
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
95