;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10
11%include "vpx_ports/x86_abi_support.asm"
12
;-----------------------------------------------------------------------
; STACK_FRAME_CREATE_X3
;
; Function prologue for a five-argument routine
;   (ptr, int stride, ptr, int stride, <arg 4>).
; Binds the symbolic operand names used by the routine body to the
; registers / stack slots of the active ABI:
;
;   src_ptr,    ref_ptr      pointer arguments 0 and 2
;   src_stride, ref_stride   int arguments 1 and 3, sign-extended to
;                            pointer width so they are safe to use in
;                            address arithmetic
;   height                   argument 4 as a 32-bit operand
;   result_ptr, max_sad      argument 4 at full width (same location
;                            as height)
;   end_ptr,    ret_var      scratch registers
;
; Must be paired with STACK_FRAME_DESTROY_X3, which undoes the pushes /
; XMM spill performed here.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
  ; 32-bit: all arguments live on the stack and are reached through the
  ; arg(n) helper from x86_abi_support.asm (rbp-relative, hence the
  ; frame setup below).
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     ref_ptr       rdi
  %define     ref_stride    rdx
  %define     end_ptr       rcx
  %define     ret_var       rbx
  %define     result_ptr    arg(4)
  %define     max_sad       arg(4)
  %define     height        dword ptr arg(4)
    push        rbp
    mov         rbp,        rsp
    push        rsi
    push        rdi
    push        rbx

    mov         rsi,        arg(0)              ; src_ptr
    mov         rdi,        arg(2)              ; ref_ptr

    movsxd      rax,        dword ptr arg(1)    ; src_stride
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
%else
  %if LIBVPX_YASM_WIN64
    ; Win64: xmm6/xmm7 are callee-saved; SAVE_XMM (from the include)
    ; is expected to spill them and set xmm_stack_space.
    SAVE_XMM 7, u
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     ref_ptr     r8
    %define     ref_stride  r9
    %define     end_ptr     r10
    %define     ret_var     r11
    ; Fifth argument: above the XMM spill area, the return address (8)
    ; and the four register-argument home slots (4*8).
    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
    %define     max_sad     [rsp+xmm_stack_space+8+4*8]
    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]

    ; The strides are C ints: only the low 32 bits of their registers
    ; are defined by the ABI. Widen them before they are used as
    ; 64-bit index registers.
    movsxd      rdx,        edx                 ; src_stride
    movsxd      r9,         r9d                 ; ref_stride
  %else
    ; System V AMD64: first five arguments arrive in rdi, rsi, rdx,
    ; rcx, r8.
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     ref_ptr     rdx
    %define     ref_stride  rcx
    %define     end_ptr     r9
    %define     ret_var     r10
    %define     result_ptr  r8
    %define     max_sad     r8
    ; height is a C int: use the 32-bit view so undefined upper bits of
    ; r8 can never influence the row count.
    %define     height      r8d

    ; Same reasoning for the int strides, which are used as 64-bit
    ; index registers.
    movsxd      rsi,        esi                 ; src_stride
    movsxd      rcx,        ecx                 ; ref_stride
  %endif
%endif

%endmacro
61
;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY_X3
;
; Epilogue matching STACK_FRAME_CREATE_X3: blanks the operand aliases,
; undoes the ABI-specific prologue work and returns to the caller.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X3 0
  ; Rebind every alias to an empty expansion so the register / stack
  ; mappings chosen by the prologue do not carry past this point.
  %define     height
  %define     max_sad
  %define     result_ptr
  %define     ret_var
  %define     end_ptr
  %define     ref_stride
  %define     ref_ptr
  %define     src_stride
  %define     src_ptr

%if ABI_IS_32BIT
    ; Pops mirror the prologue's pushes in reverse order.
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%elif LIBVPX_YASM_WIN64
    ; Counterpart of the prologue's SAVE_XMM.
    RESTORE_XMM
%endif
    ret
%endmacro
85
SECTION .text

;-----------------------------------------------------------------------
; void vp8_copy32xn_sse3(
;     unsigned char *src_ptr,
;     int  src_stride,
;     unsigned char *dst_ptr,
;     int  dst_stride,
;     int height);
;
; Copies `height` rows of 32 bytes each from src_ptr to dst_ptr,
; stepping src_stride / dst_stride bytes between rows. Inside the body
; the destination is addressed through the ref_ptr / ref_stride aliases
; bound by STACK_FRAME_CREATE_X3.
;
; Source rows are read with unaligned loads (movdqu). Destination rows
; are written with aligned stores (movdqa), so every destination row
; must start on a 16-byte boundary.
;
; height <= 0 copies nothing.
;
; Uses xmm0-xmm7 and end_ptr as scratch; `height` is consumed in place.
;-----------------------------------------------------------------------
global sym(vp8_copy32xn_sse3) PRIVATE
sym(vp8_copy32xn_sse3):

    STACK_FRAME_CREATE_X3

        ; The unrolled loop always moves four rows per pass; with fewer
        ; than four rows requested it must not run at all, otherwise it
        ; reads and writes past the end of both buffers.
        cmp             height,     4
        jl              .block_copy_sse3_tail

.block_copy_sse3_loopx4:
        ; end_ptr = address of source row 2
        lea             end_ptr,    [src_ptr+src_stride*2]

        ; Load four source rows (2 x 16 bytes each).
        movdqu          xmm0,       XMMWORD PTR [src_ptr]
        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
        movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride]
        movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]
        movdqu          xmm4,       XMMWORD PTR [end_ptr]
        movdqu          xmm5,       XMMWORD PTR [end_ptr + 16]
        movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride]
        movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]

        lea             src_ptr,    [src_ptr+src_stride*4]

        ; end_ptr = address of destination row 2
        lea             end_ptr,    [ref_ptr+ref_stride*2]

        ; Store the four rows to the destination.
        movdqa          XMMWORD PTR [ref_ptr], xmm0
        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
        movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2
        movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
        movdqa          XMMWORD PTR [end_ptr], xmm4
        movdqa          XMMWORD PTR [end_ptr + 16], xmm5
        movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6
        movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7

        lea             ref_ptr,    [ref_ptr+ref_stride*4]

        ; Keep going while at least four rows remain.
        sub             height,     4
        cmp             height,     4
        jge             .block_copy_sse3_loopx4

.block_copy_sse3_tail:
        ; Check whether any rows remain to be copied. A signed test is
        ; used so that a zero or negative row count exits immediately
        ; instead of counting down through wraparound.
        cmp             height,     0
        jle             .copy_is_done

.block_copy_sse3_loop:
        ; Copy the remaining rows one at a time.
        movdqu          xmm0,       XMMWORD PTR [src_ptr]
        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
        lea             src_ptr,    [src_ptr+src_stride]

        movdqa          XMMWORD PTR [ref_ptr], xmm0
        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
        lea             ref_ptr,    [ref_ptr+ref_stride]

        sub             height,     1
        jne             .block_copy_sse3_loop

.copy_is_done:
    STACK_FRAME_DESTROY_X3
148