;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10
11%include "vpx_ports/x86_abi_support.asm"
12
;-----------------------------------------------------------------------
; STACK_FRAME_CREATE_X3
;
; Function prologue for routines taking
;     (ptr, int stride, ptr, int stride, <fifth argument>).
; Binds the symbolic names src_ptr / src_stride / ref_ptr / ref_stride /
; end_ptr / ret_var / result_ptr / max_sad / height to the registers or
; stack slots used by the ABI being built, and emits whatever setup code
; that ABI needs.  Must be paired with STACK_FRAME_DESTROY_X3.
;
; result_ptr, max_sad and height are three names for the same fifth
; argument; a routine uses whichever matches its prototype.
;
; The two strides are C ints.  They are sign-extended to pointer width
; on every ABI so they can be used directly in address arithmetic; the
; upper half of a register carrying an int argument is not guaranteed
; by the 64-bit calling conventions.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
  ; 32-bit: all arguments live on the stack and are reached through arg().
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     ref_ptr       rdi
  %define     ref_stride    rdx
  %define     end_ptr       rcx
  %define     ret_var       rbx
  %define     result_ptr    arg(4)
  %define     max_sad       arg(4)
  %define     height        dword ptr arg(4)
    push        rbp
    mov         rbp,        rsp
    push        rsi
    push        rdi
    push        rbx

    mov         rsi,        arg(0)              ; src_ptr
    mov         rdi,        arg(2)              ; ref_ptr

    movsxd      rax,        dword ptr arg(1)    ; src_stride
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
%else
  %if LIBVPX_YASM_WIN64
    ; Win64: first four arguments in rcx, rdx, r8, r9; the fifth is on
    ; the stack above the return address and the 4*8-byte shadow area.
    ; SAVE_XMM moves rsp down by xmm_stack_space, hence the extra term.
    SAVE_XMM 7, u
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     ref_ptr     r8
    %define     ref_stride  r9
    %define     end_ptr     r10
    %define     ret_var     r11
    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
    %define     max_sad     [rsp+xmm_stack_space+8+4*8]
    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]

    movsxd      rdx,        edx                 ; src_stride: int -> 64 bit
    movsxd      r9,         r9d                 ; ref_stride: int -> 64 bit
  %else
    ; System V AMD64: arguments in rdi, rsi, rdx, rcx, r8.
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     ref_ptr     rdx
    %define     ref_stride  rcx
    %define     end_ptr     r9
    %define     ret_var     r10
    %define     result_ptr  r8
    %define     max_sad     r8
    ; height is an int: use the 32-bit view, matching the dword operand
    ; used for it on the other two ABIs.
    %define     height      r8d

    movsxd      rsi,        esi                 ; src_stride: int -> 64 bit
    movsxd      rcx,        ecx                 ; ref_stride: int -> 64 bit
  %endif
%endif

%endmacro
61
;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY_X3
;
; Function epilogue matching STACK_FRAME_CREATE_X3: retires the symbolic
; operand names, undoes the ABI-specific prologue, and returns.
;
; The names are removed with %undef (rather than redefined to empty
; text) so that any use after this point is an undefined symbol and
; fails to assemble instead of silently expanding to nothing.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X3 0
  %undef      src_ptr
  %undef      src_stride
  %undef      ref_ptr
  %undef      ref_stride
  %undef      end_ptr
  %undef      ret_var
  %undef      result_ptr
  %undef      max_sad
  %undef      height

%if ABI_IS_32BIT
    ; Pop in the reverse order of the pushes in STACK_FRAME_CREATE_X3.
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%else
  %if LIBVPX_YASM_WIN64
    ; Counterpart of the SAVE_XMM issued in the prologue.
    RESTORE_XMM
  %endif
%endif
    ret
%endmacro
85
86
;-----------------------------------------------------------------------
;void vp8_copy32xn_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *dst_ptr,
;    int  dst_stride,
;    int height);
;
; Copies `height` rows of 32 bytes from src_ptr to dst_ptr.
;
; Operand names come from STACK_FRAME_CREATE_X3; the destination is
; addressed through ref_ptr / ref_stride.
;
; Source rows are read with movdqu (any alignment).  Destination rows
; are written with movdqa, so every destination row address must be
; 16-byte aligned.
;
; Rows are copied four at a time while at least four remain, then one
; at a time.  A height of zero or less copies nothing.
;
; Uses xmm0-xmm7 and end_ptr as scratch.
;-----------------------------------------------------------------------
global sym(vp8_copy32xn_sse3) PRIVATE
sym(vp8_copy32xn_sse3):

    STACK_FRAME_CREATE_X3

        ; Only enter the unrolled loop when a full group of four rows
        ; exists; otherwise it would copy rows that were not requested.
        cmp             height,     4
        jl              .block_copy_sse3_tail

.block_copy_sse3_loopx4:
        ; end_ptr = address of source row 2 (rows 2 and 3 are reached
        ; from it, rows 0 and 1 from src_ptr).
        lea             end_ptr,    [src_ptr+src_stride*2]

        movdqu          xmm0,       XMMWORD PTR [src_ptr]
        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
        movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride]
        movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]
        movdqu          xmm4,       XMMWORD PTR [end_ptr]
        movdqu          xmm5,       XMMWORD PTR [end_ptr + 16]
        movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride]
        movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]

        lea             src_ptr,    [src_ptr+src_stride*4]

        ; end_ptr = address of destination row 2.
        lea             end_ptr,    [ref_ptr+ref_stride*2]

        movdqa          XMMWORD PTR [ref_ptr], xmm0
        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
        movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2
        movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
        movdqa          XMMWORD PTR [end_ptr], xmm4
        movdqa          XMMWORD PTR [end_ptr + 16], xmm5
        movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6
        movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7

        lea             ref_ptr,    [ref_ptr+ref_stride*4]

        sub             height,     4
        cmp             height,     4
        jge             .block_copy_sse3_loopx4

.block_copy_sse3_tail:
        ; Check whether any rows are left.  A signed <= 0 test (rather
        ; than == 0) keeps a non-positive count out of the loop below,
        ; which only terminates when the counter reaches exactly zero.
        cmp             height,     0
        jle             .copy_is_done

.block_copy_sse3_loop:
        ; Invariant: height > 0 rows remain to be copied.
        movdqu          xmm0,       XMMWORD PTR [src_ptr]
        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
        lea             src_ptr,    [src_ptr+src_stride]

        movdqa          XMMWORD PTR [ref_ptr], xmm0
        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
        lea             ref_ptr,    [ref_ptr+ref_stride]

        sub             height,     1
        jne             .block_copy_sse3_loop

.copy_is_done:
    STACK_FRAME_DESTROY_X3
147