;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; STACK_FRAME_CREATE_X3
;
; Function prologue shared by the routines in this file.  It binds the
; symbolic operand names below to the registers / stack slots that hold
; the five C arguments under the ABI being assembled for:
;
;   src_ptr     arg 0   source row pointer
;   src_stride  arg 1   int, sign-extended to pointer width
;   ref_ptr     arg 2   destination row pointer
;   ref_stride  arg 3   int, sign-extended to pointer width
;   height      arg 4   int, 32-bit row counter
;   end_ptr     scratch register
;   ret_var / result_ptr / max_sad
;               extra aliases, not referenced by the code in this file
;
; The two strides are used in 64-bit address arithmetic, so on the 64-bit
; ABIs they are sign-extended here: the upper half of a register carrying
; an `int` argument is not guaranteed to be an extension of the lower half.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     ref_ptr       rdi
  %define     ref_stride    rdx
  %define     end_ptr       rcx
  %define     ret_var       rbx
  %define     result_ptr    arg(4)
  %define     max_sad       arg(4)
  %define     height        dword ptr arg(4)
    push        rbp
    mov         rbp,        rsp
    push        rsi
    push        rdi
    push        rbx

    mov         rsi,        arg(0)              ; src_ptr
    mov         rdi,        arg(2)              ; ref_ptr

    movsxd      rax,        dword ptr arg(1)    ; src_stride
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
%else
  %if LIBVPX_YASM_WIN64
    SAVE_XMM 7, u
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     ref_ptr     r8
    %define     ref_stride  r9
    %define     end_ptr     r10
    %define     ret_var     r11
    ; Fifth argument lives on the stack: skip xmm_stack_space (set up by
    ; SAVE_XMM), the 8-byte return address and four 8-byte argument slots.
    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
    %define     max_sad     [rsp+xmm_stack_space+8+4*8]
    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]

    movsxd      rdx,        edx                 ; src_stride: int -> 64 bit
    movsxd      r9,         r9d                 ; ref_stride: int -> 64 bit
  %else
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     ref_ptr     rdx
    %define     ref_stride  rcx
    %define     end_ptr     r9
    %define     ret_var     r10
    %define     result_ptr  r8
    %define     max_sad     r8
    ; 32-bit view of r8 so the counter never depends on the upper half
    ; of the argument register (matches the dword counter used elsewhere).
    %define     height      r8d

    movsxd      rsi,        esi                 ; src_stride: int -> 64 bit
    movsxd      rcx,        ecx                 ; ref_stride: int -> 64 bit
  %endif
%endif

%endmacro

;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY_X3
;
; Function epilogue matching STACK_FRAME_CREATE_X3: blanks the symbolic
; operand names, restores whatever the prologue saved for the current ABI
; and returns to the caller.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X3 0
  ; Redefine the aliases as empty so they do not leak past the function.
  %define     src_ptr
  %define     src_stride
  %define     ref_ptr
  %define     ref_stride
  %define     end_ptr
  %define     ret_var
  %define     result_ptr
  %define     max_sad
  %define     height

%if ABI_IS_32BIT
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%else
  %if LIBVPX_YASM_WIN64
    RESTORE_XMM
  %endif
%endif
    ret
%endmacro


;-----------------------------------------------------------------------
;void vp8_copy32xn_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *dst_ptr,
;    int  dst_stride,
;    int  height);
;
; Copies `height` rows of 32 bytes each from src_ptr to dst_ptr, stepping
; the pointers by src_stride / dst_stride bytes per row.  Inside the body
; the destination is addressed through the ref_ptr / ref_stride aliases.
;
; Rows are copied four at a time while at least four remain, then one at
; a time.  A height of zero or less copies nothing.
;
; Alignment: source rows are loaded with movdqu (no alignment needed);
;            destination rows are stored with movdqa, so every
;            destination row address must be 16-byte aligned.
; Clobbers:  xmm0-xmm7, the end_ptr scratch register, flags.  The pointer
;            and counter operands are modified in place (on 32-bit builds
;            and Win64 the counter is the caller-provided stack slot of
;            the fifth argument).
;-----------------------------------------------------------------------
global sym(vp8_copy32xn_sse3) PRIVATE
sym(vp8_copy32xn_sse3):

    STACK_FRAME_CREATE_X3

        ; Only enter the unrolled loop when a full group of four rows
        ; exists; otherwise it would read and write rows past the block.
        cmp             height,     4
        jl              .block_copy_sse3_tail

.block_copy_sse3_loopx4:
        lea             end_ptr,    [src_ptr+src_stride*2]      ; &src row 2

        ; Load four source rows (2 x 16 bytes each).
        movdqu          xmm0,       XMMWORD PTR [src_ptr]
        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
        movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride]
        movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]
        movdqu          xmm4,       XMMWORD PTR [end_ptr]
        movdqu          xmm5,       XMMWORD PTR [end_ptr + 16]
        movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride]
        movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]

        lea             src_ptr,    [src_ptr+src_stride*4]      ; src += 4 rows

        lea             end_ptr,    [ref_ptr+ref_stride*2]      ; &dst row 2

        ; Store the four rows to the destination.
        movdqa          XMMWORD PTR [ref_ptr], xmm0
        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
        movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2
        movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
        movdqa          XMMWORD PTR [end_ptr], xmm4
        movdqa          XMMWORD PTR [end_ptr + 16], xmm5
        movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6
        movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7

        lea             ref_ptr,    [ref_ptr+ref_stride*4]      ; dst += 4 rows

        sub             height,     4
        cmp             height,     4
        jge             .block_copy_sse3_loopx4

.block_copy_sse3_tail:
        ; Check whether any rows remain to be copied (fewer than four here).
        ; Signed test so that a zero or negative count skips the tail loop.
        cmp             height,     0
        jle             .copy_is_done

.block_copy_sse3_loop:
        ; Invariant: height > 0 rows remain.
        movdqu          xmm0,       XMMWORD PTR [src_ptr]
        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
        lea             src_ptr,    [src_ptr+src_stride]

        movdqa          XMMWORD PTR [ref_ptr], xmm0
        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
        lea             ref_ptr,    [ref_ptr+ref_stride]

        sub             height,     1
        jne             .block_copy_sse3_loop

.copy_is_done:
    STACK_FRAME_DESTROY_X3