;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

;------------------------------------------------------------------------------
; convolve_fn 1 -- emits convolve_<copy|avg>:
;
;   void convolve_%1(const uint8_t *src, ptrdiff_t src_stride,
;                    uint8_t *dst, ptrdiff_t dst_stride,
;                    const void *fx, int fxs, const void *fy, int fys,
;                    int w, int h);
;
; "copy": dst = src, a plain block copy.
; "avg":  dst = pavgb(src, dst), i.e. the rounded byte average of the source
;         block and the existing destination contents.
; The filter arguments fx/fxs/fy/fys are declared only so the signature matches
; the common convolve prototype; they are never read here.
;
; Dispatches on w to one of five unrolled paths (4/8/16/32 and a fall-through
; that copies 64 bytes per row -- presumably w == 64; confirm with callers).
; Source rows are loaded unaligned (movu); destination rows are read/written
; with mova on the XMM paths, so dst and dst_stride are assumed 16-byte
; aligned there. The w4/w8/w16 paths consume 4 rows per iteration and the w32
; path 2 rows, so h must be a multiple of 4 (2 for w32) on those paths.
;------------------------------------------------------------------------------
%macro convolve_fn 1
INIT_XMM sse2
; 4 args loaded into registers, up to 7 GPRs, 4 XMM registers.
cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
                              fx, fxs, fy, fys, w, h
  mov r4d, dword wm                     ; r4d = w: dispatch on block width
  cmp r4d, 4
  je .w4
  cmp r4d, 8
  je .w8
  cmp r4d, 16
  je .w16
  cmp r4d, 32
  je .w32

  ; Fall-through path: 64 bytes per row, one row per iteration.
  mov r4d, dword hm                     ; r4d = rows remaining
.loop64:
  movu                  m0, [srcq]
  movu                  m1, [srcq+16]
  movu                  m2, [srcq+32]
  movu                  m3, [srcq+48]
  add                 srcq, src_strideq
%ifidn %1, avg
  ; Average the loaded source row with the current destination row.
  pavgb                 m0, [dstq]
  pavgb                 m1, [dstq+16]
  pavgb                 m2, [dstq+32]
  pavgb                 m3, [dstq+48]
%endif
  mova      [dstq     ], m0
  mova      [dstq+16  ], m1
  mova      [dstq+32  ], m2
  mova      [dstq+48  ], m3
  add                 dstq, dst_strideq
  dec                  r4d
  jnz .loop64
  RET

.w32:
  ; 32 bytes per row, two rows per iteration.
  mov r4d, dword hm
.loop32:
  movu                  m0, [srcq]
  movu                  m1, [srcq+16]
  movu                  m2, [srcq+src_strideq]
  movu                  m3, [srcq+src_strideq+16]
  lea                 srcq, [srcq+src_strideq*2]
%ifidn %1, avg
  pavgb                 m0, [dstq]
  pavgb                 m1, [dstq            +16]
  pavgb                 m2, [dstq+dst_strideq]
  pavgb                 m3, [dstq+dst_strideq+16]
%endif
  mova [dstq               ], m0
  mova [dstq            +16], m1
  mova [dstq+dst_strideq   ], m2
  mova [dstq+dst_strideq+16], m3
  lea                 dstq, [dstq+dst_strideq*2]
  sub                  r4d, 2
  jnz .loop32
  RET

.w16:
  ; 16 bytes per row, four rows per iteration; r5q/r6q hold 3*stride so the
  ; fourth row is addressable without advancing the base pointers.
  mov r4d, dword hm
  lea r5q, [src_strideq*3]
  lea r6q, [dst_strideq*3]
.loop16:
  movu                  m0, [srcq]
  movu                  m1, [srcq+src_strideq]
  movu                  m2, [srcq+src_strideq*2]
  movu                  m3, [srcq+r5q]
  lea                 srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavgb                 m0, [dstq]
  pavgb                 m1, [dstq+dst_strideq]
  pavgb                 m2, [dstq+dst_strideq*2]
  pavgb                 m3, [dstq+r6q]
%endif
  mova [dstq              ], m0
  mova [dstq+dst_strideq  ], m1
  mova [dstq+dst_strideq*2], m2
  mova [dstq+r6q          ], m3
  lea                 dstq, [dstq+dst_strideq*4]
  sub                  r4d, 4
  jnz .loop16
  RET

; Narrow widths use 64-bit MMX registers; pavgb on MMX registers requires SSE.
INIT_MMX sse
.w8:
  ; 8 bytes per row, four rows per iteration.
  mov r4d, dword hm
  lea r5q, [src_strideq*3]              ; 3*src_stride (fourth-row offset)
  lea r6q, [dst_strideq*3]              ; 3*dst_stride
.loop8:
  movu                  m0, [srcq]
  movu                  m1, [srcq+src_strideq]
  movu                  m2, [srcq+src_strideq*2]
  movu                  m3, [srcq+r5q]
  lea                 srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavgb                 m0, [dstq]
  pavgb                 m1, [dstq+dst_strideq]
  pavgb                 m2, [dstq+dst_strideq*2]
  pavgb                 m3, [dstq+r6q]
%endif
  mova [dstq              ], m0
  mova [dstq+dst_strideq  ], m1
  mova [dstq+dst_strideq*2], m2
  mova [dstq+r6q          ], m3
  lea                 dstq, [dstq+dst_strideq*4]
  sub                  r4d, 4
  jnz .loop8
  RET

.w4:
  ; 4 bytes per row, four rows per iteration, via partial-width movh
  ; loads/stores.
  mov r4d, dword hm
  lea r5q, [src_strideq*3]
  lea r6q, [dst_strideq*3]
.loop4:
  movh                  m0, [srcq]
  movh                  m1, [srcq+src_strideq]
  movh                  m2, [srcq+src_strideq*2]
  movh                  m3, [srcq+r5q]
  lea                 srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  ; Unlike the wider paths, the dst rows are loaded into registers first:
  ; only the partial-width movh load touches the 4 valid dst bytes, so pavgb
  ; cannot take dst as a memory operand here.
  movh                  m4, [dstq]
  movh                  m5, [dstq+dst_strideq]
  movh                  m6, [dstq+dst_strideq*2]
  movh                  m7, [dstq+r6q]
  pavgb                 m0, m4
  pavgb                 m1, m5
  pavgb                 m2, m6
  pavgb                 m3, m7
%endif
  movh [dstq              ], m0
  movh [dstq+dst_strideq  ], m1
  movh [dstq+dst_strideq*2], m2
  movh [dstq+r6q          ], m3
  lea                 dstq, [dstq+dst_strideq*4]
  sub                  r4d, 4
  jnz .loop4
  RET
%endmacro

; Instantiate both variants: convolve_copy and convolve_avg.
convolve_fn copy
convolve_fn avg