;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

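; Emit an unfiltered convolve kernel for the given mode (%1 is "copy" or
; "avg").  The block width is dispatched at run time from the w argument;
; each supported width has its own unrolled row loop below.  The filter
; arguments (fx, fxs, fy, fys) are only declared to match the convolve
; prototype and are never referenced.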
%macro convolve_fn 1
INIT_XMM sse2
cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
                              fx, fxs, fy, fys, w, h
  mov r4d, dword wm
  cmp r4d, 4
  je .w4
  cmp r4d, 8
  je .w8
  cmp r4d, 16
  je .w16
  cmp r4d, 32
  je .w32

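; Fall-through case: 64-wide blocks, one row per iteration (four 16-byte
; loads/stores).  For the avg variant the source row is averaged with the
; existing destination pixels via pavgb before the store.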
  mov                    r4d, dword hm
.loop64:
  movu                    m0, [srcq]
  movu                    m1, [srcq+16]
  movu                    m2, [srcq+32]
  movu                    m3, [srcq+48]
  add                   srcq, src_strideq
%ifidn %1, avg
  pavgb                   m0, [dstq]
  pavgb                   m1, [dstq+16]
  pavgb                   m2, [dstq+32]
  pavgb                   m3, [dstq+48]
%endif
  mova             [dstq   ], m0
  mova             [dstq+16], m1
  mova             [dstq+32], m2
  mova             [dstq+48], m3
  add                   dstq, dst_strideq
  dec                    r4d
  jnz .loop64
  RET

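; 32-wide blocks: two rows per iteration, two 16-byte vectors per row.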
.w32:
  mov                    r4d, dword hm
.loop32:
  movu                    m0, [srcq]
  movu                    m1, [srcq+16]
  movu                    m2, [srcq+src_strideq]
  movu                    m3, [srcq+src_strideq+16]
  lea                   srcq, [srcq+src_strideq*2]
%ifidn %1, avg
  pavgb                   m0, [dstq]
  pavgb                   m1, [dstq            +16]
  pavgb                   m2, [dstq+dst_strideq]
  pavgb                   m3, [dstq+dst_strideq+16]
%endif
  mova [dstq               ], m0
  mova [dstq            +16], m1
  mova [dstq+dst_strideq   ], m2
  mova [dstq+dst_strideq+16], m3
  lea                   dstq, [dstq+dst_strideq*2]
  sub                    r4d, 2
  jnz .loop32
  RET

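; 16-wide blocks: four rows per iteration; r5/r6 hold 3*stride so the
; fourth source/destination row can be addressed without an extra
; pointer update.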
.w16:
  mov                    r4d, dword hm
  lea                    r5q, [src_strideq*3]
  lea                    r6q, [dst_strideq*3]
.loop16:
  movu                    m0, [srcq]
  movu                    m1, [srcq+src_strideq]
  movu                    m2, [srcq+src_strideq*2]
  movu                    m3, [srcq+r5q]
  lea                   srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavgb                   m0, [dstq]
  pavgb                   m1, [dstq+dst_strideq]
  pavgb                   m2, [dstq+dst_strideq*2]
  pavgb                   m3, [dstq+r6q]
%endif
  mova  [dstq              ], m0
  mova  [dstq+dst_strideq  ], m1
  mova  [dstq+dst_strideq*2], m2
  mova  [dstq+r6q          ], m3
  lea                   dstq, [dstq+dst_strideq*4]
  sub                    r4d, 4
  jnz .loop16
  RET

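; The 8- and 4-wide paths switch to the 64-bit mm register file; the
; "sse" suffix is still needed because the avg variant relies on pavgb,
; which on mm registers is an SSE instruction.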
INIT_MMX sse
.w8:
  mov                    r4d, dword hm
  lea                    r5q, [src_strideq*3]
  lea                    r6q, [dst_strideq*3]
.loop8:
  movu                    m0, [srcq]
  movu                    m1, [srcq+src_strideq]
  movu                    m2, [srcq+src_strideq*2]
  movu                    m3, [srcq+r5q]
  lea                   srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavgb                   m0, [dstq]
  pavgb                   m1, [dstq+dst_strideq]
  pavgb                   m2, [dstq+dst_strideq*2]
  pavgb                   m3, [dstq+r6q]
%endif
  mova  [dstq              ], m0
  mova  [dstq+dst_strideq  ], m1
  mova  [dstq+dst_strideq*2], m2
  mova  [dstq+r6q          ], m3
  lea                   dstq, [dstq+dst_strideq*4]
  sub                    r4d, 4
  jnz .loop8
  RET

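; 4-wide blocks: four rows per iteration, 4 bytes per row via movh.  The
; avg variant loads the destination rows into m4-m7 first so that only
; 4 bytes of each destination row are read.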
.w4:
  mov                    r4d, dword hm
  lea                    r5q, [src_strideq*3]
  lea                    r6q, [dst_strideq*3]
.loop4:
  movh                    m0, [srcq]
  movh                    m1, [srcq+src_strideq]
  movh                    m2, [srcq+src_strideq*2]
  movh                    m3, [srcq+r5q]
  lea                   srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  movh                    m4, [dstq]
  movh                    m5, [dstq+dst_strideq]
  movh                    m6, [dstq+dst_strideq*2]
  movh                    m7, [dstq+r6q]
  pavgb                   m0, m4
  pavgb                   m1, m5
  pavgb                   m2, m6
  pavgb                   m3, m7
%endif
  movh  [dstq              ], m0
  movh  [dstq+dst_strideq  ], m1
  movh  [dstq+dst_strideq*2], m2
  movh  [dstq+r6q          ], m3
  lea                   dstq, [dstq+dst_strideq*4]
  sub                    r4d, 4
  jnz .loop4
  RET
%endmacro

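; Instantiate both variants: a plain copy and one that averages the
; source into the existing destination.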
convolve_fn copy
convolve_fn avg
157