1;
2;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%include "vpx_ports/x86_abi_support.asm"
12
;-----------------------------------------------------------------------
; GET_PARAM_4
; Fetches the six arguments and builds the SIMD constants used by the
; 4-pixel-wide kernels.
;
; On exit:
;   rsi  = arg(0)  source pointer
;   rax  = arg(1)  source stride   (32-bit value, sign-extended)
;   rdi  = arg(2)  destination pointer
;   rdx  = arg(3)  destination stride (32-bit value, sign-extended)
;   rcx  = arg(4)  row count       (32-bit value, sign-extended)
;   xmm2 = 64 in each of the eight 16-bit lanes (rounding term)
;   xmm3 = low four words each hold the byte pair taken from filter
;          words 3 and 4
; rdx and rcx are used as scratch before receiving their final values.
; Nothing is saved or restored here.
;-----------------------------------------------------------------------
%macro GET_PARAM_4 0
    mov         rsi, arg(0)                 ; source
    mov         rdi, arg(2)                 ; destination
    mov         rdx, arg(5)                 ; filter table (movdqa: 16-byte aligned)

    movdqa      xmm3, [rdx]                 ; 16 bytes of 16-bit filter taps
    psrldq      xmm3, 6                     ; shift out words 0..2 -> word 3 is now word 0
    packsswb    xmm3, xmm3                  ; narrow taps to signed bytes
    pshuflw     xmm3, xmm3, 0b              ; copy word 0 (tap3, tap4) to low 4 words

    mov         rcx, 0x0400040              ; two 16-bit lanes of 64
    movq        xmm2, rcx
    pshufd      xmm2, xmm2, 0               ; replicate -> 64 in every word

    movsxd      rax, DWORD PTR arg(1)       ; source stride
    movsxd      rdx, DWORD PTR arg(3)       ; destination stride
    movsxd      rcx, DWORD PTR arg(4)       ; rows to produce
%endm
31
;-----------------------------------------------------------------------
; APPLY_FILTER_4 avg
; Filters one 4-pixel row and steps all pointers to the next row.
;
; In:   xmm0 = first-tap source bytes, xmm1 = second-tap source bytes
;       xmm3 = packed tap pairs, xmm2 = rounding words
;       rsi / rdi = source / destination, rax / rdx = their strides
; avg:  0 -> store the filtered row
;       1 -> store pavgb(filtered row, 4 bytes already at [rdi])
; Out:  rsi += rax, rdi += rdx, rcx -= 1.  ZF is left by the final
;       dec, so the invoking code can branch on it directly.
; Clobbers xmm0, and xmm1 when avg is non-zero.
;-----------------------------------------------------------------------
%macro APPLY_FILTER_4 1
    punpcklbw   xmm0, xmm1                  ; interleave into (a, b) byte pairs
    pmaddubsw   xmm0, xmm3                  ; per word: a * tap_lo + b * tap_hi
    lea         rsi, [rsi + rax]            ; source row consumed; advance (flags untouched)

    paddsw      xmm0, xmm2                  ; add rounding term
    psraw       xmm0, 7                     ; arithmetic >> 7
    packuswb    xmm0, xmm0                  ; saturate words to unsigned bytes

%if %1
    movd        xmm1, [rdi]                 ; current destination pixels
    pavgb       xmm0, xmm1                  ; rounded byte average
%endif
    movd        [rdi], xmm0                 ; write 4 output bytes

    lea         rdi, [rdi + rdx]            ; next destination row
    dec         rcx                         ; must stay last: sets ZF for the caller
%endm
49
;-----------------------------------------------------------------------
; GET_PARAM
; Fetches the six arguments and builds the SIMD constants used by the
; 8- and 16-pixel-wide kernels.
;
; On exit:
;   rsi  = arg(0)  source pointer
;   rax  = arg(1)  source stride   (32-bit value, sign-extended)
;   rdi  = arg(2)  destination pointer
;   rdx  = arg(3)  destination stride (32-bit value, sign-extended)
;   rcx  = arg(4)  row count       (32-bit value, sign-extended)
;   xmm6 = 64 in each of the eight 16-bit lanes (rounding term)
;   xmm7 = all eight words hold the byte pair taken from filter
;          words 3 and 4
; rdx and rcx are used as scratch before receiving their final values.
; xmm6 / xmm7 are overwritten without being saved by this macro.
;-----------------------------------------------------------------------
%macro GET_PARAM 0
    mov         rsi, arg(0)                 ; source
    mov         rdi, arg(2)                 ; destination
    mov         rdx, arg(5)                 ; filter table (movdqa: 16-byte aligned)

    movdqa      xmm7, [rdx]                 ; 16 bytes of 16-bit filter taps
    psrldq      xmm7, 6                     ; shift out words 0..2 -> word 3 is now word 0
    packsswb    xmm7, xmm7                  ; narrow taps to signed bytes
    pshuflw     xmm7, xmm7, 0b              ; copy word 0 (tap3, tap4) to low 4 words
    punpcklwd   xmm7, xmm7                  ; widen that pattern to all 8 words

    mov         rcx, 0x0400040              ; two 16-bit lanes of 64
    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0               ; replicate -> 64 in every word

    movsxd      rax, DWORD PTR arg(1)       ; source stride
    movsxd      rdx, DWORD PTR arg(3)       ; destination stride
    movsxd      rcx, DWORD PTR arg(4)       ; rows to produce
%endm
69
;-----------------------------------------------------------------------
; APPLY_FILTER_8 avg
; Filters one 8-pixel row and steps all pointers to the next row.
;
; In:   xmm0 = first-tap source bytes, xmm1 = second-tap source bytes
;       xmm7 = packed tap pairs, xmm6 = rounding words
;       rsi / rdi = source / destination, rax / rdx = their strides
; avg:  0 -> store the filtered row
;       1 -> store pavgb(filtered row, 8 bytes already at [rdi])
; Out:  rsi += rax, rdi += rdx, rcx -= 1.  ZF is left by the final
;       dec, so the invoking code can branch on it directly.
; Clobbers xmm0, and xmm1 when avg is non-zero.
;-----------------------------------------------------------------------
%macro APPLY_FILTER_8 1
    punpcklbw   xmm0, xmm1                  ; interleave into (a, b) byte pairs
    pmaddubsw   xmm0, xmm7                  ; per word: a * tap_lo + b * tap_hi
    lea         rsi, [rsi + rax]            ; source row consumed; advance (flags untouched)

    paddsw      xmm0, xmm6                  ; add rounding term
    psraw       xmm0, 7                     ; arithmetic >> 7
    packuswb    xmm0, xmm0                  ; saturate words to unsigned bytes

%if %1
    movq        xmm1, [rdi]                 ; current destination pixels
    pavgb       xmm0, xmm1                  ; rounded byte average
%endif
    movq        [rdi], xmm0                 ; write 8 output bytes

    lea         rdi, [rdi + rdx]            ; next destination row
    dec         rcx                         ; must stay last: sets ZF for the caller
%endm
88
;-----------------------------------------------------------------------
; APPLY_FILTER_16 avg
; Filters one 16-pixel row and steps all pointers to the next row.
;
; In:   xmm0 = first-tap source bytes, xmm2 = a copy of xmm0
;       xmm1 = second-tap source bytes
;       xmm7 = packed tap pairs, xmm6 = rounding words
;       rsi / rdi = source / destination, rax / rdx = their strides
; avg:  0 -> store the filtered row
;       1 -> store pavgb(filtered row, 16 bytes already at [rdi])
; Out:  rsi += rax, rdi += rdx, rcx -= 1.  ZF is left by the final
;       dec, so the invoking code can branch on it directly.
; Clobbers xmm0 and xmm2, and xmm1 when avg is non-zero.
; Destination accesses use movdqu, so [rdi] need not be aligned.
;-----------------------------------------------------------------------
%macro APPLY_FILTER_16 1
    ; pixels 0..7
    punpcklbw   xmm0, xmm1                  ; (a, b) byte pairs, low half
    pmaddubsw   xmm0, xmm7                  ; per word: a * tap_lo + b * tap_hi
    paddsw      xmm0, xmm6                  ; add rounding term
    psraw       xmm0, 7                     ; arithmetic >> 7

    ; pixels 8..15
    punpckhbw   xmm2, xmm1                  ; (a, b) byte pairs, high half
    pmaddubsw   xmm2, xmm7
    paddsw      xmm2, xmm6
    psraw       xmm2, 7

    lea         rsi, [rsi + rax]            ; source row consumed; advance (flags untouched)
    packuswb    xmm0, xmm2                  ; saturate both halves into 16 bytes

%if %1
    movdqu      xmm1, [rdi]                 ; current destination pixels
    pavgb       xmm0, xmm1                  ; rounded byte average
%endif
    movdqu      [rdi], xmm0                 ; write 16 output bytes

    lea         rdi, [rdi + rdx]            ; next destination row
    dec         rcx                         ; must stay last: sets ZF for the caller
%endm
111
;-----------------------------------------------------------------------
; vp9_filter_block1d4_v2_ssse3
; Two-tap vertical filter, 4 pixels per row.
;
; Arguments (roles as loaded by GET_PARAM_4):
;   arg(0) source pointer        arg(3) destination stride
;   arg(1) source stride         arg(4) row count
;   arg(2) destination pointer   arg(5) filter pointer
; Every output row combines source row y with source row y + 1.
; The counter is decremented before it is tested, so a row count of 0
; is not handled.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d4_v2_ssse3) PRIVATE
sym(vp9_filter_block1d4_v2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; frame is set up

    GET_PARAM_4
.next_row:
    movd        xmm1, [rsi + rax]           ; 4 pixels from the row below
    movd        xmm0, [rsi]                 ; 4 pixels from the current row

    APPLY_FILTER_4 0                        ; filter, store, advance, dec rcx
    jnz         .next_row

    ; tear down in reverse order
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
135
;-----------------------------------------------------------------------
; vp9_filter_block1d8_v2_ssse3
; Two-tap vertical filter, 8 pixels per row.
;
; Arguments (roles as loaded by GET_PARAM):
;   arg(0) source pointer        arg(3) destination stride
;   arg(1) source stride         arg(4) row count
;   arg(2) destination pointer   arg(5) filter pointer
; Every output row combines source row y with source row y + 1.
; The counter is decremented before it is tested, so a row count of 0
; is not handled.
; xmm6 / xmm7 are used; SAVE_XMM 7 / RESTORE_XMM (from the included ABI
; header) presumably preserve them where required - confirm there.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d8_v2_ssse3) PRIVATE
sym(vp9_filter_block1d8_v2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; frame is set up

    GET_PARAM
.next_row:
    movq        xmm1, [rsi + rax]           ; 8 pixels from the row below
    movq        xmm0, [rsi]                 ; 8 pixels from the current row

    APPLY_FILTER_8 0                        ; filter, store, advance, dec rcx
    jnz         .next_row

    ; tear down in reverse order
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
161
;-----------------------------------------------------------------------
; vp9_filter_block1d16_v2_ssse3
; Two-tap vertical filter, 16 pixels per row.
;
; Arguments (roles as loaded by GET_PARAM):
;   arg(0) source pointer        arg(3) destination stride
;   arg(1) source stride         arg(4) row count
;   arg(2) destination pointer   arg(5) filter pointer
; Every output row combines source row y with source row y + 1.
; Source rows are read with unaligned loads.
; The counter is decremented before it is tested, so a row count of 0
; is not handled.
; xmm6 / xmm7 are used; SAVE_XMM 7 / RESTORE_XMM (from the included ABI
; header) presumably preserve them where required - confirm there.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d16_v2_ssse3) PRIVATE
sym(vp9_filter_block1d16_v2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; frame is set up

    GET_PARAM
.next_row:
    movdqu      xmm1, [rsi + rax]           ; 16 pixels from the row below
    movdqu      xmm0, [rsi]                 ; 16 pixels from the current row
    movdqa      xmm2, xmm0                  ; second copy feeds the high-half unpack

    APPLY_FILTER_16 0                       ; filter, store, advance, dec rcx
    jnz         .next_row

    ; tear down in reverse order
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
188
;-----------------------------------------------------------------------
; vp9_filter_block1d4_v2_avg_ssse3
; Two-tap vertical filter, 4 pixels per row; the filtered row is
; averaged (pavgb) with the bytes already in the destination.
;
; Arguments (roles as loaded by GET_PARAM_4):
;   arg(0) source pointer        arg(3) destination stride
;   arg(1) source stride         arg(4) row count
;   arg(2) destination pointer   arg(5) filter pointer
; Every output row combines source row y with source row y + 1.
; The counter is decremented before it is tested, so a row count of 0
; is not handled.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d4_v2_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_v2_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; frame is set up

    GET_PARAM_4
.next_row:
    movd        xmm1, [rsi + rax]           ; 4 pixels from the row below
    movd        xmm0, [rsi]                 ; 4 pixels from the current row

    APPLY_FILTER_4 1                        ; filter, average, store, advance, dec rcx
    jnz         .next_row

    ; tear down in reverse order
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
212
;-----------------------------------------------------------------------
; vp9_filter_block1d8_v2_avg_ssse3
; Two-tap vertical filter, 8 pixels per row; the filtered row is
; averaged (pavgb) with the bytes already in the destination.
;
; Arguments (roles as loaded by GET_PARAM):
;   arg(0) source pointer        arg(3) destination stride
;   arg(1) source stride         arg(4) row count
;   arg(2) destination pointer   arg(5) filter pointer
; Every output row combines source row y with source row y + 1.
; The counter is decremented before it is tested, so a row count of 0
; is not handled.
; xmm6 / xmm7 are used; SAVE_XMM 7 / RESTORE_XMM (from the included ABI
; header) presumably preserve them where required - confirm there.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d8_v2_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_v2_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; frame is set up

    GET_PARAM
.next_row:
    movq        xmm1, [rsi + rax]           ; 8 pixels from the row below
    movq        xmm0, [rsi]                 ; 8 pixels from the current row

    APPLY_FILTER_8 1                        ; filter, average, store, advance, dec rcx
    jnz         .next_row

    ; tear down in reverse order
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
238
;-----------------------------------------------------------------------
; vp9_filter_block1d16_v2_avg_ssse3
; Two-tap vertical filter, 16 pixels per row; the filtered row is
; averaged (pavgb) with the bytes already in the destination.
;
; Arguments (roles as loaded by GET_PARAM):
;   arg(0) source pointer        arg(3) destination stride
;   arg(1) source stride         arg(4) row count
;   arg(2) destination pointer   arg(5) filter pointer
; Every output row combines source row y with source row y + 1.
; Source rows are read with unaligned loads.
; The counter is decremented before it is tested, so a row count of 0
; is not handled.
; xmm6 / xmm7 are used; SAVE_XMM 7 / RESTORE_XMM (from the included ABI
; header) presumably preserve them where required - confirm there.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d16_v2_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_v2_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; frame is set up

    GET_PARAM
.next_row:
    movdqu      xmm1, [rsi + rax]           ; 16 pixels from the row below
    movdqu      xmm0, [rsi]                 ; 16 pixels from the current row
    movdqa      xmm2, xmm0                  ; second copy feeds the high-half unpack

    APPLY_FILTER_16 1                       ; filter, average, store, advance, dec rcx
    jnz         .next_row

    ; tear down in reverse order
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
265
;-----------------------------------------------------------------------
; vp9_filter_block1d4_h2_ssse3
; Two-tap horizontal filter, 4 pixels per row.
;
; Arguments (roles as loaded by GET_PARAM_4):
;   arg(0) source pointer        arg(3) destination stride
;   arg(1) source stride         arg(4) row count
;   arg(2) destination pointer   arg(5) filter pointer
; Every output pixel x combines source pixel x with source pixel x + 1.
; Each row reads source bytes 0..4 only (two overlapping 4-byte loads),
; matching the two-load form used by the 16-wide horizontal kernel.
; The counter is decremented before it is tested, so a row count of 0
; is not handled.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d4_h2_ssse3) PRIVATE
sym(vp9_filter_block1d4_h2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movd        xmm0, [rsi]                 ; pixels x   .. x+3
    movd        xmm1, [rsi + 1]             ; pixels x+1 .. x+4

    APPLY_FILTER_4 0                        ; filter, store, advance, dec rcx
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
290
;-----------------------------------------------------------------------
; vp9_filter_block1d8_h2_ssse3
; Two-tap horizontal filter, 8 pixels per row.
;
; Arguments (roles as loaded by GET_PARAM):
;   arg(0) source pointer        arg(3) destination stride
;   arg(1) source stride         arg(4) row count
;   arg(2) destination pointer   arg(5) filter pointer
; Every output pixel x combines source pixel x with source pixel x + 1.
; Each row reads source bytes 0..8 only (two overlapping 8-byte loads),
; matching the two-load form used by the 16-wide horizontal kernel.
; The counter is decremented before it is tested, so a row count of 0
; is not handled.
; xmm6 / xmm7 are used; SAVE_XMM 7 / RESTORE_XMM (from the included ABI
; header) presumably preserve them where required - confirm there.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d8_h2_ssse3) PRIVATE
sym(vp9_filter_block1d8_h2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movq        xmm0, [rsi]                 ; pixels x   .. x+7
    movq        xmm1, [rsi + 1]             ; pixels x+1 .. x+8

    APPLY_FILTER_8 0                        ; filter, store, advance, dec rcx
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
317
;-----------------------------------------------------------------------
; vp9_filter_block1d16_h2_ssse3
; Two-tap horizontal filter, 16 pixels per row.
;
; Arguments (roles as loaded by GET_PARAM):
;   arg(0) source pointer        arg(3) destination stride
;   arg(1) source stride         arg(4) row count
;   arg(2) destination pointer   arg(5) filter pointer
; Every output pixel x combines source pixel x with source pixel x + 1;
; a row therefore reads source bytes 0..16 via two unaligned loads.
; The counter is decremented before it is tested, so a row count of 0
; is not handled.
; xmm6 / xmm7 are used; SAVE_XMM 7 / RESTORE_XMM (from the included ABI
; header) presumably preserve them where required - confirm there.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d16_h2_ssse3) PRIVATE
sym(vp9_filter_block1d16_h2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; frame is set up

    GET_PARAM
.next_row:
    movdqu      xmm1, [rsi + 1]             ; pixels x+1 .. x+16
    movdqu      xmm0, [rsi]                 ; pixels x   .. x+15
    movdqa      xmm2, xmm0                  ; second copy feeds the high-half unpack

    APPLY_FILTER_16 0                       ; filter, store, advance, dec rcx
    jnz         .next_row

    ; tear down in reverse order
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
344
;-----------------------------------------------------------------------
; vp9_filter_block1d4_h2_avg_ssse3
; Two-tap horizontal filter, 4 pixels per row; the filtered row is
; averaged (pavgb) with the bytes already in the destination.
;
; Arguments (roles as loaded by GET_PARAM_4):
;   arg(0) source pointer        arg(3) destination stride
;   arg(1) source stride         arg(4) row count
;   arg(2) destination pointer   arg(5) filter pointer
; Every output pixel x combines source pixel x with source pixel x + 1.
; Each row reads source bytes 0..4 only (two overlapping 4-byte loads),
; matching the two-load form used by the 16-wide horizontal kernel.
; The counter is decremented before it is tested, so a row count of 0
; is not handled.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d4_h2_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_h2_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movd        xmm0, [rsi]                 ; pixels x   .. x+3
    movd        xmm1, [rsi + 1]             ; pixels x+1 .. x+4

    APPLY_FILTER_4 1                        ; filter, average, store, advance, dec rcx
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
369
;-----------------------------------------------------------------------
; vp9_filter_block1d8_h2_avg_ssse3
; Two-tap horizontal filter, 8 pixels per row; the filtered row is
; averaged (pavgb) with the bytes already in the destination.
;
; Arguments (roles as loaded by GET_PARAM):
;   arg(0) source pointer        arg(3) destination stride
;   arg(1) source stride         arg(4) row count
;   arg(2) destination pointer   arg(5) filter pointer
; Every output pixel x combines source pixel x with source pixel x + 1.
; Each row reads source bytes 0..8 only (two overlapping 8-byte loads),
; matching the two-load form used by the 16-wide horizontal kernel.
; The counter is decremented before it is tested, so a row count of 0
; is not handled.
; xmm6 / xmm7 are used; SAVE_XMM 7 / RESTORE_XMM (from the included ABI
; header) presumably preserve them where required - confirm there.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d8_h2_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_h2_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movq        xmm0, [rsi]                 ; pixels x   .. x+7
    movq        xmm1, [rsi + 1]             ; pixels x+1 .. x+8

    APPLY_FILTER_8 1                        ; filter, average, store, advance, dec rcx
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
396
;-----------------------------------------------------------------------
; vp9_filter_block1d16_h2_avg_ssse3
; Two-tap horizontal filter, 16 pixels per row; the filtered row is
; averaged (pavgb) with the bytes already in the destination.
;
; Arguments (roles as loaded by GET_PARAM):
;   arg(0) source pointer        arg(3) destination stride
;   arg(1) source stride         arg(4) row count
;   arg(2) destination pointer   arg(5) filter pointer
; Every output pixel x combines source pixel x with source pixel x + 1;
; a row therefore reads source bytes 0..16 via two unaligned loads.
; The counter is decremented before it is tested, so a row count of 0
; is not handled.
; xmm6 / xmm7 are used; SAVE_XMM 7 / RESTORE_XMM (from the included ABI
; header) presumably preserve them where required - confirm there.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d16_h2_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_h2_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; frame is set up

    GET_PARAM
.next_row:
    movdqu      xmm1, [rsi + 1]             ; pixels x+1 .. x+16
    movdqu      xmm0, [rsi]                 ; pixels x   .. x+15
    movdqa      xmm2, xmm0                  ; second copy feeds the high-half unpack

    APPLY_FILTER_16 1                       ; filter, average, store, advance, dec rcx
    jnz         .next_row

    ; tear down in reverse order
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
423