1;
2;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%include "vpx_ports/x86_abi_support.asm"
12
;-----------------------------------------------------------------------
; GET_PARAM_4
; Fetches the six arguments and builds the constants consumed by
; APPLY_FILTER_4.
;
; Register state on exit:
;   rsi  = arg(0) src_ptr
;   rax  = arg(1) pixels_per_line
;   rdi  = arg(2) output_ptr
;   rdx  = arg(3) out_pitch
;   rcx  = arg(4) output_height
;   xmm2 = all zero (used to widen bytes to words)
;   xmm3 = 64 in each of the eight words (rounding term for the >>7)
;   xmm4 = filter word 3 in words 0-3, filter word 4 in words 4-7
;
; rdx and rcx are used as temporaries first (filter pointer, rounding
; constant) and only receive their final values at the end.
;-----------------------------------------------------------------------
%macro GET_PARAM_4 0
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rdx, arg(5)                 ;filter ptr (temporary use of rdx)

    movdqa      xmm3, [rdx]                 ;eight filter words; movdqa needs 16-byte alignment
    pshuflw     xmm4, xmm3, 11111111b       ;low qword = k3 k3 k3 k3
    psrldq      xmm3, 8                     ;bring words 4..7 down to the low qword
    pshuflw     xmm3, xmm3, 0b              ;low qword = k4 k4 k4 k4
    punpcklqdq  xmm4, xmm3                  ;xmm4 = k3 x4 | k4 x4

    mov         rcx, 0x00400040             ;two 16-bit words of 64 (temporary use of rcx)
    movq        xmm3, rcx
    pshufd      xmm3, xmm3, 0               ;replicate the dword: every word = 64

    pxor        xmm2, xmm2                  ;zero register for byte->word unpack

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm
34
;-----------------------------------------------------------------------
; APPLY_FILTER_4 avg
; Filters one row of 4 pixels and advances to the next row.
;
; In:   xmm0 = first tap's 4 source bytes in the low dword
;       xmm1 = second tap's 4 source bytes in the low dword
;       xmm2 = 0, xmm3 = rounding, xmm4 = k3 x4 | k4 x4
;       rsi/rdi = source/destination row, rax/rdx = their strides
;       rcx = rows remaining
; Out:  4 bytes stored at [rdi]; rsi, rdi advanced by one row;
;       rcx decremented last, so ZF reflects "no rows left".
; %1:   non-zero -> average the result with the bytes already at [rdi].
; Clobbers xmm0, xmm1.
;-----------------------------------------------------------------------
%macro APPLY_FILTER_4 1

    punpckldq   xmm0, xmm1                  ;dword0 = tap-0 pixels, dword1 = tap-1 pixels
    punpcklbw   xmm0, xmm2                  ;widen the 8 bytes to 8 words
    pmullw      xmm0, xmm4                  ;words 0-3 *= k3, words 4-7 *= k4

    movdqa      xmm1, xmm0
    psrldq      xmm1, 8                     ;tap-1 products down to words 0-3
    paddsw      xmm0, xmm1                  ;words 0-3 = sum of both taps

    paddsw      xmm0, xmm3                  ;add rounding term
    psraw       xmm0, 7                     ;scale back down
    packuswb    xmm0, xmm0                  ;saturate words to unsigned bytes

%if %1
    movd        xmm1, [rdi]                 ;current destination pixels
    pavgb       xmm0, xmm1                  ;rounded byte average with destination
%endif

    lea         rsi, [rsi + rax]            ;next source row (lea keeps flags untouched)
    movd        [rdi], xmm0                 ;write 4 result bytes
    lea         rdi, [rdi + rdx]            ;next destination row
    dec         rcx                         ;row counter; caller branches on ZF
%endm
59
;-----------------------------------------------------------------------
; GET_PARAM
; Fetches the six arguments and builds the constants consumed by
; APPLY_FILTER_8 and APPLY_FILTER_16.
;
; Register state on exit:
;   rsi  = arg(0) src_ptr
;   rax  = arg(1) pixels_per_line
;   rdi  = arg(2) output_ptr
;   rdx  = arg(3) out_pitch
;   rcx  = arg(4) output_height
;   xmm4 = 64 in each of the eight words (rounding term for the >>7)
;   xmm5 = all zero (used to widen bytes to words)
;   xmm6 = filter word 3 replicated into all eight words
;   xmm7 = filter word 4 replicated into all eight words
;
; rdx and rcx are used as temporaries first (filter pointer, rounding
; constant) and only receive their final values at the end.
;-----------------------------------------------------------------------
%macro GET_PARAM 0
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rdx, arg(5)                 ;filter ptr (temporary use of rdx)

    movdqa      xmm7, [rdx]                 ;eight filter words; movdqa needs 16-byte alignment

    pshuflw     xmm6, xmm7, 11111111b       ;low qword of xmm6 = k3 x4
    punpcklwd   xmm6, xmm6                  ;xmm6 = k3 x8
    pshufhw     xmm7, xmm7, 0b              ;high qword of xmm7 = k4 x4
    punpckhwd   xmm7, xmm7                  ;xmm7 = k4 x8

    mov         rcx, 0x00400040             ;two 16-bit words of 64 (temporary use of rcx)
    movq        xmm4, rcx
    pshufd      xmm4, xmm4, 0               ;replicate the dword: every word = 64

    pxor        xmm5, xmm5                  ;zero register for byte->word unpack

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm
82
;-----------------------------------------------------------------------
; APPLY_FILTER_8 avg
; Filters one row of 8 pixels and advances to the next row.
;
; In:   xmm0 = first tap's 8 source bytes in the low qword
;       xmm1 = second tap's 8 source bytes in the low qword
;       xmm4 = rounding, xmm5 = 0, xmm6 = k3 x8, xmm7 = k4 x8
;       rsi/rdi = source/destination row, rax/rdx = their strides
;       rcx = rows remaining
; Out:  8 bytes stored at [rdi]; rsi, rdi advanced by one row;
;       rcx decremented last, so ZF reflects "no rows left".
; %1:   non-zero -> average the result with the bytes already at [rdi].
; Clobbers xmm0, xmm1.
;-----------------------------------------------------------------------
%macro APPLY_FILTER_8 1
    punpcklbw   xmm0, xmm5                  ;tap-0 pixels as words
    pmullw      xmm0, xmm6                  ;* k3

    punpcklbw   xmm1, xmm5                  ;tap-1 pixels as words
    pmullw      xmm1, xmm7                  ;* k4

    paddsw      xmm0, xmm1                  ;sum of both taps
    paddsw      xmm0, xmm4                  ;add rounding term
    psraw       xmm0, 7                     ;scale back down
    packuswb    xmm0, xmm0                  ;saturate words to unsigned bytes
%if %1
    movq        xmm1, [rdi]                 ;current destination pixels
    pavgb       xmm0, xmm1                  ;rounded byte average with destination
%endif
    lea         rsi, [rsi + rax]            ;next source row (lea keeps flags untouched)
    movq        [rdi], xmm0                 ;write 8 result bytes

    lea         rdi, [rdi + rdx]            ;next destination row
    dec         rcx                         ;row counter; caller branches on ZF
%endm
103
;-----------------------------------------------------------------------
; APPLY_FILTER_16 avg
; Filters one row of 16 pixels and advances to the next row.
;
; In:   xmm0 = xmm2 = first tap's 16 source bytes
;       xmm1 = xmm3 = second tap's 16 source bytes
;       xmm4 = rounding, xmm5 = 0, xmm6 = k3 x8, xmm7 = k4 x8
;       rsi/rdi = source/destination row, rax/rdx = their strides
;       rcx = rows remaining
; Out:  16 bytes stored at [rdi] (unaligned store); rsi, rdi advanced by
;       one row; rcx decremented last, so ZF reflects "no rows left".
; %1:   non-zero -> average the result with the bytes already at [rdi].
; Clobbers xmm0-xmm3.
;-----------------------------------------------------------------------
%macro APPLY_FILTER_16 1
    ; pixels 0..7
    punpcklbw   xmm0, xmm5                  ;tap-0 low half as words
    punpcklbw   xmm1, xmm5                  ;tap-1 low half as words
    pmullw      xmm0, xmm6                  ;* k3
    pmullw      xmm1, xmm7                  ;* k4
    paddsw      xmm0, xmm1                  ;sum of both taps
    paddsw      xmm0, xmm4                  ;add rounding term
    psraw       xmm0, 7                     ;scale back down

    ; pixels 8..15
    punpckhbw   xmm2, xmm5                  ;tap-0 high half as words
    punpckhbw   xmm3, xmm5                  ;tap-1 high half as words
    pmullw      xmm2, xmm6                  ;* k3
    pmullw      xmm3, xmm7                  ;* k4
    paddsw      xmm2, xmm3                  ;sum of both taps
    paddsw      xmm2, xmm4                  ;add rounding term
    psraw       xmm2, 7                     ;scale back down

    packuswb    xmm0, xmm2                  ;saturate both halves to 16 unsigned bytes
%if %1
    movdqu      xmm1, [rdi]                 ;current destination pixels
    pavgb       xmm0, xmm1                  ;rounded byte average with destination
%endif
    lea         rsi, [rsi + rax]            ;next source row (lea keeps flags untouched)
    movdqu      [rdi], xmm0                 ;write 16 result bytes

    lea         rdi, [rdi + rdx]            ;next destination row
    dec         rcx                         ;row counter; caller branches on ZF
%endm
133
134SECTION .text
135
;-----------------------------------------------------------------------
; vpx_filter_block1d4_v2_sse2
; 2-tap vertical filter, 4 pixels per row.
;
; Arguments (as read by GET_PARAM_4):
;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr
;
; Per row: out[x] = sat8((src[x]*k3 + src[x + pixels_per_line]*k4 + 64) >> 7)
; where k3/k4 are filter words 3 and 4.
; The loop body runs before the row counter is tested, so output_height
; must be at least 1.
;-----------------------------------------------------------------------
global sym(vpx_filter_block1d4_v2_sse2) PRIVATE
sym(vpx_filter_block1d4_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.next_row:
    movd        xmm1, [rsi + rax]           ;4 pixels from the row below
    movd        xmm0, [rsi]                 ;4 pixels from the current row

    APPLY_FILTER_4 0
    jnz         .next_row                   ;ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
159
;-----------------------------------------------------------------------
; vpx_filter_block1d8_v2_sse2
; 2-tap vertical filter, 8 pixels per row.
;
; Arguments (as read by GET_PARAM):
;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr
;
; Per row: out[x] = sat8((src[x]*k3 + src[x + pixels_per_line]*k4 + 64) >> 7)
; where k3/k4 are filter words 3 and 4.
; Uses xmm6/xmm7, hence the SAVE_XMM 7 / RESTORE_XMM pair (macros from the
; ABI support include; presumably they preserve those registers where the
; ABI requires it).
; The loop body runs before the row counter is tested, so output_height
; must be at least 1.
;-----------------------------------------------------------------------
global sym(vpx_filter_block1d8_v2_sse2) PRIVATE
sym(vpx_filter_block1d8_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movq        xmm1, [rsi + rax]           ;8 pixels from the row below
    movq        xmm0, [rsi]                 ;8 pixels from the current row

    APPLY_FILTER_8 0
    jnz         .next_row                   ;ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
185
;-----------------------------------------------------------------------
; vpx_filter_block1d16_v2_sse2
; 2-tap vertical filter, 16 pixels per row.
;
; Arguments (as read by GET_PARAM):
;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr
;
; Per row: out[x] = sat8((src[x]*k3 + src[x + pixels_per_line]*k4 + 64) >> 7)
; where k3/k4 are filter words 3 and 4.
; Source and destination rows are accessed with unaligned 16-byte moves.
; The loop body runs before the row counter is tested, so output_height
; must be at least 1.
;-----------------------------------------------------------------------
global sym(vpx_filter_block1d16_v2_sse2) PRIVATE
sym(vpx_filter_block1d16_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movdqu      xmm0, [rsi]                 ;16 pixels from the current row
    movdqa      xmm2, xmm0                  ;copy for the high-half unpack
    movdqu      xmm1, [rsi + rax]           ;16 pixels from the row below
    movdqa      xmm3, xmm1                  ;copy for the high-half unpack

    APPLY_FILTER_16 0
    jnz         .next_row                   ;ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
213
;-----------------------------------------------------------------------
; vpx_filter_block1d4_v2_avg_sse2
; 2-tap vertical filter, 4 pixels per row, averaged into the destination.
;
; Arguments (as read by GET_PARAM_4):
;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr
;
; Per row: f = sat8((src[x]*k3 + src[x + pixels_per_line]*k4 + 64) >> 7),
; then out[x] = pavgb(f, out[x]), i.e. the rounded average with what was
; already stored at the destination.
; The loop body runs before the row counter is tested, so output_height
; must be at least 1.
;-----------------------------------------------------------------------
global sym(vpx_filter_block1d4_v2_avg_sse2) PRIVATE
sym(vpx_filter_block1d4_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.next_row:
    movd        xmm1, [rsi + rax]           ;4 pixels from the row below
    movd        xmm0, [rsi]                 ;4 pixels from the current row

    APPLY_FILTER_4 1
    jnz         .next_row                   ;ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
237
;-----------------------------------------------------------------------
; vpx_filter_block1d8_v2_avg_sse2
; 2-tap vertical filter, 8 pixels per row, averaged into the destination.
;
; Arguments (as read by GET_PARAM):
;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr
;
; Per row: f = sat8((src[x]*k3 + src[x + pixels_per_line]*k4 + 64) >> 7),
; then out[x] = pavgb(f, out[x]), i.e. the rounded average with what was
; already stored at the destination.
; The loop body runs before the row counter is tested, so output_height
; must be at least 1.
;-----------------------------------------------------------------------
global sym(vpx_filter_block1d8_v2_avg_sse2) PRIVATE
sym(vpx_filter_block1d8_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movq        xmm1, [rsi + rax]           ;8 pixels from the row below
    movq        xmm0, [rsi]                 ;8 pixels from the current row

    APPLY_FILTER_8 1
    jnz         .next_row                   ;ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
263
;-----------------------------------------------------------------------
; vpx_filter_block1d16_v2_avg_sse2
; 2-tap vertical filter, 16 pixels per row, averaged into the destination.
;
; Arguments (as read by GET_PARAM):
;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr
;
; Per row: f = sat8((src[x]*k3 + src[x + pixels_per_line]*k4 + 64) >> 7),
; then out[x] = pavgb(f, out[x]), i.e. the rounded average with what was
; already stored at the destination.
; The loop body runs before the row counter is tested, so output_height
; must be at least 1.
;-----------------------------------------------------------------------
global sym(vpx_filter_block1d16_v2_avg_sse2) PRIVATE
sym(vpx_filter_block1d16_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movdqu      xmm0, [rsi]                 ;16 pixels from the current row
    movdqa      xmm2, xmm0                  ;copy for the high-half unpack
    movdqu      xmm1, [rsi + rax]           ;16 pixels from the row below
    movdqa      xmm3, xmm1                  ;copy for the high-half unpack

    APPLY_FILTER_16 1
    jnz         .next_row                   ;ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
291
;-----------------------------------------------------------------------
; vpx_filter_block1d4_h2_sse2
; 2-tap horizontal filter, 4 pixels per row.
;
; Arguments (as read by GET_PARAM_4):
;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr
;
; Per row: out[x] = sat8((src[x]*k3 + src[x + 1]*k4 + 64) >> 7)
; where k3/k4 are filter words 3 and 4.
;
; Only source bytes 0..4 of each row are read: two overlapping 4-byte
; loads replace a single 16-byte load, so the routine no longer touches
; 11 bytes past the pixels it actually uses.
; The loop body runs before the row counter is tested, so output_height
; must be at least 1.
;-----------------------------------------------------------------------
global sym(vpx_filter_block1d4_h2_sse2) PRIVATE
sym(vpx_filter_block1d4_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.next_row:
    movd        xmm0, [rsi]                 ;src[0..3]
    movd        xmm1, [rsi + 1]             ;src[1..4], the right-hand neighbours

    APPLY_FILTER_4 0
    jnz         .next_row                   ;ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
316
;-----------------------------------------------------------------------
; vpx_filter_block1d8_h2_sse2
; 2-tap horizontal filter, 8 pixels per row.
;
; Arguments (as read by GET_PARAM):
;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr
;
; Per row: out[x] = sat8((src[x]*k3 + src[x + 1]*k4 + 64) >> 7)
; where k3/k4 are filter words 3 and 4.
;
; Only source bytes 0..8 of each row are read: two overlapping 8-byte
; loads replace a single 16-byte load, so the routine no longer touches
; 7 bytes past the pixels it actually uses.
; The loop body runs before the row counter is tested, so output_height
; must be at least 1.
;-----------------------------------------------------------------------
global sym(vpx_filter_block1d8_h2_sse2) PRIVATE
sym(vpx_filter_block1d8_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movq        xmm0, [rsi]                 ;src[0..7]
    movq        xmm1, [rsi + 1]             ;src[1..8], the right-hand neighbours

    APPLY_FILTER_8 0
    jnz         .next_row                   ;ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
343
;-----------------------------------------------------------------------
; vpx_filter_block1d16_h2_sse2
; 2-tap horizontal filter, 16 pixels per row.
;
; Arguments (as read by GET_PARAM):
;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr
;
; Per row: out[x] = sat8((src[x]*k3 + src[x + 1]*k4 + 64) >> 7)
; where k3/k4 are filter words 3 and 4.
; Reads source bytes 0..16 of each row via two unaligned 16-byte loads.
; The loop body runs before the row counter is tested, so output_height
; must be at least 1.
;-----------------------------------------------------------------------
global sym(vpx_filter_block1d16_h2_sse2) PRIVATE
sym(vpx_filter_block1d16_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movdqu      xmm0, [rsi]                 ;src[0..15]
    movdqa      xmm2, xmm0                  ;copy for the high-half unpack
    movdqu      xmm1, [rsi + 1]             ;src[1..16], the right-hand neighbours
    movdqa      xmm3, xmm1                  ;copy for the high-half unpack

    APPLY_FILTER_16 0
    jnz         .next_row                   ;ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
371
;-----------------------------------------------------------------------
; vpx_filter_block1d4_h2_avg_sse2
; 2-tap horizontal filter, 4 pixels per row, averaged into the destination.
;
; Arguments (as read by GET_PARAM_4):
;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr
;
; Per row: f = sat8((src[x]*k3 + src[x + 1]*k4 + 64) >> 7),
; then out[x] = pavgb(f, out[x]), i.e. the rounded average with what was
; already stored at the destination.
;
; Only source bytes 0..4 of each row are read: two overlapping 4-byte
; loads replace a single 16-byte load, so the routine no longer touches
; 11 bytes past the pixels it actually uses.
; The loop body runs before the row counter is tested, so output_height
; must be at least 1.
;-----------------------------------------------------------------------
global sym(vpx_filter_block1d4_h2_avg_sse2) PRIVATE
sym(vpx_filter_block1d4_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.next_row:
    movd        xmm0, [rsi]                 ;src[0..3]
    movd        xmm1, [rsi + 1]             ;src[1..4], the right-hand neighbours

    APPLY_FILTER_4 1
    jnz         .next_row                   ;ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
396
;-----------------------------------------------------------------------
; vpx_filter_block1d8_h2_avg_sse2
; 2-tap horizontal filter, 8 pixels per row, averaged into the destination.
;
; Arguments (as read by GET_PARAM):
;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr
;
; Per row: f = sat8((src[x]*k3 + src[x + 1]*k4 + 64) >> 7),
; then out[x] = pavgb(f, out[x]), i.e. the rounded average with what was
; already stored at the destination.
;
; Only source bytes 0..8 of each row are read: two overlapping 8-byte
; loads replace a single 16-byte load, so the routine no longer touches
; 7 bytes past the pixels it actually uses.
; The loop body runs before the row counter is tested, so output_height
; must be at least 1.
;-----------------------------------------------------------------------
global sym(vpx_filter_block1d8_h2_avg_sse2) PRIVATE
sym(vpx_filter_block1d8_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movq        xmm0, [rsi]                 ;src[0..7]
    movq        xmm1, [rsi + 1]             ;src[1..8], the right-hand neighbours

    APPLY_FILTER_8 1
    jnz         .next_row                   ;ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
423
;-----------------------------------------------------------------------
; vpx_filter_block1d16_h2_avg_sse2
; 2-tap horizontal filter, 16 pixels per row, averaged into the
; destination.
;
; Arguments (as read by GET_PARAM):
;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr
;
; Per row: f = sat8((src[x]*k3 + src[x + 1]*k4 + 64) >> 7),
; then out[x] = pavgb(f, out[x]), i.e. the rounded average with what was
; already stored at the destination.
; Reads source bytes 0..16 of each row via two unaligned 16-byte loads.
; The loop body runs before the row counter is tested, so output_height
; must be at least 1.
;-----------------------------------------------------------------------
global sym(vpx_filter_block1d16_h2_avg_sse2) PRIVATE
sym(vpx_filter_block1d16_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movdqu      xmm0, [rsi]                 ;src[0..15]
    movdqa      xmm2, xmm0                  ;copy for the high-half unpack
    movdqu      xmm1, [rsi + 1]             ;src[1..16], the right-hand neighbours
    movdqa      xmm3, xmm1                  ;copy for the high-half unpack

    APPLY_FILTER_16 1
    jnz         .next_row                   ;ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
451