;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10
11%include "vpx_ports/x86_abi_support.asm"
12
;-----------------------------------------------------------------------
; GET_PARAM_4
; Shared setup for the 4-pixel-wide kernels.
;
; Reads the arguments through arg(n) and leaves:
;   rsi  = src_ptr (arg 0)          rax = pixels_per_line (arg 1)
;   rdi  = output_ptr (arg 2)       rdx = out_pitch (arg 3)
;   rcx  = output_height (arg 4)
;   xmm2 = zero (used to widen bytes to words)
;   xmm3 = rounding constant, 64 in every word
;   xmm4 = filter word 3 (k3) in words 0-3, filter word 4 (k4) in words 4-7
; The filter table (arg 5) is read with movdqa, so it has to be
; 16-byte aligned.
;-----------------------------------------------------------------------
%macro GET_PARAM_4 0
    mov         rsi, arg(0)                 ;rsi = src_ptr
    mov         rdi, arg(2)                 ;rdi = output_ptr
    mov         rdx, arg(5)                 ;rdx = filter ptr (temporary)
    mov         rcx, 0x00400040             ;two words of 64 (temporary)

    pxor        xmm2, xmm2                  ;xmm2 = 0

    movdqa      xmm4, [rdx]                 ;k0..k7
    pshuflw     xmm4, xmm4, 0xff            ;words 0-3 = k3, words 4-7 untouched
    pshufhw     xmm4, xmm4, 0x00            ;words 4-7 = k4 -> k3 x4 | k4 x4

    movq        xmm3, rcx                   ;low dword = 0x00400040
    pshufd      xmm3, xmm3, 0x00            ;broadcast -> 64 in all eight words

    ;rdx and rcx are recycled below, so these loads must stay last
    movsxd      rax, DWORD PTR arg(1)       ;rax = pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;rdx = out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;rcx = output_height
%endm
34
;-----------------------------------------------------------------------
; APPLY_FILTER_4 avg
; Filters one 4-pixel row and steps to the next one.
;
; In:    xmm0 = first-tap pixels in its low 4 bytes
;        xmm1 = second-tap pixels in its low 4 bytes
;        xmm2 = zero, xmm3 = rounding words, xmm4 = k3 x4 | k4 x4
;        rsi/rdi = source/output pointers, rax/rdx = their strides,
;        rcx = rows left
; %1:    non-zero -> average the result with the 4 bytes already at [rdi]
; Out:   4 bytes stored at [rdi]; rsi += rax, rdi += rdx, rcx -= 1.
;        ZF comes from the final dec, so the caller can branch on it
;        directly after the macro.
; Clobb: xmm0, xmm1
;-----------------------------------------------------------------------
%macro APPLY_FILTER_4 1
    punpckldq   xmm0, xmm1                  ;bytes: a0..a3 b0..b3
    punpcklbw   xmm0, xmm2                  ;widen all eight pixels to words
    pmullw      xmm0, xmm4                  ;a * k3 in words 0-3, b * k4 in words 4-7

    movdqa      xmm1, xmm0
    psrldq      xmm1, 8                     ;bring the b * k4 products down
    paddsw      xmm0, xmm1                  ;words 0-3 = a * k3 + b * k4

    paddsw      xmm0, xmm3                  ;add the rounding term
    psraw       xmm0, 7                     ;scale back down
    packuswb    xmm0, xmm0                  ;saturate to unsigned bytes

%if %1
    movd        xmm1, [rdi]                 ;current destination pixels
    pavgb       xmm0, xmm1                  ;rounded average with them
%endif
    movd        [rdi], xmm0                 ;write 4 output pixels

    lea         rdi, [rdi + rdx]            ;next output row
    lea         rsi, [rsi + rax]            ;next source row
    dec         rcx                         ;one row fewer; sets ZF for the caller
%endm
59
;-----------------------------------------------------------------------
; GET_PARAM
; Shared setup for the 8- and 16-pixel-wide kernels.
;
; Reads the arguments through arg(n) and leaves:
;   rsi  = src_ptr (arg 0)          rax = pixels_per_line (arg 1)
;   rdi  = output_ptr (arg 2)       rdx = out_pitch (arg 3)
;   rcx  = output_height (arg 4)
;   xmm4 = rounding constant, 64 in every word
;   xmm5 = zero (used to widen bytes to words)
;   xmm6 = filter word 3 (k3) in all eight words
;   xmm7 = filter word 4 (k4) in all eight words
; The filter table (arg 5) is read with movdqa, so it has to be
; 16-byte aligned.
;-----------------------------------------------------------------------
%macro GET_PARAM 0
    mov         rsi, arg(0)                 ;rsi = src_ptr
    mov         rdi, arg(2)                 ;rdi = output_ptr
    mov         rdx, arg(5)                 ;rdx = filter ptr (temporary)
    mov         rcx, 0x00400040             ;two words of 64 (temporary)

    pxor        xmm5, xmm5                  ;xmm5 = 0

    movdqa      xmm7, [rdx]                 ;k0..k7
    pshuflw     xmm6, xmm7, 0xff            ;xmm6 words 0-3 = k3
    pshufhw     xmm7, xmm7, 0x00            ;xmm7 words 4-7 = k4
    punpcklqdq  xmm6, xmm6                  ;k3 in all eight words
    punpckhqdq  xmm7, xmm7                  ;k4 in all eight words

    movq        xmm4, rcx                   ;low dword = 0x00400040
    pshufd      xmm4, xmm4, 0x00            ;broadcast -> 64 in all eight words

    ;rdx and rcx are recycled below, so these loads must stay last
    movsxd      rax, DWORD PTR arg(1)       ;rax = pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;rdx = out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;rcx = output_height
%endm
82
;-----------------------------------------------------------------------
; APPLY_FILTER_8 avg
; Filters one 8-pixel row and steps to the next one.
;
; In:    xmm0 = first-tap pixels in its low 8 bytes
;        xmm1 = second-tap pixels in its low 8 bytes
;        xmm4 = rounding words, xmm5 = zero, xmm6 = k3 x8, xmm7 = k4 x8
;        rsi/rdi = source/output pointers, rax/rdx = their strides,
;        rcx = rows left
; %1:    non-zero -> average the result with the 8 bytes already at [rdi]
; Out:   8 bytes stored at [rdi]; rsi += rax, rdi += rdx, rcx -= 1.
;        ZF comes from the final dec, so the caller can branch on it
;        directly after the macro.
; Clobb: xmm0, xmm1
;-----------------------------------------------------------------------
%macro APPLY_FILTER_8 1
    punpcklbw   xmm0, xmm5                  ;first tap: bytes -> words
    pmullw      xmm0, xmm6                  ;  * k3

    punpcklbw   xmm1, xmm5                  ;second tap: bytes -> words
    pmullw      xmm1, xmm7                  ;  * k4

    paddsw      xmm0, xmm1                  ;sum of both taps
    paddsw      xmm0, xmm4                  ;add the rounding term
    psraw       xmm0, 7                     ;scale back down
    packuswb    xmm0, xmm0                  ;saturate to unsigned bytes

%if %1
    movq        xmm1, [rdi]                 ;current destination pixels
    pavgb       xmm0, xmm1                  ;rounded average with them
%endif
    movq        [rdi], xmm0                 ;write 8 output pixels

    lea         rdi, [rdi + rdx]            ;next output row
    lea         rsi, [rsi + rax]            ;next source row
    dec         rcx                         ;one row fewer; sets ZF for the caller
%endm
103
;-----------------------------------------------------------------------
; APPLY_FILTER_16 avg
; Filters one 16-pixel row and steps to the next one.
;
; In:    xmm0 and xmm2 = the same 16 first-tap pixels
;        xmm1 and xmm3 = the same 16 second-tap pixels
;        xmm4 = rounding words, xmm5 = zero, xmm6 = k3 x8, xmm7 = k4 x8
;        rsi/rdi = source/output pointers, rax/rdx = their strides,
;        rcx = rows left
; %1:    non-zero -> average the result with the 16 bytes already at [rdi]
; Out:   16 bytes stored at [rdi] (unaligned store); rsi += rax,
;        rdi += rdx, rcx -= 1. ZF comes from the final dec, so the
;        caller can branch on it directly after the macro.
; Clobb: xmm0, xmm1, xmm2, xmm3
;-----------------------------------------------------------------------
%macro APPLY_FILTER_16 1
    ;pixels 0-7
    punpcklbw   xmm0, xmm5                  ;first tap, low half -> words
    punpcklbw   xmm1, xmm5                  ;second tap, low half -> words
    pmullw      xmm0, xmm6                  ;  * k3
    pmullw      xmm1, xmm7                  ;  * k4
    paddsw      xmm0, xmm1                  ;sum of both taps
    paddsw      xmm0, xmm4                  ;add the rounding term
    psraw       xmm0, 7                     ;scale back down

    ;pixels 8-15
    punpckhbw   xmm2, xmm5                  ;first tap, high half -> words
    punpckhbw   xmm3, xmm5                  ;second tap, high half -> words
    pmullw      xmm2, xmm6                  ;  * k3
    pmullw      xmm3, xmm7                  ;  * k4
    paddsw      xmm2, xmm3                  ;sum of both taps
    paddsw      xmm2, xmm4                  ;add the rounding term
    psraw       xmm2, 7                     ;scale back down

    packuswb    xmm0, xmm2                  ;16 saturated unsigned bytes

%if %1
    movdqu      xmm1, [rdi]                 ;current destination pixels
    pavgb       xmm0, xmm1                  ;rounded average with them
%endif
    movdqu      [rdi], xmm0                 ;write 16 output pixels

    lea         rdi, [rdi + rdx]            ;next output row
    lea         rsi, [rsi + rax]            ;next source row
    dec         rcx                         ;one row fewer; sets ZF for the caller
%endm
133
;-----------------------------------------------------------------------
; vp9_filter_block1d4_v2_sse2
; Two-tap vertical filter, 4 pixels wide: every output row combines a
; source row with the row pixels_per_line bytes after it.
; Args (fetched by GET_PARAM_4): 0 src_ptr, 1 pixels_per_line,
;   2 output_ptr, 3 out_pitch, 4 output_height, 5 filter ptr
; The row counter is only tested after a row has been written, so
; output_height has to be at least 1.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d4_v2_sse2) PRIVATE
sym(vp9_filter_block1d4_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.next_row:
    movd        xmm1, [rsi + rax]           ;4 pixels of the following row
    movd        xmm0, [rsi]                 ;4 pixels of the current row

    APPLY_FILTER_4 0                        ;filter and store, no averaging
    jnz         .next_row                   ;ZF set once rcx hits zero

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
157
;-----------------------------------------------------------------------
; vp9_filter_block1d8_v2_sse2
; Two-tap vertical filter, 8 pixels wide: every output row combines a
; source row with the row pixels_per_line bytes after it.
; Args (fetched by GET_PARAM): 0 src_ptr, 1 pixels_per_line,
;   2 output_ptr, 3 out_pitch, 4 output_height, 5 filter ptr
; The row counter is only tested after a row has been written, so
; output_height has to be at least 1.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d8_v2_sse2) PRIVATE
sym(vp9_filter_block1d8_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7                              ;GET_PARAM overwrites xmm6/xmm7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movq        xmm1, [rsi + rax]           ;8 pixels of the following row
    movq        xmm0, [rsi]                 ;8 pixels of the current row

    APPLY_FILTER_8 0                        ;filter and store, no averaging
    jnz         .next_row                   ;ZF set once rcx hits zero

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
183
;-----------------------------------------------------------------------
; vp9_filter_block1d16_v2_sse2
; Two-tap vertical filter, 16 pixels wide: every output row combines a
; source row with the row pixels_per_line bytes after it.
; Args (fetched by GET_PARAM): 0 src_ptr, 1 pixels_per_line,
;   2 output_ptr, 3 out_pitch, 4 output_height, 5 filter ptr
; Source rows are read with unaligned loads. The row counter is only
; tested after a row has been written, so output_height has to be at
; least 1.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d16_v2_sse2) PRIVATE
sym(vp9_filter_block1d16_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7                              ;GET_PARAM overwrites xmm6/xmm7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movdqu      xmm0, [rsi]                 ;16 pixels of the current row
    movdqa      xmm2, xmm0                  ;copy for the high-half unpack
    movdqu      xmm1, [rsi + rax]           ;16 pixels of the following row
    movdqa      xmm3, xmm1                  ;copy for the high-half unpack

    APPLY_FILTER_16 0                       ;filter and store, no averaging
    jnz         .next_row                   ;ZF set once rcx hits zero

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
211
;-----------------------------------------------------------------------
; vp9_filter_block1d4_v2_avg_sse2
; Two-tap vertical filter, 4 pixels wide, averaging variant: the
; filtered row is combined with the bytes already at output_ptr
; (pavgb) before it is stored.
; Args (fetched by GET_PARAM_4): 0 src_ptr, 1 pixels_per_line,
;   2 output_ptr, 3 out_pitch, 4 output_height, 5 filter ptr
; The row counter is only tested after a row has been written, so
; output_height has to be at least 1.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d4_v2_avg_sse2) PRIVATE
sym(vp9_filter_block1d4_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.next_row:
    movd        xmm1, [rsi + rax]           ;4 pixels of the following row
    movd        xmm0, [rsi]                 ;4 pixels of the current row

    APPLY_FILTER_4 1                        ;filter, average with dst, store
    jnz         .next_row                   ;ZF set once rcx hits zero

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
235
;-----------------------------------------------------------------------
; vp9_filter_block1d8_v2_avg_sse2
; Two-tap vertical filter, 8 pixels wide, averaging variant: the
; filtered row is combined with the bytes already at output_ptr
; (pavgb) before it is stored.
; Args (fetched by GET_PARAM): 0 src_ptr, 1 pixels_per_line,
;   2 output_ptr, 3 out_pitch, 4 output_height, 5 filter ptr
; The row counter is only tested after a row has been written, so
; output_height has to be at least 1.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d8_v2_avg_sse2) PRIVATE
sym(vp9_filter_block1d8_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7                              ;GET_PARAM overwrites xmm6/xmm7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movq        xmm1, [rsi + rax]           ;8 pixels of the following row
    movq        xmm0, [rsi]                 ;8 pixels of the current row

    APPLY_FILTER_8 1                        ;filter, average with dst, store
    jnz         .next_row                   ;ZF set once rcx hits zero

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
261
;-----------------------------------------------------------------------
; vp9_filter_block1d16_v2_avg_sse2
; Two-tap vertical filter, 16 pixels wide, averaging variant: the
; filtered row is combined with the bytes already at output_ptr
; (pavgb) before it is stored.
; Args (fetched by GET_PARAM): 0 src_ptr, 1 pixels_per_line,
;   2 output_ptr, 3 out_pitch, 4 output_height, 5 filter ptr
; Source rows are read with unaligned loads. The row counter is only
; tested after a row has been written, so output_height has to be at
; least 1.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d16_v2_avg_sse2) PRIVATE
sym(vp9_filter_block1d16_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7                              ;GET_PARAM overwrites xmm6/xmm7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movdqu      xmm0, [rsi]                 ;16 pixels of the current row
    movdqa      xmm2, xmm0                  ;copy for the high-half unpack
    movdqu      xmm1, [rsi + rax]           ;16 pixels of the following row
    movdqa      xmm3, xmm1                  ;copy for the high-half unpack

    APPLY_FILTER_16 1                       ;filter, average with dst, store
    jnz         .next_row                   ;ZF set once rcx hits zero

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
289
;-----------------------------------------------------------------------
; vp9_filter_block1d4_h2_sse2
; Two-tap horizontal filter, 4 pixels wide: every output pixel combines
; a source pixel with the pixel one byte to its right.
; Args (fetched by GET_PARAM_4): 0 src_ptr, 1 pixels_per_line,
;   2 output_ptr, 3 out_pitch, 4 output_height, 5 filter ptr
; Reads exactly 5 source bytes per row ([rsi] .. [rsi + 4]).
; The row counter is only tested after a row has been written, so
; output_height has to be at least 1.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d4_h2_sse2) PRIVATE
sym(vp9_filter_block1d4_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.next_row:
    ;APPLY_FILTER_4 consumes only the low 4 bytes of xmm0 and xmm1, so
    ;two 4-byte loads are enough and never touch memory beyond the
    ;5 bytes the row actually needs
    movd        xmm0, [rsi]                 ;pixels x   .. x+3
    movd        xmm1, [rsi + 1]             ;pixels x+1 .. x+4

    APPLY_FILTER_4 0                        ;filter and store, no averaging
    jnz         .next_row                   ;ZF set once rcx hits zero

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
314
;-----------------------------------------------------------------------
; vp9_filter_block1d8_h2_sse2
; Two-tap horizontal filter, 8 pixels wide: every output pixel combines
; a source pixel with the pixel one byte to its right.
; Args (fetched by GET_PARAM): 0 src_ptr, 1 pixels_per_line,
;   2 output_ptr, 3 out_pitch, 4 output_height, 5 filter ptr
; Reads exactly 9 source bytes per row ([rsi] .. [rsi + 8]).
; The row counter is only tested after a row has been written, so
; output_height has to be at least 1.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d8_h2_sse2) PRIVATE
sym(vp9_filter_block1d8_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7                              ;GET_PARAM overwrites xmm6/xmm7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    ;APPLY_FILTER_8 consumes only the low 8 bytes of xmm0 and xmm1, so
    ;two 8-byte loads are enough and never touch memory beyond the
    ;9 bytes the row actually needs
    movq        xmm0, [rsi]                 ;pixels x   .. x+7
    movq        xmm1, [rsi + 1]             ;pixels x+1 .. x+8

    APPLY_FILTER_8 0                        ;filter and store, no averaging
    jnz         .next_row                   ;ZF set once rcx hits zero

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
341
;-----------------------------------------------------------------------
; vp9_filter_block1d16_h2_sse2
; Two-tap horizontal filter, 16 pixels wide: every output pixel
; combines a source pixel with the pixel one byte to its right.
; Args (fetched by GET_PARAM): 0 src_ptr, 1 pixels_per_line,
;   2 output_ptr, 3 out_pitch, 4 output_height, 5 filter ptr
; Reads 17 source bytes per row ([rsi] .. [rsi + 16]) with unaligned
; loads. The row counter is only tested after a row has been written,
; so output_height has to be at least 1.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d16_h2_sse2) PRIVATE
sym(vp9_filter_block1d16_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7                              ;GET_PARAM overwrites xmm6/xmm7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movdqu      xmm0, [rsi]                 ;pixels x   .. x+15
    movdqa      xmm2, xmm0                  ;copy for the high-half unpack
    movdqu      xmm1, [rsi + 1]             ;pixels x+1 .. x+16
    movdqa      xmm3, xmm1                  ;copy for the high-half unpack

    APPLY_FILTER_16 0                       ;filter and store, no averaging
    jnz         .next_row                   ;ZF set once rcx hits zero

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
369
;-----------------------------------------------------------------------
; vp9_filter_block1d4_h2_avg_sse2
; Two-tap horizontal filter, 4 pixels wide, averaging variant: the
; filtered row is combined with the bytes already at output_ptr
; (pavgb) before it is stored.
; Args (fetched by GET_PARAM_4): 0 src_ptr, 1 pixels_per_line,
;   2 output_ptr, 3 out_pitch, 4 output_height, 5 filter ptr
; Reads exactly 5 source bytes per row ([rsi] .. [rsi + 4]).
; The row counter is only tested after a row has been written, so
; output_height has to be at least 1.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d4_h2_avg_sse2) PRIVATE
sym(vp9_filter_block1d4_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.next_row:
    ;APPLY_FILTER_4 consumes only the low 4 bytes of xmm0 and xmm1, so
    ;two 4-byte loads are enough and never touch memory beyond the
    ;5 bytes the row actually needs
    movd        xmm0, [rsi]                 ;pixels x   .. x+3
    movd        xmm1, [rsi + 1]             ;pixels x+1 .. x+4

    APPLY_FILTER_4 1                        ;filter, average with dst, store
    jnz         .next_row                   ;ZF set once rcx hits zero

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
394
;-----------------------------------------------------------------------
; vp9_filter_block1d8_h2_avg_sse2
; Two-tap horizontal filter, 8 pixels wide, averaging variant: the
; filtered row is combined with the bytes already at output_ptr
; (pavgb) before it is stored.
; Args (fetched by GET_PARAM): 0 src_ptr, 1 pixels_per_line,
;   2 output_ptr, 3 out_pitch, 4 output_height, 5 filter ptr
; Reads exactly 9 source bytes per row ([rsi] .. [rsi + 8]).
; The row counter is only tested after a row has been written, so
; output_height has to be at least 1.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d8_h2_avg_sse2) PRIVATE
sym(vp9_filter_block1d8_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7                              ;GET_PARAM overwrites xmm6/xmm7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    ;APPLY_FILTER_8 consumes only the low 8 bytes of xmm0 and xmm1, so
    ;two 8-byte loads are enough and never touch memory beyond the
    ;9 bytes the row actually needs
    movq        xmm0, [rsi]                 ;pixels x   .. x+7
    movq        xmm1, [rsi + 1]             ;pixels x+1 .. x+8

    APPLY_FILTER_8 1                        ;filter, average with dst, store
    jnz         .next_row                   ;ZF set once rcx hits zero

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
421
;-----------------------------------------------------------------------
; vp9_filter_block1d16_h2_avg_sse2
; Two-tap horizontal filter, 16 pixels wide, averaging variant: the
; filtered row is combined with the bytes already at output_ptr
; (pavgb) before it is stored.
; Args (fetched by GET_PARAM): 0 src_ptr, 1 pixels_per_line,
;   2 output_ptr, 3 out_pitch, 4 output_height, 5 filter ptr
; Reads 17 source bytes per row ([rsi] .. [rsi + 16]) with unaligned
; loads. The row counter is only tested after a row has been written,
; so output_height has to be at least 1.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d16_h2_avg_sse2) PRIVATE
sym(vp9_filter_block1d16_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7                              ;GET_PARAM overwrites xmm6/xmm7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movdqu      xmm0, [rsi]                 ;pixels x   .. x+15
    movdqa      xmm2, xmm0                  ;copy for the high-half unpack
    movdqu      xmm1, [rsi + 1]             ;pixels x+1 .. x+16
    movdqa      xmm3, xmm1                  ;copy for the high-half unpack

    APPLY_FILTER_16 1                       ;filter, average with dst, store
    jnz         .next_row                   ;ZF set once rcx hits zero

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
449