;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Bilinear tap table, defined outside this file. The bilinear predictors
; below index it with a 32-byte stride (offset << 5) and read the two taps
; of an entry at byte offsets 0 and 16.
extern sym(vp8_bilinear_filters_x86_8)


; BLOCK_HEIGHT_WIDTH and vp8_filter_weight are not referenced by the code
; visible in this file.
%define BLOCK_HEIGHT_WIDTH 4
%define vp8_filter_weight 128

; Arithmetic right shift applied to every filter sum after the rounding
; constant rd (0x40 per 16-bit lane) has been added.
%define VP8_FILTER_SHIFT  7


;void vp8_filter_block1d_h6_mmx
;(
;    unsigned char   *src_ptr,
;    unsigned short  *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           * vp8_filter
;)
;
; Horizontal six-tap filter pass. Every row produces four 16-bit results
;     out[i] = clamp_0_255((sum(k = 0..5) tap[k] * src[i - 2 + k] + 64) >> 7)
; for i = 0..3, accumulated in 16-bit lanes with saturating adds. A row
; therefore reads src_ptr[-2] .. src_ptr[6].
;
; Argument usage as it appears in the code:
;   vp8_filter           six taps, one every 16 bytes; each tap is loaded as
;                        four 16-bit lanes that multiply four neighbouring
;                        pixels, so the lanes are expected to be identical
;   src_pixels_per_line  added to the source pointer after every row
;   output_width         added to output_ptr, as a byte count, after every row
;   output_height        row count; it is tested only after a row has been
;                        written, so it is expected to be at least 1
;   pixel_step           not read
;
; Register roles inside the loop:
;   rsi = source row      rdi = destination row     rcx = rows remaining
;   rax = output stride   rdx = filter base         mm0 = 0
;   mm1/mm2/mm6/mm7 = taps 1/2/3/4 (taps 0 and 5 are read from memory)
;   r8  = source stride (64-bit builds only)
global sym(vp8_filter_block1d_h6_mmx) PRIVATE
sym(vp8_filter_block1d_h6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdx,    arg(6) ;vp8_filter

        movq        mm1,    [rdx + 16]       ; mm1 = tap 1
        movq        mm2,    [rdx + 32]       ; mm2 = tap 2
        movq        mm6,    [rdx + 48]       ; mm6 = tap 3
        movq        mm7,    [rdx + 64]       ; mm7 = tap 4

        mov         rdi,    arg(1) ;output_ptr
        mov         rsi,    arg(0) ;src_ptr
        movsxd      rcx,    dword ptr arg(4) ;output_height
        movsxd      rax,    dword ptr arg(5) ;output_width, used as the output stride
        pxor        mm0,    mm0              ; mm0 = 0, used to widen bytes to words

%if ABI_IS_32BIT
        ; no register is set aside for the source stride here; the loop adds
        ; it straight from the argument slot
%else
        ; the source stride never changes, so load it once instead of once
        ; per row
        movsxd      r8,     dword ptr arg(2) ;src_pixels_per_line
%endif

.nextrow:
        movq        mm3,    [rsi-2]          ; mm3 = bytes p-2..p5
        movq        mm4,    mm3              ; mm4 = bytes p-2..p5
        psrlq       mm3,    8                ; mm3 = bytes p-1..p5
        punpcklbw   mm3,    mm0              ; mm3 = words p-1..p2
        pmullw      mm3,    mm1              ; mm3 = tap 1 products

        movq        mm5,    mm4              ; mm5 = bytes p-2..p5
        punpckhbw   mm4,    mm0              ; mm4 = words p2..p5
        pmullw      mm4,    mm7              ; mm4 = tap 4 products
        paddsw      mm3,    mm4              ; mm3 += mm4

        movq        mm4,    mm5              ; mm4 = bytes p-2..p5
        psrlq       mm5,    16               ; mm5 = bytes p0..p5
        punpcklbw   mm5,    mm0              ; mm5 = words p0..p3
        pmullw      mm5,    mm2              ; mm5 = tap 2 products
        paddsw      mm3,    mm5              ; mm3 += mm5

        movq        mm5,    mm4              ; mm5 = bytes p-2..p5
        psrlq       mm4,    24               ; mm4 = bytes p1..p5
        punpcklbw   mm4,    mm0              ; mm4 = words p1..p4
        pmullw      mm4,    mm6              ; mm4 = tap 3 products
        paddsw      mm3,    mm4              ; mm3 += mm4

        ; outer taps, whose coefficients are read from memory
        movd        mm4,    [rsi+3]          ; mm4 = bytes p3..p6
        punpcklbw   mm4,    mm0              ; mm4 = words p3..p6
        pmullw      mm4,    [rdx+80]         ; mm4 = tap 5 products
        paddsw      mm3,    mm4              ; mm3 += mm4

        punpcklbw   mm5,    mm0              ; mm5 = words p-2..p1
        pmullw      mm5,    [rdx]            ; mm5 = tap 0 products
        paddsw      mm3,    mm5              ; mm3 += mm5

        paddsw      mm3,    [GLOBAL(rd)]     ; mm3 += rounding constant
        psraw       mm3,    VP8_FILTER_SHIFT ; mm3 >>= 7
        packuswb    mm3,    mm0              ; pack to bytes to clamp to 0..255 ...
        punpcklbw   mm3,    mm0              ; ... then widen back to four words

        movq        [rdi],  mm3              ; store four 16-bit results

%if ABI_IS_32BIT
        add         rsi,    dword ptr arg(2) ;src_pixels_per_line ; next line
%else
        add         rsi,    r8               ; next line
%endif
        add         rdi,    rax              ; next output row

        dec         rcx                      ; decrement count
        jnz         .nextrow                 ; next row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1dc_v6_mmx
;(
;   short *src_ptr,
;   unsigned char *output_ptr,
;    int output_pitch,
;   unsigned int pixels_per_line,
;   unsigned int pixel_step,
;   unsigned int output_height,
;   unsigned int output_width,
;   short * vp8_filter
;)
;
; Vertical six-tap filter pass over 16-bit input. Every output row holds four
; bytes
;     out[i] = clamp_0_255((sum(k = 0..5) tap[k] * row[k - 2][i] + 64) >> 7)
; where row[r] is the four-word group at src_ptr + r * pixels_per_line, so
; rows -2 .. +3 around the current one are read. Sums are accumulated in
; 16-bit lanes with saturating adds.
;
; Argument usage as it appears in the code:
;   vp8_filter        six taps, one every 16 bytes, each read as four words
;   pixels_per_line   added to the source address as a byte offset
;   output_pitch      added to output_ptr after every row
;   output_height     row count; tested only after a row has been written,
;                     so it is expected to be at least 1
;   pixel_step, output_width   not read
;
; Register roles inside the loop:
;   rsi = address of row -2   rdx = source stride   rdi = destination row
;   rax = output pitch        rcx = rows remaining  rbx = filter base
;   mm0 = 0, mm5 = rounding constant, mm1/mm2/mm6/mm7 = taps 1/2/3/4
global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
sym(vp8_filter_block1dc_v6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        ; rd is fetched through GLOBAL() before rbx is repurposed below
        movq      mm5, [GLOBAL(rd)]
        push        rbx
        mov         rbx, arg(7) ;vp8_filter
        movq      mm1, [rbx + 16]         ; mm1 = tap 1
        movq      mm2, [rbx + 32]         ; mm2 = tap 2
        movq      mm6, [rbx + 48]         ; mm6 = tap 3
        movq      mm7, [rbx + 64]         ; mm7 = tap 4

        movsxd      rdx, dword ptr arg(3) ;pixels_per_line
        mov         rdi, arg(1) ;output_ptr
        mov         rsi, arg(0) ;src_ptr
        sub         rsi, rdx              ; step back two rows so that
        sub         rsi, rdx              ; rsi addresses row -2
        movsxd      rcx, DWORD PTR arg(5) ;output_height
        movsxd      rax, DWORD PTR arg(2) ;output_pitch
        pxor        mm0, mm0              ; mm0 = 0


.nextrow_cv:
        movq        mm3, [rsi+rdx]        ; mm3 = four words of row -1
        pmullw      mm3, mm1              ; mm3 = tap 1 products


        movq        mm4, [rsi + 4*rdx]    ; mm4 = four words of row 2
        pmullw      mm4, mm7              ; mm4 = tap 4 products
        paddsw      mm3, mm4              ; mm3 += mm4

        movq        mm4, [rsi + 2*rdx]    ; mm4 = four words of row 0
        pmullw      mm4, mm2              ; mm4 = tap 2 products
        paddsw      mm3, mm4              ; mm3 += mm4

        movq        mm4, [rsi]            ; mm4 = four words of row -2
        pmullw      mm4, [rbx]            ; mm4 = tap 0 products
        paddsw      mm3, mm4              ; mm3 += mm4


        add         rsi, rdx              ; move source forward 1 line to avoid 3 * pitch
        movq        mm4, [rsi + 2*rdx]    ; mm4 = four words of row 1
        pmullw      mm4, mm6              ; mm4 = tap 3 products
        paddsw      mm3, mm4              ; mm3 += mm4

        movq        mm4, [rsi + 4*rdx]    ; mm4 = four words of row 3
        pmullw      mm4, [rbx +80]        ; mm4 = tap 5 products
        paddsw      mm3, mm4              ; mm3 += mm4


        paddsw      mm3, mm5              ; mm3 += rounding constant
        psraw       mm3, VP8_FILTER_SHIFT ; mm3 >>= 7
        packuswb    mm3, mm0              ; pack to bytes, clamping to 0..255

        movd        [rdi],mm3             ; store four output bytes
        ; the subsequent iterations repeat most of these row reads.  Since the
        ; block should be in cache this shouldn't cost much, but it is
        ; avoidable.
        lea         rdi,  [rdi+rax]       ; next output row
        dec         rcx                   ; decrement count
        jnz         .nextrow_cv           ; next row

        pop         rbx

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_bilinear_predict8x4_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction of a block 8 pixels wide and 4 rows high.
;   horizontal:  h[r][i]   = (src[r][i] * HFilter[0] + src[r][i+1] * HFilter[1] + 64) >> 7
;   vertical:    dst[r][i] = (h[r][i]   * VFilter[0] + h[r+1][i]   * VFilter[1] + 64) >> 7
; HFilter / VFilter are the entries of vp8_bilinear_filters_x86_8 at byte
; offset xoffset * 32 / yoffset * 32; the two taps of an entry sit at byte
; offsets 0 and 16 and are read as four 16-bit lanes each.
;
; Five source rows are read (one ahead of the loop, one per output row) and
; nine bytes of each are touched (8 at [rsi], 8 at [rsi+1]).
;
; Register roles inside the loop:
;   rsi = source row        rdx = source stride    rdi = destination row
;   rcx = dst_ptr + 4 * dst_pitch (end marker)     rax = VFilter
;   mm0 = 0, mm1/mm2 = HFilter taps, mm7 = previous row after the
;   horizontal pass, packed to bytes
;   r8  = dst_pitch (64-bit builds only)
global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
sym(vp8_bilinear_predict8x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

        movsxd      rax,        dword ptr arg(2) ;xoffset
        mov         rdi,        arg(4) ;dst_ptr

        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
        shl         rax,        5                   ; 32 bytes per table entry

        mov         rsi,        arg(0) ;src_ptr
        add         rax,        rcx                 ; rax = HFilter

        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
        movq        mm1,        [rax]               ; mm1 = HFilter tap 0

        movq        mm2,        [rax+16]            ; mm2 = HFilter tap 1
        movsxd      rax,        dword ptr arg(3) ;yoffset

        pxor        mm0,        mm0                 ; mm0 = 0
        shl         rax,        5                   ; 32 bytes per table entry

        add         rax,        rcx                 ; rax = VFilter
        lea         rcx,        [rdi+rdx*4]         ; rcx = dst_ptr + 4 * dst_pitch

        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

        ; horizontal pass over the first source row
        movq        mm3,        [rsi]               ; mm3 = bytes 0..7
        movq        mm4,        mm3                 ; make a copy of current line

        punpcklbw   mm3,        mm0                 ; mm3 = words 0..3
        punpckhbw   mm4,        mm0                 ; mm4 = words 4..7

        pmullw      mm3,        mm1                 ; * HFilter tap 0
        pmullw      mm4,        mm1                 ;

        movq        mm5,        [rsi+1]             ; mm5 = bytes 1..8
        movq        mm6,        mm5                 ;

        punpcklbw   mm5,        mm0                 ; mm5 = words 1..4
        punpckhbw   mm6,        mm0                 ; mm6 = words 5..8

        pmullw      mm5,        mm2                 ; * HFilter tap 1
        pmullw      mm6,        mm2                 ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;

        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        paddw       mm4,        [GLOBAL(rd)]        ;
        psraw       mm4,        VP8_FILTER_SHIFT    ;

        movq        mm7,        mm3                 ;
        packuswb    mm7,        mm4                 ; mm7 = filtered row as 8 bytes

        add         rsi,        rdx                 ; next line

%if ABI_IS_32BIT
        ; no register is set aside for dst_pitch here; the loop adds it
        ; straight from the argument slot
%else
        ; dst_pitch never changes, so load it once instead of once per row
        movsxd      r8,         dword ptr arg(5) ;dst_pitch
%endif

.next_row_8x4:
        ; horizontal pass over the current source row
        movq        mm3,        [rsi]               ; mm3 = bytes 0..7
        movq        mm4,        mm3                 ; make a copy of current line

        punpcklbw   mm3,        mm0                 ; mm3 = words 0..3
        punpckhbw   mm4,        mm0                 ; mm4 = words 4..7

        pmullw      mm3,        mm1                 ; * HFilter tap 0
        pmullw      mm4,        mm1                 ;

        movq        mm5,        [rsi+1]             ; mm5 = bytes 1..8
        movq        mm6,        mm5                 ;

        punpcklbw   mm5,        mm0                 ; mm5 = words 1..4
        punpckhbw   mm6,        mm0                 ; mm6 = words 5..8

        pmullw      mm5,        mm2                 ; * HFilter tap 1
        pmullw      mm6,        mm2                 ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;

        ; previous filtered row * VFilter tap 0
        movq        mm5,        mm7                 ;
        movq        mm6,        mm7                 ;

        punpcklbw   mm5,        mm0                 ;
        punpckhbw   mm6,        mm0

        pmullw      mm5,        [rax]               ;
        pmullw      mm6,        [rax]               ;

        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        paddw       mm4,        [GLOBAL(rd)]        ;
        psraw       mm4,        VP8_FILTER_SHIFT    ;

        movq        mm7,        mm3                 ; keep this row for the
        packuswb    mm7,        mm4                 ; next iteration


        ; current filtered row * VFilter tap 1, plus the previous-row term
        pmullw      mm3,        [rax+16]            ;
        pmullw      mm4,        [rax+16]            ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;


        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        paddw       mm4,        [GLOBAL(rd)]        ;
        psraw       mm4,        VP8_FILTER_SHIFT    ;

        packuswb    mm3,        mm4

        movq        [rdi],      mm3                 ; store eight output bytes

%if ABI_IS_32BIT
        add         rsi,        rdx                 ; next line
        add         rdi,        dword ptr arg(5) ;dst_pitch
%else
        add         rsi,        rdx                 ; next line
        add         rdi,        r8
%endif
        cmp         rdi,        rcx                 ; done once rdi reaches the end marker
        jne         .next_row_8x4

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_bilinear_predict4x4_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction of a 4x4 block.
;   horizontal:  h[r][i]   = (src[r][i] * HFilter[0] + src[r][i+1] * HFilter[1] + 64) >> 7
;   vertical:    dst[r][i] = (h[r][i]   * VFilter[0] + h[r+1][i]   * VFilter[1] + 64) >> 7
; HFilter / VFilter are the entries of vp8_bilinear_filters_x86_8 at byte
; offset xoffset * 32 / yoffset * 32; the two taps of an entry sit at byte
; offsets 0 and 16 and are read as four 16-bit lanes each.
;
; Five source rows are read (one ahead of the loop, one per output row) and
; five bytes of each are touched (4 at [rsi], 4 at [rsi+1]).
;
; Register roles inside the loop:
;   rsi = source row        rdx = source stride    rdi = destination row
;   rcx = dst_ptr + 4 * dst_pitch (end marker)     rax = VFilter
;   mm0 = 0, mm1/mm2 = HFilter taps, mm7 = previous row after the
;   horizontal pass, packed to bytes
;   r8  = dst_pitch (64-bit builds only)
global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
sym(vp8_bilinear_predict4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

        movsxd      rax,        dword ptr arg(2) ;xoffset
        mov         rdi,        arg(4) ;dst_ptr

        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
        shl         rax,        5                   ; 32 bytes per table entry

        add         rax,        rcx ; HFilter
        mov         rsi,        arg(0) ;src_ptr

        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
        movq        mm1,        [rax]               ; mm1 = HFilter tap 0

        movq        mm2,        [rax+16]            ; mm2 = HFilter tap 1
        movsxd      rax,        dword ptr arg(3) ;yoffset

        pxor        mm0,        mm0                 ; mm0 = 0
        shl         rax,        5                   ; 32 bytes per table entry

        add         rax,        rcx                 ; rax = VFilter
        lea         rcx,        [rdi+rdx*4]         ; rcx = dst_ptr + 4 * dst_pitch

        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

        ; horizontal pass over the first source row
        movd        mm3,        [rsi]               ; mm3 = bytes 0..3
        punpcklbw   mm3,        mm0                 ; mm3 = words 0..3

        pmullw      mm3,        mm1                 ; * HFilter tap 0
        movd        mm5,        [rsi+1]             ; mm5 = bytes 1..4

        punpcklbw   mm5,        mm0                 ; mm5 = words 1..4
        pmullw      mm5,        mm2                 ; * HFilter tap 1

        paddw       mm3,        mm5                 ;
        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value

        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        movq        mm7,        mm3                 ;
        packuswb    mm7,        mm0                 ; mm7 = filtered row as 4 bytes

        add         rsi,        rdx                 ; next line

%if ABI_IS_32BIT
        ; no register is set aside for dst_pitch here; the loop adds it
        ; straight from the argument slot
%else
        ; dst_pitch never changes, so load it once instead of once per row
        movsxd      r8,         dword ptr arg(5) ;dst_pitch
%endif

.next_row_4x4:
        ; horizontal pass over the current source row
        movd        mm3,        [rsi]               ; mm3 = bytes 0..3
        punpcklbw   mm3,        mm0                 ; mm3 = words 0..3

        pmullw      mm3,        mm1                 ; * HFilter tap 0
        movd        mm5,        [rsi+1]             ; mm5 = bytes 1..4

        punpcklbw   mm5,        mm0                 ; mm5 = words 1..4
        pmullw      mm5,        mm2                 ; * HFilter tap 1

        paddw       mm3,        mm5                 ;

        ; previous filtered row * VFilter tap 0
        movq        mm5,        mm7                 ;
        punpcklbw   mm5,        mm0                 ;

        pmullw      mm5,        [rax]               ;
        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value

        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7
        movq        mm7,        mm3                 ; keep this row for the

        packuswb    mm7,        mm0                 ; next iteration

        ; current filtered row * VFilter tap 1, plus the previous-row term
        pmullw      mm3,        [rax+16]            ;
        paddw       mm3,        mm5                 ;


        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        packuswb    mm3,        mm0
        movd        [rdi],      mm3                 ; store four output bytes

%if ABI_IS_32BIT
        add         rsi,        rdx                 ; next line
        add         rdi,        dword ptr arg(5) ;dst_pitch
%else
        add         rsi,        rdx                 ; next line
        add         rdi,        r8
%endif

        cmp         rdi,        rcx                 ; done once rdi reaches the end marker
        jne         .next_row_4x4

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret



SECTION_RODATA

; Rounding constant added before the VP8_FILTER_SHIFT right shift: 0x40 in
; each of four 16-bit lanes (8 bytes, read with movq).
align 16
rd:
    times 4 dw 0x40

; Six-tap filter table: eight filters of six taps. Each tap is replicated
; across eight 16-bit lanes (16 bytes), so one filter occupies 96 bytes and
; consecutive taps are 16 bytes apart, which is the spacing the six-tap
; routines in this file use when they index their vp8_filter argument.
; The taps of every filter add up to 128.
align 16
global HIDDEN_DATA(sym(vp8_six_tap_x86))
sym(vp8_six_tap_x86):
    ; filter 0:  0, 0, 128, 0, 0, 0
    times 8 dw 0
    times 8 dw 0
    times 8 dw 128
    times 8 dw 0
    times 8 dw 0
    times 8 dw 0

    ; filter 1:  0, -6, 123, 12, -1, 0
    times 8 dw 0
    times 8 dw -6
    times 8 dw 123
    times 8 dw 12
    times 8 dw -1
    times 8 dw 0

    ; filter 2:  2, -11, 108, 36, -8, 1
    times 8 dw 2
    times 8 dw -11
    times 8 dw 108
    times 8 dw 36
    times 8 dw -8
    times 8 dw 1

    ; filter 3:  0, -9, 93, 50, -6, 0
    times 8 dw 0
    times 8 dw -9
    times 8 dw 93
    times 8 dw 50
    times 8 dw -6
    times 8 dw 0

    ; filter 4:  3, -16, 77, 77, -16, 3
    times 8 dw 3
    times 8 dw -16
    times 8 dw 77
    times 8 dw 77
    times 8 dw -16
    times 8 dw 3

    ; filter 5:  0, -6, 50, 93, -9, 0
    times 8 dw 0
    times 8 dw -6
    times 8 dw 50
    times 8 dw 93
    times 8 dw -9
    times 8 dw 0

    ; filter 6:  1, -8, 36, 108, -11, 2
    times 8 dw 1
    times 8 dw -8
    times 8 dw 36
    times 8 dw 108
    times 8 dw -11
    times 8 dw 2

    ; filter 7:  0, -1, 12, 123, -6, 0
    times 8 dw 0
    times 8 dw -1
    times 8 dw 12
    times 8 dw 123
    times 8 dw -6
    times 8 dw 0


