1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
;void vp8_copy_mem16x16_sse2(
;    unsigned char *src,
;    int src_stride,
;    unsigned char *dst,
;    int dst_stride
;    )
;
; Copies a 16x16 byte block from src to dst.
; Source rows are read with unaligned loads (movdqu); destination rows are
; written with aligned stores (movdqa), so each destination row address has
; to be 16-byte aligned.
global sym(vp8_copy_mem16x16_sse2) PRIVATE
sym(vp8_copy_mem16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)              ; rsi = src
    mov         rdi,        arg(2)              ; rdi = dst
    movsxd      rax,        dword ptr arg(1)    ; rax = src_stride
    movsxd      rcx,        dword ptr arg(3)    ; rcx = dst_stride

    ; rows 0..14: five passes of three rows each. Within a pass all three
    ; loads are issued before the three stores; both pointers then move
    ; down by three rows.
%rep 5
    movdqu      xmm0,       [rsi]
    movdqu      xmm1,       [rsi+rax]
    movdqu      xmm2,       [rsi+rax*2]
    lea         rsi,        [rsi+rax*2]
    add         rsi,        rax

    movdqa      [rdi],      xmm0
    movdqa      [rdi+rcx],  xmm1
    movdqa      [rdi+rcx*2],xmm2
    lea         rdi,        [rdi+rcx*2]
    add         rdi,        rcx
%endrep

    ; row 15
    movdqu      xmm0,       [rsi]
    movdqa      [rdi],      xmm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
117
118
;void vp8_intra_pred_uv_dc_mmx2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; DC prediction for an 8x8 block. Every output byte is
;   (sum of above[0..7] + sum of 8 left-column bytes + 8) >> 4
global sym(vp8_intra_pred_uv_dc_mmx2) PRIVATE
sym(vp8_intra_pred_uv_dc_mmx2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ; ecx = sum of the left column (one byte per row, 8 rows)
    mov         rsi,        arg(3) ;left
    movsxd      rax,        dword ptr arg(4) ;left_stride
    lea         rdi,        [rax*3]             ; rdi = 3 * left_stride
    xor         ecx,        ecx
%rep 2
    movzx       edx,        byte [rsi]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    add         ecx,        edx
    lea         rsi,        [rsi+rax*4]         ; next group of four rows
%endrep

    ; mm1 (low word) = sum of the 8 bytes above
    mov         rsi,        arg(2) ;above
    pxor        mm0,        mm0
    movq        mm1,        [rsi]
    psadbw      mm1,        mm0

    ; combine, round, and replicate the DC value into all 8 bytes of mm1
    pextrw      edx,        mm1, 0x0
    lea         edx,        [edx+ecx+8]
    sar         edx,        4
    movd        mm1,        edx
    pshufw      mm1,        mm1, 0x0
    packuswb    mm1,        mm1

    ; write out 8 rows of 8 bytes
    mov         rdi,        arg(0) ;dst
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
    lea         rax,        [rcx*3]             ; rax = 3 * dst_stride

    movq        [rdi],          mm1
    movq        [rdi+rcx],      mm1
    movq        [rdi+rcx*2],    mm1
    movq        [rdi+rax],      mm1
    lea         rdi,            [rdi+rcx*4]
    movq        [rdi],          mm1
    movq        [rdi+rcx],      mm1
    movq        [rdi+rcx*2],    mm1
    movq        [rdi+rax],      mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
191
;void vp8_intra_pred_uv_dctop_mmx2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; DC prediction for an 8x8 block using only the row above:
;   every output byte is (sum of above[0..7] + 4) >> 3
; left and left_stride are accepted but never read.
global sym(vp8_intra_pred_uv_dctop_mmx2) PRIVATE
sym(vp8_intra_pred_uv_dctop_mmx2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; mm1 (low word) = sum of the 8 bytes above
    mov         rsi,        arg(2) ;above
    pxor        mm0,        mm0
    movq        mm1,        [rsi]
    psadbw      mm1,        mm0

    ; round, shift, and replicate into all 8 bytes
    paddw       mm1,        [GLOBAL(dc_4)]
    psraw       mm1,        3
    pshufw      mm1,        mm1, 0x0
    packuswb    mm1,        mm1

    ; write out: rdi addresses rows 0..3, rsi addresses rows 4..7
    mov         rdi,        arg(0) ;dst
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
    lea         rax,        [rcx*3]             ; rax = 3 * dst_stride
    lea         rsi,        [rdi+rcx*4]

    movq        [rdi],          mm1
    movq        [rdi+rcx],      mm1
    movq        [rdi+rcx*2],    mm1
    movq        [rdi+rax],      mm1
    movq        [rsi],          mm1
    movq        [rsi+rcx],      mm1
    movq        [rsi+rcx*2],    mm1
    movq        [rsi+rax],      mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
245
;void vp8_intra_pred_uv_dcleft_mmx2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; DC prediction for an 8x8 block using only the left column:
;   every output byte is (sum of 8 left-column bytes + 4) >> 3
; above is accepted but never read.
global sym(vp8_intra_pred_uv_dcleft_mmx2) PRIVATE
sym(vp8_intra_pred_uv_dcleft_mmx2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ; ecx = left[0] + left[stride] + ... + left[7*stride]
    mov         rsi,        arg(3) ;left
    movsxd      rax,        dword ptr arg(4) ;left_stride
    movzx       ecx,        byte [rsi]
%rep 7
    add         rsi,        rax                 ; step down one row
    movzx       edx,        byte [rsi]
    add         ecx,        edx
%endrep

    ; round, shift, and replicate into all 8 bytes of mm1
    add         ecx,        4
    shr         ecx,        3
    movd        mm1,        ecx
    pshufw      mm1,        mm1, 0x0
    packuswb    mm1,        mm1

    ; write out 8 rows of 8 bytes
    mov         rdi,        arg(0) ;dst
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
    lea         rax,        [rcx*3]             ; rax = 3 * dst_stride

    movq        [rdi],          mm1
    movq        [rdi+rcx],      mm1
    movq        [rdi+rcx*2],    mm1
    movq        [rdi+rax],      mm1
    lea         rdi,            [rdi+rcx*4]
    movq        [rdi],          mm1
    movq        [rdi+rcx],      mm1
    movq        [rdi+rcx*2],    mm1
    movq        [rdi+rax],      mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
312
;void vp8_intra_pred_uv_dc128_mmx(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; Fills an 8x8 block with the constant 128.
; above, left and left_stride are accepted but never read.
global sym(vp8_intra_pred_uv_dc128_mmx) PRIVATE
sym(vp8_intra_pred_uv_dc128_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    ; end prolog

    mov         rax,        arg(0) ;dst
    movsxd      rdx,        dword ptr arg(1) ;dst_stride
    lea         rcx,        [rdx*3]             ; rcx = 3 * dst_stride
    movq        mm1,        [GLOBAL(dc_128)]    ; eight bytes of 128

    ; write out: two groups of four rows
%rep 2
    movq        [rax],          mm1
    movq        [rax+rdx],      mm1
    movq        [rax+rdx*2],    mm1
    movq        [rax+rcx],      mm1
    lea         rax,            [rax+rdx*4]
%endrep

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
351
;void vp8_intra_pred_uv_tm_<ext>(        <ext> is sse2 or ssse3
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; TM prediction for an 8x8 block:
;   dst[r][c] = saturate_u8(left[r] + above[c] - above[-1])
; The macro argument selects how a single byte is broadcast to 8 words:
; punpcklbw/pshuflw/punpcklqdq for sse2, one pshufb otherwise.
%macro vp8_intra_pred_uv_tm 1
global sym(vp8_intra_pred_uv_tm_%1) PRIVATE
sym(vp8_intra_pred_uv_tm_%1):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; xmm1 = the 8 bytes above, widened to words
    mov         rsi,        arg(2) ;above
    pxor        xmm0,       xmm0
    movq        xmm1,       [rsi]
    punpcklbw   xmm1,       xmm0
%ifidn %1, ssse3
    movdqa      xmm2,       [GLOBAL(dc_1024)]   ; pshufb broadcast mask
%endif

    ; xmm3 = top-left byte (above[-1]) broadcast to 8 words
    movd        xmm3,       [rsi-1]
%ifidn %1, sse2
    punpcklbw   xmm3,       xmm0
    pshuflw     xmm3,       xmm3, 0x0
    punpcklqdq  xmm3,       xmm3
%else
    pshufb      xmm3,       xmm2
%endif

    ; xmm1 = above - topleft; this stays constant for every row
    psubw       xmm1,       xmm3

    ; set up left and dest pointers, two rows are produced per iteration
    mov         rsi,        arg(3) ;left
    movsxd      rax,        dword ptr arg(4) ;left_stride
    mov         rdi,        arg(0) ;dst
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
    mov         edx,        4

.uv_tm_%1_row_pair:
    movd        xmm3,       [rsi]
    movd        xmm5,       [rsi+rax]
    lea         rsi,        [rsi+rax*2]
%ifidn %1, sse2
    punpcklbw   xmm3,       xmm0
    pshuflw     xmm3,       xmm3, 0x0
    punpcklqdq  xmm3,       xmm3
    punpcklbw   xmm5,       xmm0
    pshuflw     xmm5,       xmm5, 0x0
    punpcklqdq  xmm5,       xmm5
%else
    pshufb      xmm3,       xmm2
    pshufb      xmm5,       xmm2
%endif
    paddw       xmm3,       xmm1
    paddw       xmm5,       xmm1
    packuswb    xmm3,       xmm5                ; low half = row n, high half = row n+1
    movq        [rdi],      xmm3
    movhps      [rdi+rcx],  xmm3
    lea         rdi,        [rdi+rcx*2]
    dec         edx
    jnz .uv_tm_%1_row_pair

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%endmacro

vp8_intra_pred_uv_tm sse2
vp8_intra_pred_uv_tm ssse3
432
;void vp8_intra_pred_uv_ve_mmx(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; Vertical prediction for an 8x8 block: the 8 bytes at above are copied
; into each of the 8 destination rows.
; left and left_stride are accepted but never read.
global sym(vp8_intra_pred_uv_ve_mmx) PRIVATE
sym(vp8_intra_pred_uv_ve_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    ; end prolog

    ; mm1 = the row above
    mov         rcx,        arg(2) ;above
    movq        mm1,        [rcx]

    mov         rax,        arg(0) ;dst
    movsxd      rdx,        dword ptr arg(1) ;dst_stride
    lea         rcx,        [rdx*3]             ; rcx = 3 * dst_stride

    ; write out: two groups of four rows
%rep 2
    movq        [rax],          mm1
    movq        [rax+rdx],      mm1
    movq        [rax+rdx*2],    mm1
    movq        [rax+rcx],      mm1
    lea         rax,            [rax+rdx*4]
%endrep

    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
473
;void vp8_intra_pred_uv_ho_<ext>(        <ext> is mmx2 or ssse3
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; Horizontal prediction for an 8x8 block: each destination row is filled
; with that row's left-column byte. above is accepted but never read.
; The mmx2 variant loops over row pairs; the other variant is straight-line
; code handling four rows at a time with pshufb.
%macro vp8_intra_pred_uv_ho 1
global sym(vp8_intra_pred_uv_ho_%1) PRIVATE
sym(vp8_intra_pred_uv_ho_%1):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
%ifidn %1, ssse3
    ; rbx is used as scratch below, so it is saved here unless
    ; GET_GOT_SAVE_ARG is defined (presumably GET_GOT then saves it itself)
%ifndef GET_GOT_SAVE_ARG
    push        rbx
%endif
    GET_GOT     rbx
%endif
    ; end prolog

    mov         rdi,        arg(0) ;dst
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
    mov         rsi,        arg(3) ;left
    movsxd      rax,        dword ptr arg(4) ;left_stride
%ifidn %1, mmx2
    mov         edx,        4                   ; four iterations of two rows
%endif
%ifidn %1, ssse3
    ; the constant is fetched before rbx is overwritten
    movdqa      xmm2,       [GLOBAL(dc_00001111)]
    lea         rbx,        [rax*3]             ; rbx = 3 * left_stride
    lea         rdx,        [rcx*3]             ; rdx = 3 * dst_stride
%endif

%ifidn %1, mmx2
.uv_ho_%1_row_pair:
    movd        mm0,        [rsi]
    movd        mm1,        [rsi+rax]
    lea         rsi,        [rsi+rax*2]
    punpcklbw   mm0,        mm0
    pshufw      mm0,        mm0, 0x0
    punpcklbw   mm1,        mm1
    pshufw      mm1,        mm1, 0x0
    movq        [rdi],      mm0
    movq        [rdi+rcx],  mm1
    lea         rdi,        [rdi+rcx*2]
    dec         edx
    jnz .uv_ho_%1_row_pair
%else
    ; rows 0..3: interleave two left bytes, then pshufb spreads byte 0
    ; over the low 8 bytes and byte 1 over the high 8 bytes
    movd        xmm0,       [rsi]
    movd        xmm3,       [rsi+rax]
    punpcklbw   xmm0,       xmm3
    pshufb      xmm0,       xmm2
    movd        xmm1,       [rsi+rax*2]
    movd        xmm4,       [rsi+rbx]
    punpcklbw   xmm1,       xmm4
    pshufb      xmm1,       xmm2
    movq        [rdi],          xmm0
    movhps      [rdi+rcx],      xmm0
    movq        [rdi+rcx*2],    xmm1
    movhps      [rdi+rdx],      xmm1

    lea         rsi,        [rsi+rax*4]
    lea         rdi,        [rdi+rcx*4]

    ; rows 4..7
    movd        xmm0,       [rsi]
    movd        xmm3,       [rsi+rax]
    punpcklbw   xmm0,       xmm3
    pshufb      xmm0,       xmm2
    movd        xmm1,       [rsi+rax*2]
    movd        xmm4,       [rsi+rbx]
    punpcklbw   xmm1,       xmm4
    pshufb      xmm1,       xmm2
    movq        [rdi],          xmm0
    movhps      [rdi+rcx],      xmm0
    movq        [rdi+rcx*2],    xmm1
    movhps      [rdi+rdx],      xmm1
%endif

    ; begin epilog
%ifidn %1, ssse3
    RESTORE_GOT
%ifndef GET_GOT_SAVE_ARG
    pop         rbx
%endif
%endif
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
%endmacro

vp8_intra_pred_uv_ho mmx2
vp8_intra_pred_uv_ho ssse3
572
;void vp8_intra_pred_y_dc_sse2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; DC prediction for a 16x16 block. Every output byte is
;   (sum of above[0..15] + sum of 16 left-column bytes + 16) >> 5
; above is read and dst rows are written with movdqa, so both must be
; 16-byte aligned.
global sym(vp8_intra_pred_y_dc_sse2) PRIVATE
sym(vp8_intra_pred_y_dc_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ; from top
    mov         rdi,        arg(2) ;above
    mov         rsi,        arg(3) ;left
    movsxd      rax,        dword ptr arg(4) ;left_stride;

    pxor        xmm0,       xmm0
    movdqa      xmm1,       [rdi]
    psadbw      xmm1,       xmm0                ; two partial sums, one per qword
    movq        xmm2,       xmm1
    punpckhqdq  xmm1,       xmm1
    paddw       xmm1,       xmm2                ; low word = sum of all 16 bytes

    ; from left: ecx = sum of 16 bytes, taken four rows at a time
    lea         rdi,        [rax*3]             ; rdi = 3 * left_stride
    xor         ecx,        ecx
%rep 4
    movzx       edx,        byte [rsi]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    add         ecx,        edx
    lea         rsi,        [rsi+rax*4]
%endrep

    ; add up, round, and replicate the DC value into all 16 bytes
    pextrw      edx,        xmm1, 0x0
    lea         edx,        [edx+ecx+16]
    sar         edx,        5
    movd        xmm1,       edx
    ; FIXME use pshufb for ssse3 version
    pshuflw     xmm1,       xmm1, 0x0
    punpcklqdq  xmm1,       xmm1
    packuswb    xmm1,       xmm1

    ; write out: two iterations of eight rows
    mov         rsi,        2
    mov         rdi,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
    lea         rax,        [rcx*3]             ; rax = 3 * dst_stride

.write_8_rows:
    movdqa [rdi      ],     xmm1
    movdqa [rdi+rcx  ],     xmm1
    movdqa [rdi+rcx*2],     xmm1
    movdqa [rdi+rax  ],     xmm1
    lea         rdi,        [rdi+rcx*4]
    movdqa [rdi      ],     xmm1
    movdqa [rdi+rcx  ],     xmm1
    movdqa [rdi+rcx*2],     xmm1
    movdqa [rdi+rax  ],     xmm1
    lea         rdi,        [rdi+rcx*4]
    dec         rsi
    jnz .write_8_rows

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
678
;void vp8_intra_pred_y_dctop_sse2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; DC prediction for a 16x16 block using only the row above:
;   every output byte is (sum of above[0..15] + 8) >> 4
; left and left_stride are accepted but never read.
; above is read and dst rows are written with movdqa, so both must be
; 16-byte aligned.
global sym(vp8_intra_pred_y_dctop_sse2) PRIVATE
sym(vp8_intra_pred_y_dctop_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    GET_GOT     rbx
    ; end prolog

    ;arg(3), arg(4) not used

    ; from top
    mov         rcx,        arg(2) ;above;
    pxor        xmm0,       xmm0
    movdqa      xmm1,       [rcx]
    psadbw      xmm1,       xmm0                ; two partial sums, one per qword
    movdqa      xmm2,       xmm1
    punpckhqdq  xmm1,       xmm1
    paddw       xmm1,       xmm2                ; low word = sum of all 16 bytes

    ; add up
    paddw       xmm1,       [GLOBAL(dc_8)]
    psraw       xmm1,       4
    ; FIXME use pshufb for ssse3 version
    pshuflw     xmm1,       xmm1, 0x0
    punpcklqdq  xmm1,       xmm1
    packuswb    xmm1,       xmm1

    ; write out: two iterations of eight rows
    mov         rsi,        2
    mov         rdx,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
    lea         rax,        [rcx*3]             ; rax = 3 * dst_stride

.write_8_rows:
    movdqa [rdx      ],     xmm1
    movdqa [rdx+rcx  ],     xmm1
    movdqa [rdx+rcx*2],     xmm1
    movdqa [rdx+rax  ],     xmm1
    lea         rdx,        [rdx+rcx*4]
    movdqa [rdx      ],     xmm1
    movdqa [rdx+rcx  ],     xmm1
    movdqa [rdx+rcx*2],     xmm1
    movdqa [rdx+rax  ],     xmm1
    lea         rdx,        [rdx+rcx*4]
    dec         rsi
    jnz .write_8_rows

    ; begin epilog
    RESTORE_GOT
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
740
;void vp8_intra_pred_y_dcleft_sse2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; DC prediction for a 16x16 block using only the left column:
;   every output byte is (sum of 16 left-column bytes + 8) >> 4
; above is accepted but never read.
; dst rows are written with movdqa, so they must be 16-byte aligned.
global sym(vp8_intra_pred_y_dcleft_sse2) PRIVATE
sym(vp8_intra_pred_y_dcleft_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ;arg(2) not used

    ; from left: ecx = sum of 16 bytes, taken four rows at a time
    mov         rsi,        arg(3) ;left;
    movsxd      rax,        dword ptr arg(4) ;left_stride;

    lea         rdi,        [rax*3]             ; rdi = 3 * left_stride
    xor         ecx,        ecx
%rep 4
    movzx       edx,        byte [rsi]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    add         ecx,        edx
    lea         rsi,        [rsi+rax*4]
%endrep

    ; add up: round, shift, and replicate into all 16 bytes
    lea         edx,        [ecx+8]
    shr         edx,        4
    movd        xmm1,       edx
    ; FIXME use pshufb for ssse3 version
    pshuflw     xmm1,       xmm1, 0x0
    punpcklqdq  xmm1,       xmm1
    packuswb    xmm1,       xmm1

    ; write out: two iterations of eight rows
    mov         rsi,        2
    mov         rdi,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
    lea         rax,        [rcx*3]             ; rax = 3 * dst_stride

.write_8_rows:
    movdqa [rdi      ],     xmm1
    movdqa [rdi+rcx  ],     xmm1
    movdqa [rdi+rcx*2],     xmm1
    movdqa [rdi+rax  ],     xmm1
    lea         rdi,        [rdi+rcx*4]
    movdqa [rdi      ],     xmm1
    movdqa [rdi+rcx  ],     xmm1
    movdqa [rdi+rcx*2],     xmm1
    movdqa [rdi+rax  ],     xmm1
    lea         rdi,        [rdi+rcx*4]
    dec         rsi
    jnz .write_8_rows

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
833
;void vp8_intra_pred_y_dc128_sse2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; Fills a 16x16 block with the constant 128.
; above, left and left_stride are accepted but never read.
; dst rows are written with movdqa, so they must be 16-byte aligned.
global sym(vp8_intra_pred_y_dc128_sse2) PRIVATE
sym(vp8_intra_pred_y_dc128_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    GET_GOT     rbx
    ; end prolog

    ;arg(2), arg(3), arg(4) not used

    ; write out: two iterations of eight rows
    mov         rsi,        2
    movdqa      xmm1,       [GLOBAL(dc_128)]    ; sixteen bytes of 128
    mov         rax,        arg(0) ;dst;
    movsxd      rdx,        dword ptr arg(1) ;dst_stride
    lea         rcx,        [rdx*3]             ; rcx = 3 * dst_stride

.write_8_rows:
    movdqa [rax      ],     xmm1
    movdqa [rax+rdx  ],     xmm1
    movdqa [rax+rdx*2],     xmm1
    movdqa [rax+rcx  ],     xmm1
    lea         rax,        [rax+rdx*4]
    movdqa [rax      ],     xmm1
    movdqa [rax+rdx  ],     xmm1
    movdqa [rax+rdx*2],     xmm1
    movdqa [rax+rcx  ],     xmm1
    lea         rax,        [rax+rdx*4]
    dec         rsi
    jnz .write_8_rows

    ; begin epilog
    RESTORE_GOT
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
879
;void vp8_intra_pred_y_tm_<ext>(        <ext> is sse2 or ssse3
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; TM prediction for a 16x16 block:
;   dst[r][c] = saturate_u8(left[r] + above[c] - above[-1])
; The macro argument selects how a single byte is broadcast to 8 words:
; punpcklbw/pshuflw/punpcklqdq for sse2, one pshufb otherwise.
; above is read and dst rows are written with movdqa, so both must be
; 16-byte aligned.
%macro vp8_intra_pred_y_tm 1
global sym(vp8_intra_pred_y_tm_%1) PRIVATE
sym(vp8_intra_pred_y_tm_%1):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    GET_GOT     rbx
    ; end prolog

    ; read top row: xmm1 = above[0..7], xmm2 = above[8..15], as words
    mov         edx,        8                   ; eight iterations of two rows
    mov         rsi,        arg(2) ;above
    movsxd      rax,        dword ptr arg(4) ;left_stride;
    pxor        xmm0,       xmm0
%ifidn %1, ssse3
    movdqa      xmm3,       [GLOBAL(dc_1024)]   ; pshufb broadcast mask
%endif
    movdqa      xmm1,       [rsi]
    movdqa      xmm2,       xmm1
    punpcklbw   xmm1,       xmm0
    punpckhbw   xmm2,       xmm0

    ; set up left ptrs and subtract topleft
    movd        xmm4,       [rsi-1]
    mov         rsi,        arg(3) ;left
%ifidn %1, sse2
    punpcklbw   xmm4,       xmm0
    pshuflw     xmm4,       xmm4, 0x0
    punpcklqdq  xmm4,       xmm4
%else
    pshufb      xmm4,       xmm3
%endif
    psubw       xmm1,       xmm4
    psubw       xmm2,       xmm4

    ; set up dest ptrs
    mov         rdi,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride

    ; local label, matching the loop label style of vp8_intra_pred_uv_tm
.vp8_intra_pred_y_tm_%1_loop:
    movd        xmm4,       [rsi]
    movd        xmm5,       [rsi+rax]
%ifidn %1, sse2
    punpcklbw   xmm4,       xmm0
    punpcklbw   xmm5,       xmm0
    pshuflw     xmm4,       xmm4, 0x0
    pshuflw     xmm5,       xmm5, 0x0
    punpcklqdq  xmm4,       xmm4
    punpcklqdq  xmm5,       xmm5
%else
    pshufb      xmm4,       xmm3
    pshufb      xmm5,       xmm3
%endif
    ; xmm4/xmm6 = low/high halves of row n, xmm5/xmm7 = same for row n+1
    movdqa      xmm6,       xmm4
    movdqa      xmm7,       xmm5
    paddw       xmm4,       xmm1
    paddw       xmm6,       xmm2
    paddw       xmm5,       xmm1
    paddw       xmm7,       xmm2
    packuswb    xmm4,       xmm6
    packuswb    xmm5,       xmm7
    movdqa [rdi    ],       xmm4
    movdqa [rdi+rcx],       xmm5
    lea         rsi,        [rsi+rax*2]
    lea         rdi,        [rdi+rcx*2]
    dec         edx
    jnz .vp8_intra_pred_y_tm_%1_loop

    ; begin epilog
    RESTORE_GOT
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%endmacro

vp8_intra_pred_y_tm sse2
vp8_intra_pred_y_tm ssse3
969
;void vp8_intra_pred_y_ve_sse2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; Vertical prediction for a 16x16 block: the 16 bytes at above are copied
; into each of the 16 destination rows.
; left and left_stride are accepted but never read.
; above is read and dst rows are written with movdqa, so both must be
; 16-byte aligned.
global sym(vp8_intra_pred_y_ve_sse2) PRIVATE
sym(vp8_intra_pred_y_ve_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    ; end prolog

    ;arg(3), arg(4) not used

    mov         rax,        arg(2) ;above;
    mov         rsi,        2                   ; two iterations of eight rows
    movsxd      rdx,        dword ptr arg(1) ;dst_stride

    ; read from top
    movdqa      xmm1,       [rax]

    ; write out
    mov         rax,        arg(0) ;dst;
    lea         rcx,        [rdx*3]             ; rcx = 3 * dst_stride

.write_8_rows:
    movdqa [rax      ],     xmm1
    movdqa [rax+rdx  ],     xmm1
    movdqa [rax+rdx*2],     xmm1
    movdqa [rax+rcx  ],     xmm1
    lea         rax,        [rax+rdx*4]
    movdqa [rax      ],     xmm1
    movdqa [rax+rdx  ],     xmm1
    movdqa [rax+rdx*2],     xmm1
    movdqa [rax+rcx  ],     xmm1
    lea         rax,        [rax+rdx*4]
    dec         rsi
    jnz .write_8_rows

    ; begin epilog
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
1017
;void vp8_intra_pred_y_ho_sse2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; Horizontal prediction for a 16x16 block: each destination row is filled
; with that row's left-column byte. above is accepted but never read.
; dst rows are written with movdqa, so they must be 16-byte aligned.
global sym(vp8_intra_pred_y_ho_sse2) PRIVATE
sym(vp8_intra_pred_y_ho_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ;arg(2) not used

    ; read from left and write out
    mov         edx,        8                   ; eight iterations of two rows
    mov         rsi,        arg(3) ;left;
    movsxd      rax,        dword ptr arg(4) ;left_stride;
    mov         rdi,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride

    ; local label, so the loop target does not become a file-scope symbol
.next_row_pair:
    movd        xmm0,       [rsi]
    movd        xmm1,       [rsi+rax]
    ; FIXME use pshufb for ssse3 version
    ; replicate byte 0 of each register into all 16 bytes
    punpcklbw   xmm0,       xmm0
    punpcklbw   xmm1,       xmm1
    pshuflw     xmm0,       xmm0, 0x0
    pshuflw     xmm1,       xmm1, 0x0
    punpcklqdq  xmm0,       xmm0
    punpcklqdq  xmm1,       xmm1
    movdqa [rdi    ],       xmm0
    movdqa [rdi+rcx],       xmm1
    lea         rsi,        [rsi+rax*2]
    lea         rdi,        [rdi+rcx*2]
    dec         edx
    jnz .next_row_pair

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
1066
SECTION_RODATA

; sixteen bytes of 128: fill value for the dc128 predictors
align 16
dc_128:
    times 16 db 0x80

; four words of 4: rounding term for the 8-byte top-only DC average.
; Not separately aligned; it directly follows the 16-byte dc_128.
dc_4:
    times 4 dw 0x0004

; eight words of 8: rounding term for the 16-byte top-only DC average
align 16
dc_8:
    times 8 dw 0x0008

; eight words of 0x0400, i.e. byte pairs (0x00, 0x04): pshufb control that
; puts source byte 0 in the low byte and source byte 4 in the high byte of
; every word
align 16
dc_1024:
    times 8 dw 1024

; pshufb control: source byte 0 into the low 8 bytes, source byte 1 into
; the high 8 bytes
align 16
dc_00001111:
    times 8 db 0x00
    times 8 db 0x01