1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license and patent
5;  grant that can be found in the LICENSE file in the root of the source
6;  tree. All contributing project authors may be found in the AUTHORS
7;  file in the root of the source tree.
8;
9
10
11%include "vpx_ports/x86_abi_support.asm"
12%include "vp8_asm_enc_offsets.asm"
13
14
;------------------------------------------------------------------------------
; void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d)
;
; Quantizes the sixteen 16-bit coefficients of one block with SSSE3:
;     x       = abs(z)
;     y       = ((x + round) * quant_fast) >> 16     (signed high half)
;     qcoeff  = y with the sign of z restored
;     dqcoeff = qcoeff * dequant                     (low 16 bits)
;     *eob    = 1 + highest position, after the zz_shuf reordering, whose
;               y is > 0; 0 when no y is positive
;
; Arguments (arg index): b = 0, d = 1
;     32-bit build : read through arg()
;     Win64 build  : arrive in rcx, rdx and are copied to rdi, rsi
;     other 64-bit : already in rdi, rsi
;
; Register use after the prolog:
;     rdi = b (reused for the dequant pointer once b is no longer needed)
;     rsi = d
;     rax, rcx, rdx = scratch pointers and eob computation
;     xmm0-xmm5     = working vectors (no other xmm register is touched)
;
; Every vector memory access is movdqa or a legacy-SSE memory operand, so the
; coeff, round, quant_fast, qcoeff, dequant and dqcoeff buffers must all be
; 16-byte aligned.
;------------------------------------------------------------------------------

global sym(vp8_fast_quantize_b_ssse3) PRIVATE
sym(vp8_fast_quantize_b_ssse3):
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx                         ; paired with RESTORE_GOT in the epilog

    ; rdi/rsi are saved here and restored in the epilog on 32-bit and Win64
%if ABI_IS_32BIT
    push        rdi
    push        rsi
%else
  %if LIBVPX_YASM_WIN64
    push        rdi
    push        rsi
  %endif
%endif
    ; end prolog

    ; bring the two arguments into rdi (b) and rsi (d)
%if ABI_IS_32BIT
    mov         rdi, arg(0)                 ; BLOCK *b
    mov         rsi, arg(1)                 ; BLOCKD *d
%else
  %if LIBVPX_YASM_WIN64
    mov         rdi, rcx                    ; BLOCK *b
    mov         rsi, rdx                    ; BLOCKD *d
  %else
    ; nothing to do: b and d are already in rdi and rsi
  %endif
%endif

    mov         rax, [rdi + vp8_block_coeff]
    mov         rcx, [rdi + vp8_block_round]
    mov         rdx, [rdi + vp8_block_quant_fast]

    ; z = coeff[0..7] / coeff[8..15]
    movdqa      xmm0, [rax]
    movdqa      xmm4, [rax + 16]

    ; round
    movdqa      xmm2, [rcx]
    movdqa      xmm3, [rcx + 16]

    ; keep a second copy of z: one becomes the sign mask, the other abs(z)
    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    ; sz = z >> 15  (0 for z >= 0, -1 for z < 0)
    psraw       xmm0, 15
    psraw       xmm4, 15

    ; x = abs(z)
    pabsw       xmm1, xmm1
    pabsw       xmm5, xmm5

    ; x += round
    paddw       xmm1, xmm2
    paddw       xmm5, xmm3

    ; y = (x * quant_fast) >> 16
    pmulhw      xmm1, [rdx]
    pmulhw      xmm5, [rdx + 16]

    ; b is no longer needed from here on, so rdi is reused
    mov         rax, [rsi + vp8_blockd_qcoeff]
    mov         rdi, [rsi + vp8_blockd_dequant]
    mov         rcx, [rsi + vp8_blockd_dqcoeff]

    ; keep the unsigned y values for the eob computation
    movdqa      xmm2, xmm1
    movdqa      xmm3, xmm5

    ; restore the sign of z: (y ^ sz) - sz
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    ; store qcoeff
    movdqa      [rax], xmm1
    movdqa      [rax + 16], xmm5

    ; dqcoeff = qcoeff * dequant
    movdqa      xmm0, [rdi]
    movdqa      xmm4, [rdi + 16]

    pmullw      xmm0, xmm1
    pmullw      xmm4, xmm5
    pxor        xmm1, xmm1                  ; xmm1 = 0 for the compare below

    ; eob: flag every y > 0, narrow the 16 word flags to 16 byte flags,
    ; reorder them with zz_shuf and collapse them into a 16-bit mask
    pcmpgtw     xmm2, xmm1
    pcmpgtw     xmm3, xmm1
    packsswb    xmm2, xmm3
    pshufb      xmm2, [GLOBAL(zz_shuf)]

    pmovmskb    edx, xmm2                   ; bit i set <=> reordered flag i set

    movdqa      [rcx], xmm0                 ; store dqcoeff
    movdqa      [rcx + 16], xmm4            ; store dqcoeff
    mov         rcx, [rsi + vp8_blockd_eob]

    bsr         eax, edx                    ; index of the highest set bit
                                            ; (eax is undefined when edx == 0)
    add         eax, 1                      ; eob = that index + 1

    test        edx, edx                    ; empty mask -> eob = 0
    cmovz       eax, edx

    mov         [rcx], al                   ; store eob as a single byte

    ; begin epilog
%if ABI_IS_32BIT
    pop         rsi
    pop         rdi
%else
  %if LIBVPX_YASM_WIN64
    pop         rsi
    pop         rdi
  %endif
%endif

    RESTORE_GOT
    pop         rbp
    ret
134
SECTION_RODATA
align 16
; pshufb control mask used by vp8_fast_quantize_b_ssse3 when computing eob:
; output byte i of the shuffle is taken from input byte zz_shuf[i].
zz_shuf:
    db  0,  1,  4,  8
    db  5,  2,  3,  6
    db  9, 12, 13, 10
    db  7, 11, 14, 15
139