;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; NOTE(review): sym(), PRIVATE, arg(), SHADOW_ARGS_TO_STACK, UNSHADOW_ARGS,
; GET_GOT, RESTORE_GOT, GLOBAL() and SECTION_RODATA are not defined in this
; file; presumably they come from the include above - confirm there before
; relying on any detail of their expansion.

SECTION .text

;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q)
;
; Element-wise dequantization of one block of 16 16-bit values:
;     dq[i] = low 16 bits of (sq[i] * q[i]),  i = 0..15
;   sq - source values              (read,  32 bytes)
;   dq - destination values         (write, 32 bytes)
;   q  - per-element multipliers    (read,  32 bytes)
; Four words are processed per step through mm1. No emms is issued here.
; NOTE(review): MMX/x87 state cleanup is presumably left to the caller - confirm.
global sym(vp8_dequantize_b_impl_mmx) PRIVATE
sym(vp8_dequantize_b_impl_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    push        rsi
    push        rdi
    ; end prolog

        mov       rsi, arg(0) ;sq
        mov       rdi, arg(1) ;dq
        mov       rax, arg(2) ;q

        movq      mm1, [rsi]
        pmullw    mm1, [rax+0]            ; mm1 = sq[0..3] * q[0..3] (low words)
        movq      [rdi], mm1              ; dq[0..3]

        movq      mm1, [rsi+8]
        pmullw    mm1, [rax+8]            ; mm1 = sq[4..7] * q[4..7] (low words)
        movq      [rdi+8], mm1            ; dq[4..7]

        movq      mm1, [rsi+16]
        pmullw    mm1, [rax+16]           ; mm1 = sq[8..11] * q[8..11] (low words)
        movq      [rdi+16], mm1           ; dq[8..11]

        movq      mm1, [rsi+24]
        pmullw    mm1, [rax+24]           ; mm1 = sq[12..15] * q[12..15] (low words)
        movq      [rdi+24], mm1           ; dq[12..15]

    ; begin epilog
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_dequant_idct_add_mmx(
;short *input,            0
;short *dq,               1
;unsigned char *dest,     2
;int stride)              3
;
; Dequantizes a 4x4 block of 16-bit coefficients, runs a two-pass 4x4
; inverse transform on it, and adds the result to a 4x4 block of bytes.
;   input  - 16 coefficients, 4 rows of 4 words. Read, then overwritten
;            with zeros before the transform starts.
;   dq     - 16 multipliers, multiplied element-wise with input (pmullw).
;   dest   - 4 rows of 4 bytes, 'stride' bytes apart. Each row is read,
;            the transform output is added with saturation, and the row
;            is written back in place.
;   stride - distance in bytes between consecutive rows of dest.
; All eight MMX registers are used. No emms is issued here.
; NOTE(review): MMX/x87 state cleanup is presumably left to the caller - confirm.
global sym(vp8_dequant_idct_add_mmx) PRIVATE
sym(vp8_dequant_idct_add_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    push        rdi
    ; end prolog

        mov         rax,    arg(0) ;input
        mov         rdx,    arg(1) ;dq

        ; Dequantize: mmN = input row N * dq row N (low 16 bits per word).
        movq        mm0,    [rax ]
        pmullw      mm0,    [rdx]

        movq        mm1,    [rax +8]
        pmullw      mm1,    [rdx +8]

        movq        mm2,    [rax+16]
        pmullw      mm2,    [rdx+16]

        movq        mm3,    [rax+24]
        pmullw      mm3,    [rdx+24]

        mov         rdx,    arg(2) ;dest (rdx is reused; dq is no longer needed)

        pxor        mm7,    mm7

        ; Zero all 32 bytes of the input block; the coefficients now live
        ; in mm0..mm3.
        movq        [rax],   mm7
        movq        [rax+8], mm7

        movq        [rax+16],mm7
        movq        [rax+24],mm7


        movsxd      rdi,    dword ptr arg(3) ;stride

        ; ---- First pass -------------------------------------------------
        ; ip0..ip3 are rows 0..3 (mm0..mm3); each instruction handles the
        ; four columns in parallel.
        psubw       mm0,    mm2             ; b1= 0-2
        paddw       mm2,    mm2             ; 2 * ip2

        movq        mm5,    mm1
        paddw       mm2,    mm0             ; a1 =0+2  (2*ip2 + ip0 - ip2)

        ; x_s1sqr2 is negative as a signed word, so pmulhw yields
        ; (ip1 * 35468 >> 16) - ip1; adding ip1 back gives the product.
        pmulhw      mm5,    [GLOBAL(x_s1sqr2)]
        paddw       mm5,    mm1             ; ip1 * sin(pi/8) * sqrt(2)

        movq        mm7,    mm3
        pmulhw      mm7,    [GLOBAL(x_c1sqr2less1)] ; ip3 * (cos(pi/8)*sqrt(2) - 1)

        paddw       mm7,    mm3             ; ip3 * cos(pi/8) * sqrt(2)
        psubw       mm7,    mm5             ; c1 = ip3*cos(pi/8)*sqrt(2) - ip1*sin(pi/8)*sqrt(2)

        movq        mm5,    mm1
        movq        mm4,    mm3

        pmulhw      mm5,    [GLOBAL(x_c1sqr2less1)]
        paddw       mm5,    mm1             ; ip1 * cos(pi/8) * sqrt(2)

        pmulhw      mm3,    [GLOBAL(x_s1sqr2)]
        paddw       mm3,    mm4             ; ip3 * sin(pi/8) * sqrt(2)

        paddw       mm3,    mm5             ; d1 = sum of the two products above
        movq        mm6,    mm2             ; a1

        movq        mm4,    mm0             ; b1
        paddw       mm2,    mm3             ; a1 + d1 -> row 0

        ; NOTE(review): earlier tags here read ";1" on mm4 and ";2" on mm0,
        ; but the unpack sequence below consumes mm0 as row 1 and mm4 as
        ; row 2 (with c1 as computed above).
        paddw       mm4,    mm7             ; b1 + c1 -> row 2
        psubw       mm0,    mm7             ; b1 - c1 -> row 1

        psubw       mm6,    mm3             ; a1 - d1 -> row 3

        ; ---- Transpose 4x4 words ----------------------------------------
        ; Lane comments list words from high to low as <row><column>.
        movq        mm1,    mm2             ; 03 02 01 00
        movq        mm3,    mm4             ; 23 22 21 20

        punpcklwd   mm1,    mm0             ; 11 01 10 00
        punpckhwd   mm2,    mm0             ; 13 03 12 02

        punpcklwd   mm3,    mm6             ; 31 21 30 20
        punpckhwd   mm4,    mm6             ; 33 23 32 22

        movq        mm0,    mm1             ; 11 01 10 00
        movq        mm5,    mm2             ; 13 03 12 02

        punpckldq   mm0,    mm3             ; 30 20 10 00
        punpckhdq   mm1,    mm3             ; 31 21 11 01

        punpckldq   mm2,    mm4             ; 32 22 12 02
        punpckhdq   mm5,    mm4             ; 33 23 13 03

        movq        mm3,    mm5             ; 33 23 13 03

        ; ---- Second pass ------------------------------------------------
        ; Same butterfly on the transposed data (mm0..mm3 now hold
        ; columns 0..3), plus rounding before the final shift.
        psubw       mm0,    mm2             ; b1= 0-2
        paddw       mm2,    mm2             ; 2 * ip2

        movq        mm5,    mm1
        paddw       mm2,    mm0             ; a1 =0+2

        pmulhw      mm5,    [GLOBAL(x_s1sqr2)]
        paddw       mm5,    mm1             ; ip1 * sin(pi/8) * sqrt(2)

        movq        mm7,    mm3
        pmulhw      mm7,    [GLOBAL(x_c1sqr2less1)]

        paddw       mm7,    mm3             ; ip3 * cos(pi/8) * sqrt(2)
        psubw       mm7,    mm5             ; c1 (same definition as in the first pass)

        movq        mm5,    mm1
        movq        mm4,    mm3

        pmulhw      mm5,    [GLOBAL(x_c1sqr2less1)]
        paddw       mm5,    mm1             ; ip1 * cos(pi/8) * sqrt(2)

        pmulhw      mm3,    [GLOBAL(x_s1sqr2)]
        paddw       mm3,    mm4             ; ip3 * sin(pi/8) * sqrt(2)

        paddw       mm3,    mm5             ; d1
        ; Every output is a1 +/- d1 or b1 +/- c1, so adding 4 to a1 and b1
        ; rounds all four outputs ahead of the >> 3 below.
        paddw       mm0,    [GLOBAL(fours)] ; b1 + 4

        paddw       mm2,    [GLOBAL(fours)] ; a1 + 4
        movq        mm6,    mm2             ; a1

        movq        mm4,    mm0             ; b1
        paddw       mm2,    mm3             ; a1 + d1 -> output 0

        paddw       mm4,    mm7             ; b1 + c1 -> output 2
        psubw       mm0,    mm7             ; b1 - c1 -> output 1

        psubw       mm6,    mm3             ; a1 - d1 -> output 3
        psraw       mm2,    3               ; arithmetic >> 3 on each output

        psraw       mm0,    3
        psraw       mm4,    3

        psraw       mm6,    3

        ; ---- Transpose back so each register holds one output row -------
        movq        mm1,    mm2             ; 03 02 01 00
        movq        mm3,    mm4             ; 23 22 21 20

        punpcklwd   mm1,    mm0             ; 11 01 10 00
        punpckhwd   mm2,    mm0             ; 13 03 12 02

        punpcklwd   mm3,    mm6             ; 31 21 30 20
        punpckhwd   mm4,    mm6             ; 33 23 32 22

        movq        mm0,    mm1             ; 11 01 10 00
        movq        mm5,    mm2             ; 13 03 12 02

        punpckldq   mm0,    mm3             ; 30 20 10 00
        punpckhdq   mm1,    mm3             ; 31 21 11 01

        punpckldq   mm2,    mm4             ; 32 22 12 02
        punpckhdq   mm5,    mm4             ; 33 23 13 03

        ; ---- Add to dest ------------------------------------------------
        ; Rows are held in mm0, mm1, mm2, mm5. For each row: load 4 dest
        ; bytes, widen to words, add with signed saturation, then pack back
        ; to bytes with unsigned saturation and store the low 4 bytes.
        pxor        mm7,    mm7             ; zero, used for widening and packing

        movd        mm4,    [rdx]           ; dest row 0
        punpcklbw   mm4,    mm7
        paddsw      mm0,    mm4
        packuswb    mm0,    mm7
        movd        [rdx],  mm0

        movd        mm4,    [rdx+rdi]       ; dest row 1
        punpcklbw   mm4,    mm7
        paddsw      mm1,    mm4
        packuswb    mm1,    mm7
        movd        [rdx+rdi], mm1

        movd        mm4,    [rdx+2*rdi]     ; dest row 2
        punpcklbw   mm4,    mm7
        paddsw      mm2,    mm4
        packuswb    mm2,    mm7
        movd        [rdx+rdi*2], mm2

        add         rdx,    rdi             ; rdx = dest + stride

        movd        mm4,    [rdx+2*rdi]     ; dest row 3 (dest + 3*stride)
        punpcklbw   mm4,    mm7
        paddsw      mm5,    mm4
        packuswb    mm5,    mm7
        movd        [rdx+rdi*2], mm5

    ; begin epilog
    pop rdi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; 0x8A8C = 35468 ~= sin(pi/8) * sqrt(2) * 65536. The value is negative when
; read as a signed word, which is why every pmulhw with it is followed by a
; paddw of the original operand.
align 16
x_s1sqr2:
    times 4 dw 0x8A8C
; 0x4E7B = 20091 ~= (cos(pi/8) * sqrt(2) - 1) * 65536. The "- 1" is restored
; by the paddw of the original operand after each pmulhw.
align 16
x_c1sqr2less1:
    times 4 dw 0x4E7B
; Rounding term added in the second pass before the arithmetic shift by 3.
align 16
fours:
    times 4 dw 0x0004