;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Notes that apply to both routines in this file:
;   * sym(), arg(), SHADOW_ARGS_TO_STACK / UNSHADOW_ARGS, GET_GOT / RESTORE_GOT,
;     GLOBAL() and SECTION_RODATA are macros taken from x86_abi_support.asm.
;   * Neither routine executes emms before returning.
;     NOTE(review): callers presumably reset the MMX/x87 state themselves -
;     confirm against the call sites.


;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q)
;
; Element-wise dequantization of one block of 16 signed words:
;     dq[i] = low 16 bits of (sq[i] * q[i]),   i = 0..15
; The block is handled as four quadwords (4 words each).
;   arg(0) sq : source words (read only)
;   arg(1) dq : destination words (written)
;   arg(2) q  : per-element multipliers (read only)
global sym(vp8_dequantize_b_impl_mmx) PRIVATE
sym(vp8_dequantize_b_impl_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 3
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)              ; sq
    mov         rdi,        arg(1)              ; dq
    mov         rax,        arg(2)              ; q

    movq        mm1,        [rsi]               ; sq[0..3]
    pmullw      mm1,        [rax+0]             ; mm1 *= q[0..3] (low 16 bits kept)
    movq        [rdi],      mm1                 ; dq[0..3]

    movq        mm1,        [rsi+8]             ; sq[4..7]
    pmullw      mm1,        [rax+8]             ; mm1 *= q[4..7]
    movq        [rdi+8],    mm1                 ; dq[4..7]

    movq        mm1,        [rsi+16]            ; sq[8..11]
    pmullw      mm1,        [rax+16]            ; mm1 *= q[8..11]
    movq        [rdi+16],   mm1                 ; dq[8..11]

    movq        mm1,        [rsi+24]            ; sq[12..15]
    pmullw      mm1,        [rax+24]            ; mm1 *= q[12..15]
    movq        [rdi+24],   mm1                 ; dq[12..15]

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_dequant_idct_add_mmx(
;    short *input,            0
;    short *dq,               1
;    unsigned char *dest,     2
;    int stride)              3
;
; Dequantizes a 4x4 block of coefficients, runs a two-pass 4x4 inverse
; transform on it and adds the result to the 4x4 byte block at dest.
;
; Steps, in program order:
;   1. mm0..mm3 = input rows 0..3 multiplied word-wise by dq (pmullw).
;   2. The 32 bytes at input are overwritten with zero.
;   3. Pass 1: butterfly across the four row registers, then a 4x4 word
;      transpose.
;   4. Pass 2: the same butterfly on the transposed data, with +4 rounding
;      and an arithmetic shift right by 3, then a second transpose.
;   5. Each result row is added (signed saturating) to 4 destination bytes
;      widened to words, packed back to bytes with unsigned saturation and
;      stored. Rows live at dest, dest+stride, dest+2*stride, dest+3*stride.
;
; Fixed-point multiplies: pmulhw keeps the high 16 bits of a signed 16x16
; product, and every pmulhw below is followed by a paddw of the unscaled
; input, giving
;     x_s1sqr2      (0x8A8C = 35468, negative as a signed word):
;                   ((x * (35468 - 65536)) >> 16) + x  ==  (x * 35468) >> 16
;     x_c1sqr2less1 (0x4E7B = 20091):
;                   ((x * 20091) >> 16) + x
; i.e. x * sin(pi/8)*sqrt(2) and x * cos(pi/8)*sqrt(2) respectively.
;
; Transpose comments use "rc" = row r, column c, listed from the highest
; word of the register down to the lowest.
global sym(vp8_dequant_idct_add_mmx) PRIVATE
sym(vp8_dequant_idct_add_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    push        rdi
    ; end prolog

    mov         rax,        arg(0)              ; input
    mov         rdx,        arg(1)              ; dq

    ; ---- dequantize: mmN = input row N * dq row N ----
    movq        mm0,        [rax]
    pmullw      mm0,        [rdx]

    movq        mm1,        [rax+8]
    pmullw      mm1,        [rdx+8]

    movq        mm2,        [rax+16]
    pmullw      mm2,        [rdx+16]

    movq        mm3,        [rax+24]
    pmullw      mm3,        [rdx+24]

    mov         rdx,        arg(2)              ; dest (rdx is reused; dq is done)

    pxor        mm7,        mm7                 ; mm7 = 0

    ; ---- clear the coefficient buffer (all 16 words) ----
    movq        [rax],      mm7
    movq        [rax+8],    mm7

    movq        [rax+16],   mm7
    movq        [rax+24],   mm7

    movsxd      rdi,        dword ptr arg(3)    ; stride, sign-extended from 32 bits

    ; ---- pass 1 ----
    psubw       mm0,        mm2                 ; b1 = ip0 - ip2
    paddw       mm2,        mm2                 ; 2 * ip2

    movq        mm5,        mm1
    paddw       mm2,        mm0                 ; a1 = ip0 + ip2

    pmulhw      mm5,        [GLOBAL(x_s1sqr2)]
    paddw       mm5,        mm1                 ; ip1 * sin(pi/8) * sqrt(2)

    movq        mm7,        mm3
    pmulhw      mm7,        [GLOBAL(x_c1sqr2less1)]

    paddw       mm7,        mm3                 ; ip3 * cos(pi/8) * sqrt(2)
    psubw       mm7,        mm5                 ; mm7 = ip3 term - ip1 term
                                                ; (labelled "c1" in the original comments)

    movq        mm5,        mm1
    movq        mm4,        mm3                 ; keep unscaled ip3 for the paddw below

    pmulhw      mm5,        [GLOBAL(x_c1sqr2less1)]
    paddw       mm5,        mm1                 ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      mm3,        [GLOBAL(x_s1sqr2)]
    paddw       mm3,        mm4                 ; ip3 * sin(pi/8) * sqrt(2)

    paddw       mm3,        mm5                 ; d1 = ip1 cos term + ip3 sin term
    movq        mm6,        mm2                 ; a1

    movq        mm4,        mm0                 ; b1
    paddw       mm2,        mm3                 ; a1 + d1   -> row 0

    paddw       mm4,        mm7                 ; b1 + mm7  -> used as row 2 below
    psubw       mm0,        mm7                 ; b1 - mm7  -> used as row 1 below

    psubw       mm6,        mm3                 ; a1 - d1   -> row 3

    ; ---- transpose 4x4 words ----
    movq        mm1,        mm2                 ; 03 02 01 00
    movq        mm3,        mm4                 ; 23 22 21 20

    punpcklwd   mm1,        mm0                 ; 11 01 10 00
    punpckhwd   mm2,        mm0                 ; 13 03 12 02

    punpcklwd   mm3,        mm6                 ; 31 21 30 20
    punpckhwd   mm4,        mm6                 ; 33 23 32 22

    movq        mm0,        mm1                 ; 11 01 10 00
    movq        mm5,        mm2                 ; 13 03 12 02

    punpckldq   mm0,        mm3                 ; 30 20 10 00
    punpckhdq   mm1,        mm3                 ; 31 21 11 01

    punpckldq   mm2,        mm4                 ; 32 22 12 02
    punpckhdq   mm5,        mm4                 ; 33 23 13 03

    movq        mm3,        mm5                 ; 33 23 13 03

    ; ---- pass 2 (same butterfly; inputs are now mm0..mm3 after transpose) ----
    psubw       mm0,        mm2                 ; b1 = ip0 - ip2
    paddw       mm2,        mm2                 ; 2 * ip2

    movq        mm5,        mm1
    paddw       mm2,        mm0                 ; a1 = ip0 + ip2

    pmulhw      mm5,        [GLOBAL(x_s1sqr2)]
    paddw       mm5,        mm1                 ; ip1 * sin(pi/8) * sqrt(2)

    movq        mm7,        mm3
    pmulhw      mm7,        [GLOBAL(x_c1sqr2less1)]

    paddw       mm7,        mm3                 ; ip3 * cos(pi/8) * sqrt(2)
    psubw       mm7,        mm5                 ; mm7 = ip3 term - ip1 term

    movq        mm5,        mm1
    movq        mm4,        mm3                 ; keep unscaled ip3 for the paddw below

    pmulhw      mm5,        [GLOBAL(x_c1sqr2less1)]
    paddw       mm5,        mm1                 ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      mm3,        [GLOBAL(x_s1sqr2)]
    paddw       mm3,        mm4                 ; ip3 * sin(pi/8) * sqrt(2)

    paddw       mm3,        mm5                 ; d1
    paddw       mm0,        [GLOBAL(fours)]     ; b1 + 4: rounding bias for the >> 3

    paddw       mm2,        [GLOBAL(fours)]     ; a1 + 4: rounding bias for the >> 3
    movq        mm6,        mm2                 ; a1 (biased)

    movq        mm4,        mm0                 ; b1 (biased)
    paddw       mm2,        mm3                 ; a1 + d1   -> row 0

    paddw       mm4,        mm7                 ; b1 + mm7  -> used as row 2 below
    psubw       mm0,        mm7                 ; b1 - mm7  -> used as row 1 below

    psubw       mm6,        mm3                 ; a1 - d1   -> row 3
    psraw       mm2,        3

    psraw       mm0,        3
    psraw       mm4,        3

    psraw       mm6,        3

    ; ---- transpose 4x4 words ----
    movq        mm1,        mm2                 ; 03 02 01 00
    movq        mm3,        mm4                 ; 23 22 21 20

    punpcklwd   mm1,        mm0                 ; 11 01 10 00
    punpckhwd   mm2,        mm0                 ; 13 03 12 02

    punpcklwd   mm3,        mm6                 ; 31 21 30 20
    punpckhwd   mm4,        mm6                 ; 33 23 32 22

    movq        mm0,        mm1                 ; 11 01 10 00
    movq        mm5,        mm2                 ; 13 03 12 02

    punpckldq   mm0,        mm3                 ; 30 20 10 00
    punpckhdq   mm1,        mm3                 ; 31 21 11 01

    punpckldq   mm2,        mm4                 ; 32 22 12 02
    punpckhdq   mm5,        mm4                 ; 33 23 13 03

    pxor        mm7,        mm7                 ; zero, for byte<->word widening/packing

    ; ---- add to destination: mm0, mm1, mm2, mm5 -> dest rows 0..3 ----
    movd        mm4,        [rdx]               ; 4 dest bytes
    punpcklbw   mm4,        mm7                 ; zero-extend bytes to words
    paddsw      mm0,        mm4                 ; signed saturating add
    packuswb    mm0,        mm7                 ; clamp to 0..255 and pack to bytes
    movd        [rdx],      mm0

    movd        mm4,        [rdx+rdi]
    punpcklbw   mm4,        mm7
    paddsw      mm1,        mm4
    packuswb    mm1,        mm7
    movd        [rdx+rdi],  mm1

    movd        mm4,        [rdx+2*rdi]
    punpcklbw   mm4,        mm7
    paddsw      mm2,        mm4
    packuswb    mm2,        mm7
    movd        [rdx+rdi*2], mm2

    add         rdx,        rdi                 ; so rdx+2*rdi below addresses dest + 3*stride

    movd        mm4,        [rdx+2*rdi]
    punpcklbw   mm4,        mm7
    paddsw      mm5,        mm4
    packuswb    mm5,        mm7
    movd        [rdx+rdi*2], mm5

    ; begin epilog
    pop         rdi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; 35468: multiplier for sin(pi/8)*sqrt(2); see the fixed-point note above.
align 16
x_s1sqr2:
    times 4 dw 0x8A8C
; 20091: multiplier for cos(pi/8)*sqrt(2) - 1; see the fixed-point note above.
align 16
x_c1sqr2less1:
    times 4 dw 0x4E7B
; Rounding bias added in pass 2 before the arithmetic shift right by 3.
align 16
fours:
    times 4 dw 0x0004