;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; /****************************************************************************
; * Notes:
; *
; * This implementation makes use of 16 bit fixed point versions of two
; * multiply constants:
; *        1.   sqrt(2) * cos (pi/8)
; *        2.   sqrt(2) * sin (pi/8)
; * Because the first constant is bigger than 1, to maintain the same 16 bit
; * fixed point precision as the second one, we use a trick of
; *        x * a = x + x*(a-1)
; * so
; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
; *
; * For the second constant, the 16 bit version is 35468, which is bigger
; * than 32768, so in a signed 16 bit multiply it becomes a negative number.
; * Adding x back after the multiply compensates for that:
; *        (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x
; *
; **************************************************************************/


;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
;int pitch, unsigned char *dest,int stride)
;
; 4x4 inverse transform of `input`, added to a 4x4 predictor block.
;   input  - 16 coefficients, read as four 8-byte rows; not written here
;            (the code that would zero it is disabled with %if 0)
;   pred   - predictor pixels; 4 bytes are read per row, rows `pitch` apart
;   dest   - output pixels; 4 bytes are written per row, rows `stride` apart
;
; Two identical 1-D passes are run, each followed by a 4x4 word transpose.
; Every MMX register holds four 16-bit lanes, so one pass performs four
; 1-D transforms in parallel.  Only the second pass rounds (+4) and
; shifts (>> 3).  The result is added to `pred` with signed saturation and
; packed to bytes with unsigned saturation.
;
; NOTE(review): no emms is issued before returning; presumably the caller
; is responsible for clearing MMX state -- confirm.
global sym(vp8_short_idct4x4llm_mmx) PRIVATE
sym(vp8_short_idct4x4llm_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5          ; ABI macros come from x86_abi_support.asm
    GET_GOT     rbx                 ; paired with RESTORE_GOT in the epilog
    push        rsi
    push        rdi
    ; end prolog

    mov         rax,    arg(0)              ;input
    mov         rsi,    arg(1)              ;pred

    ; load the four coefficient rows (4 words each): ip0..ip3
    movq        mm0,    [rax   ]
    movq        mm1,    [rax+ 8]
    movq        mm2,    [rax+16]
    movq        mm3,    [rax+24]

; disabled: would clear the 16 input coefficients after they are loaded
%if 0
    pxor        mm7,    mm7
    movq        [rax],   mm7
    movq        [rax+8], mm7
    movq        [rax+16],mm7
    movq        [rax+24],mm7
%endif
    ; rax is reused from here on: it now holds the predictor pitch
    movsxd      rax,    dword ptr arg(2)    ;pitch
    mov         rdx,    arg(3)              ;dest
    movsxd      rdi,    dword ptr arg(4)    ;stride


    ; ---- first pass -------------------------------------------------------
    psubw       mm0,            mm2             ; b1= 0-2
    paddw       mm2,            mm2             ; 2*ip2

    movq        mm5,            mm1
    paddw       mm2,            mm0             ; a1 =0+2  ((0-2) + 2*2)

    ; signed high-word multiply, then add the input back (see Notes)
    pmulhw      mm5,            [GLOBAL(x_s1sqr2)]; ip1 * (signed)35468 >> 16
    paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)

    movq        mm7,            mm3             ; ip3
    pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]; ip3 * (a-1)

    paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
    psubw       mm7,            mm5             ; c1 = ip3*cos - ip1*sin

    movq        mm5,            mm1             ; ip1
    movq        mm4,            mm3             ; ip3

    pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
    paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
    paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)

    paddw       mm3,            mm5             ; d1 = ip1*cos + ip3*sin
    movq        mm6,            mm2             ; a1

    movq        mm4,            mm0             ; b1
    paddw       mm2,            mm3             ;0  a1 + d1

    paddw       mm4,            mm7             ;1  b1 + c1
    psubw       mm0,            mm7             ;2  b1 - c1

    psubw       mm6,            mm3             ;3  a1 - d1

    ; ---- transpose the 4x4 block of words ---------------------------------
    ; NOTE(review): the ";1"/";2" tags above and the "2x"/"1x" lane labels
    ; below name mm4 and mm0 in opposite order.  The code pairs mm2 with mm0
    ; and mm4 with mm6 -- confirm which labelling is intended.
    movq        mm1,            mm2             ; 03 02 01 00
    movq        mm3,            mm4             ; 23 22 21 20

    punpcklwd   mm1,            mm0             ; 11 01 10 00
    punpckhwd   mm2,            mm0             ; 13 03 12 02

    punpcklwd   mm3,            mm6             ; 31 21 30 20
    punpckhwd   mm4,            mm6             ; 33 23 32 22

    movq        mm0,            mm1             ; 11 01 10 00
    movq        mm5,            mm2             ; 13 03 12 02

    punpckldq   mm0,            mm3             ; 30 20 10 00
    punpckhdq   mm1,            mm3             ; 31 21 11 01

    punpckldq   mm2,            mm4             ; 32 22 12 02
    punpckhdq   mm5,            mm4             ; 33 23 13 03

    ; put the fourth vector in mm3 so the second pass sees the same
    ; mm0..mm3 register layout as the first pass
    movq        mm3,            mm5             ; 33 23 13 03

    ; ---- second pass (same butterfly, plus rounding and >> 3) -------------
    psubw       mm0,            mm2             ; b1= 0-2
    paddw       mm2,            mm2             ; 2*ip2

    movq        mm5,            mm1
    paddw       mm2,            mm0             ; a1 =0+2

    pmulhw      mm5,            [GLOBAL(x_s1sqr2)];
    paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)

    movq        mm7,            mm3             ; ip3
    pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];

    paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
    psubw       mm7,            mm5             ; c1 = ip3*cos - ip1*sin

    movq        mm5,            mm1             ; ip1
    movq        mm4,            mm3             ; ip3

    pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
    paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
    paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)

    paddw       mm3,            mm5             ; d1
    ; add the rounding constant to a1 and b1 once; it then carries into all
    ; four outputs (a1 +/- d1, b1 +/- c1) before the shift by 3
    paddw       mm0,            [GLOBAL(fours)] ; b1 + 4

    paddw       mm2,            [GLOBAL(fours)] ; a1 + 4
    movq        mm6,            mm2             ; a1

    movq        mm4,            mm0             ; b1
    paddw       mm2,            mm3             ;0  a1 + d1

    paddw       mm4,            mm7             ;1  b1 + c1
    psubw       mm0,            mm7             ;2  b1 - c1

    psubw       mm6,            mm3             ;3  a1 - d1
    psraw       mm2,            3               ; arithmetic >> 3 on each output

    psraw       mm0,            3
    psraw       mm4,            3

    psraw       mm6,            3

    ; ---- transpose again (same sequence as after the first pass) ----------
    movq        mm1,            mm2             ; 03 02 01 00
    movq        mm3,            mm4             ; 23 22 21 20

    punpcklwd   mm1,            mm0             ; 11 01 10 00
    punpckhwd   mm2,            mm0             ; 13 03 12 02

    punpcklwd   mm3,            mm6             ; 31 21 30 20
    punpckhwd   mm4,            mm6             ; 33 23 32 22

    movq        mm0,            mm1             ; 11 01 10 00
    movq        mm5,            mm2             ; 13 03 12 02

    punpckldq   mm0,            mm3             ; 30 20 10 00
    punpckhdq   mm1,            mm3             ; 31 21 11 01

    punpckldq   mm2,            mm4             ; 32 22 12 02
    punpckhdq   mm5,            mm4             ; 33 23 13 03

    ; ---- add predictor and store ------------------------------------------
    ; Per row: load 4 predictor bytes, widen to words against the zero
    ; register, add the residual with signed saturation, pack back to bytes
    ; with unsigned saturation, and store 4 bytes.
    pxor        mm7,            mm7             ; zero, used for widening/packing

    movd        mm4,            [rsi]           ; pred row 0
    punpcklbw   mm4,            mm7
    paddsw      mm0,            mm4
    packuswb    mm0,            mm7
    movd        [rdx],          mm0             ; dest row 0

    movd        mm4,            [rsi+rax]       ; pred row 1
    punpcklbw   mm4,            mm7
    paddsw      mm1,            mm4
    packuswb    mm1,            mm7
    movd        [rdx+rdi],      mm1             ; dest row 1

    movd        mm4,            [rsi+2*rax]     ; pred row 2
    punpcklbw   mm4,            mm7
    paddsw      mm2,            mm4
    packuswb    mm2,            mm7
    movd        [rdx+rdi*2],    mm2             ; dest row 2

    ; step both pointers down one row so base + 2*stride addresses row 3
    add         rdx,            rdi
    add         rsi,            rax

    movd        mm4,            [rsi+2*rax]     ; pred row 3
    punpcklbw   mm4,            mm7
    paddsw      mm5,            mm4
    packuswb    mm5,            mm7
    movd        [rdx+rdi*2],    mm5             ; dest row 3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_dc_only_idct_add_mmx(
;short input_dc,
;unsigned char *pred_ptr,
;int pred_stride,
;unsigned char *dst_ptr,
;int stride)
;
; Adds the single value (input_dc + 4) >> 3 to every pixel of a 4x4
; predictor block and stores the result.
;   input_dc    - value to add; only the low 16 bits of the loaded argument
;                 survive the broadcast below
;   pred_ptr    - predictor pixels; 4 bytes read per row, `pred_stride` apart
;   dst_ptr     - output pixels; 4 bytes written per row, rows arg(4) bytes
;                 apart (named `stride` in the prototype above)
; The sum uses signed saturation on words and is packed to bytes with
; unsigned saturation.
;
; NOTE(review): no emms is issued before returning; presumably the caller
; is responsible for clearing MMX state -- confirm.
global sym(vp8_dc_only_idct_add_mmx) PRIVATE
sym(vp8_dc_only_idct_add_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5          ; ABI macros come from x86_abi_support.asm
    GET_GOT     rbx                 ; paired with RESTORE_GOT in the epilog
    ; end prolog

        movd        mm5,            arg(0) ;input_dc
        mov         rax,            arg(1) ;pred_ptr
        movsxd      rdx,            dword ptr arg(2) ;pred_stride

        pxor        mm0,            mm0    ; zero, used for widening/packing

        paddw       mm5,            [GLOBAL(fours)]  ; dc + 4 (rounding)
        lea         rcx,            [rdx + rdx*2]    ; 3 * pred_stride

        psraw       mm5,            3                ; (dc + 4) >> 3

        ; broadcast word 0 to all four lanes
        punpcklwd   mm5,            mm5

        punpckldq   mm5,            mm5

        ; load the four predictor rows, 4 bytes each
        movd        mm1,            [rax]
        movd        mm2,            [rax+rdx]
        movd        mm3,            [rax+2*rdx]
        movd        mm4,            [rax+rcx]

        ; rax/rdx/rcx are reused for the destination from here on
        mov         rax,            arg(3) ;d -- destination
        movsxd      rdx,            dword ptr arg(4) ;dst_stride

        punpcklbw   mm1,            mm0    ; widen row 0 to words
        paddsw      mm1,            mm5
        packuswb    mm1,            mm0              ; pack to bytes, unsigned saturation
        lea         rcx,            [rdx + rdx*2]    ; 3 * dst_stride

        punpcklbw   mm2,            mm0    ; row 1
        paddsw      mm2,            mm5
        packuswb    mm2,            mm0              ; pack to bytes, unsigned saturation

        punpcklbw   mm3,            mm0    ; row 2
        paddsw      mm3,            mm5
        packuswb    mm3,            mm0              ; pack to bytes, unsigned saturation

        punpcklbw   mm4,            mm0    ; row 3
        paddsw      mm4,            mm5
        packuswb    mm4,            mm0              ; pack to bytes, unsigned saturation

        ; store the four output rows, 4 bytes each
        movd        [rax],          mm1
        movd        [rax+rdx],      mm2
        movd        [rax+2*rdx],    mm3
        movd        [rax+rcx],      mm4

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; 35468: 16 bit fixed point sqrt(2) * sin(pi/8); negative as a signed word,
; so every use is followed by adding the input back (see Notes)
align 16
x_s1sqr2:
    times 4 dw 0x8A8C
; 20091: 16 bit fixed point sqrt(2) * cos(pi/8) - 1; every use is followed
; by adding the input back, i.e. x + x*(a-1) (see Notes)
align 16
x_c1sqr2less1:
    times 4 dw 0x4E7B
; rounding term added before the arithmetic shift right by 3
align 16
fours:
    times 4 dw 0x0004