;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
;
; 4x4 forward DCT on 16-bit words, computed with MMX in two passes.
;
;   input  - arg(0). Four rows of four 16-bit words are loaded from
;            input, input + pitch, input + 2*pitch and input + 3*pitch.
;            pitch is added to the address unscaled, i.e. it is a stride
;            in bytes.
;   output - arg(1). Sixteen 16-bit results (32 bytes) are stored
;            contiguously at output + 0, + 8, + 16 and + 24.
;   pitch  - arg(2). Read as a 32-bit value and sign-extended.
;
; Pass 1, for each input row (ip[0..3] = the row's four words):
;       a1 = (ip[0] + ip[3]) << 3       b1 = (ip[1] + ip[2]) << 3
;       c1 = (ip[1] - ip[2]) << 3       d1 = (ip[0] - ip[3]) << 3
;       op[0] = a1 + b1
;       op[2] = a1 - b1
;       op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12
;       op[3] = (d1 * 2217 - c1 * 5352 +  7500) >> 12
;
; Pass 2, for each column of the pass 1 result (ip[0], ip[4], ip[8],
; ip[12] = the column's four words):
;       a1 = ip[0] + ip[12]             b1 = ip[4] + ip[8]
;       c1 = ip[4] - ip[8]              d1 = ip[0] - ip[12]
;       op[0]  = (a1 + b1 + 7) >> 4
;       op[8]  = (a1 - b1 + 7) >> 4
;       op[4]  = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0)
;       op[12] =  (d1 * 2217 - c1 * 5352 + 51000) >> 16
;
; The 16-bit adds/subtracts/shifts wrap (paddw/psubw/psllw); the 32-bit
; products are narrowed back to 16 bits with signed saturation (packssdw).
;
; Registers written: rax, rcx, rsi, rdi, mm0-mm6. rsi and rdi are saved and
; restored here.
; NOTE(review): no emms is executed in this routine - presumably the caller
; clears the MMX state before any x87 code runs; confirm against callers.
global sym(vp8_short_fdct4x4_mmx) PRIVATE
sym(vp8_short_fdct4x4_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)      ; input
    mov         rdi,        arg(1)      ; output

    movsxd      rax,        dword ptr arg(2) ;pitch

    lea         rcx,        [rsi + rax*2]   ; rcx = address of row 2
    ; read the input data
    movq        mm0,        [rsi]           ; row 0
    movq        mm1,        [rsi + rax]     ; row 1

    movq        mm2,        [rcx]           ; row 2
    movq        mm4,        [rcx + rax]     ; row 3

    ; transpose for the first stage
    ; (lane comments are "row column", lowest word first)
    movq        mm3,        mm0         ; 00 01 02 03
    movq        mm5,        mm2         ; 20 21 22 23

    punpcklwd   mm0,        mm1         ; 00 10 01 11
    punpckhwd   mm3,        mm1         ; 02 12 03 13

    punpcklwd   mm2,        mm4         ; 20 30 21 31
    punpckhwd   mm5,        mm4         ; 22 32 23 33

    movq        mm1,        mm0         ; 00 10 01 11
    punpckldq   mm0,        mm2         ; 00 10 20 30

    punpckhdq   mm1,        mm2         ; 01 11 21 31

    movq        mm2,        mm3         ; 02 12 03 13
    punpckldq   mm2,        mm5         ; 02 12 22 32

    punpckhdq   mm3,        mm5         ; 03 13 23 33

    ; each register now holds one column; each word lane is one row
    ; mm0 0
    ; mm1 1
    ; mm2 2
    ; mm3 3

    ; first stage
    movq        mm5,        mm0
    movq        mm4,        mm1

    paddw       mm0,        mm3         ; a1 = 0 + 3
    paddw       mm1,        mm2         ; b1 = 1 + 2

    psubw       mm4,        mm2         ; c1 = 1 - 2
    psubw       mm5,        mm3         ; d1 = 0 - 3

    psllw       mm5,        3           ; d1 <<= 3
    psllw       mm4,        3           ; c1 <<= 3

    psllw       mm0,        3           ; a1 <<= 3
    psllw       mm1,        3           ; b1 <<= 3

    ; output 0 and 2
    movq        mm2,        mm0         ; a1

    paddw       mm0,        mm1         ; op[0] = a1 + b1
    psubw       mm2,        mm1         ; op[2] = a1 - b1

    ; output 1 and 3
    ; interleave c1, d1 so that pmaddwd can form both products and their
    ; sum in one instruction (word pairs are d1, c1 with d1 in the low word)
    movq        mm1,        mm5         ; d1
    punpcklwd   mm1,        mm4         ; d1 c1 pairs, rows 0-1
    punpckhwd   mm5,        mm4         ; d1 c1 pairs, rows 2-3

    movq        mm3,        mm1
    movq        mm4,        mm5

    pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
    pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352

    pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
    pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

    paddd       mm1,        MMWORD PTR[GLOBAL(_14500)]
    paddd       mm4,        MMWORD PTR[GLOBAL(_14500)]
    paddd       mm3,        MMWORD PTR[GLOBAL(_7500)]
    paddd       mm5,        MMWORD PTR[GLOBAL(_7500)]

    psrad       mm1,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
    psrad       mm4,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
    psrad       mm3,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
    psrad       mm5,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12

    packssdw    mm1,        mm4         ; op[1]
    packssdw    mm3,        mm5         ; op[3]

    ; done with the first pass
    ; transpose for the second stage
    ; (lane comments are "row column" of the first-pass result)
    movq        mm4,        mm0         ; 00 10 20 30
    movq        mm5,        mm2         ; 02 12 22 32

    punpcklwd   mm0,        mm1         ; 00 01 10 11
    punpckhwd   mm4,        mm1         ; 20 21 30 31

    punpcklwd   mm2,        mm3         ; 02 03 12 13
    punpckhwd   mm5,        mm3         ; 22 23 32 33

    movq        mm1,        mm0         ; 00 01 10 11
    punpckldq   mm0,        mm2         ; 00 01 02 03

    punpckhdq   mm1,        mm2         ; 10 11 12 13

    movq        mm2,        mm4         ; 20 21 30 31
    punpckldq   mm2,        mm5         ; 20 21 22 23

    punpckhdq   mm4,        mm5         ; 30 31 32 33

    ; each register now holds one row; each word lane is one column
    ; mm0 0
    ; mm1 1
    ; mm2 2
    ; mm4 3

    ; second stage
    movq        mm5,        mm0
    movq        mm3,        mm1

    paddw       mm0,        mm4         ; a1 = 0 + 3
    paddw       mm1,        mm2         ; b1 = 1 + 2

    psubw       mm3,        mm2         ; c1 = 1 - 2
    psubw       mm5,        mm4         ; d1 = 0 - 3

    pxor        mm6,        mm6         ; zero out for compare

    pcmpeqw     mm6,        mm5         ; all ones in lanes where d1 == 0

    pandn       mm6,        MMWORD PTR[GLOBAL(_cmp_mask)]   ; invert and mask with 1:
                                                            ; lane = (d1 != 0) ? 1 : 0

    ; output 0 and 2
    movq        mm2,        mm0         ; a1

    paddw       mm0,        mm1         ; a1 + b1
    psubw       mm2,        mm1         ; a1 - b1

    paddw       mm0,        MMWORD PTR[GLOBAL(_7w)]
    paddw       mm2,        MMWORD PTR[GLOBAL(_7w)]

    psraw       mm0,        4           ; op[0] = (a1 + b1 + 7)>>4
    psraw       mm2,        4           ; op[8] = (a1 - b1 + 7)>>4

    movq        MMWORD PTR[rdi + 0 ],  mm0
    movq        MMWORD PTR[rdi + 16],  mm2

    ; output 1 and 3
    ; interleave c1, d1 (d1 in the low word of each pair, as in the first pass)
    movq        mm1,        mm5         ; d1
    punpcklwd   mm1,        mm3         ; d1 c1 pairs, columns 0-1
    punpckhwd   mm5,        mm3         ; d1 c1 pairs, columns 2-3

    movq        mm3,        mm1
    movq        mm4,        mm5

    pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
    pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352

    pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
    pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

    paddd       mm1,        MMWORD PTR[GLOBAL(_12000)]
    paddd       mm4,        MMWORD PTR[GLOBAL(_12000)]
    paddd       mm3,        MMWORD PTR[GLOBAL(_51000)]
    paddd       mm5,        MMWORD PTR[GLOBAL(_51000)]

    psrad       mm1,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
    psrad       mm4,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
    psrad       mm3,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
    psrad       mm5,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16

    packssdw    mm1,        mm4         ; op[4]
    packssdw    mm3,        mm5         ; op[12]

    paddw       mm1,        mm6         ; op[4] += (d1!=0)

    movq        MMWORD PTR[rdi + 8 ],  mm1
    movq        MMWORD PTR[rdi + 24],  mm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; pmaddwd multipliers for the interleaved (d1, c1) word pairs:
; d1*5352 + c1*2217
align 8
_5352_2217:
    dw 5352
    dw 2217
    dw 5352
    dw 2217
; pmaddwd multipliers for the interleaved (d1, c1) word pairs:
; d1*2217 - c1*5352
align 8
_2217_neg5352:
    dw 2217
    dw -5352
    dw 2217
    dw -5352
; per-word value 1, used to turn the d1 compare mask into 0/1
align 8
_cmp_mask:
    times 4 dw 1
; second-pass rounding term for op[0] and op[8] (before >>4)
align 8
_7w:
    times 4 dw 7
; first-pass rounding term for op[1] (before >>12)
align 8
_14500:
    times 2 dd 14500
; first-pass rounding term for op[3] (before >>12)
align 8
_7500:
    times 2 dd 7500
; second-pass rounding term for op[4] (before >>16)
align 8
_12000:
    times 2 dd 12000
; second-pass rounding term for op[12] (before >>16)
align 8
_51000:
    times 2 dd 51000