;
;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%define private_prefix vp9

%include "third_party/x86inc/x86inc.asm"

SECTION .text
ALIGN 16

;
; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
;                                     intptr_t block_size, int64_t *ssz)
;
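; The scalar computation this routine vectorizes can be sketched in C roughly
; as follows (an illustrative sketch only, not the project's reference
; implementation; the helper name is hypothetical):
;
;   int64_t block_error_8bit(const int32_t *coeff, const int32_t *dqcoeff,
;                            intptr_t block_size, int64_t *ssz) {
;     int64_t error = 0, sqcoeff = 0;
;     for (intptr_t i = 0; i < block_size; i++) {
;       const int64_t diff = coeff[i] - dqcoeff[i];
;       error += diff * diff;                    /* sum of squared errors  */
;       sqcoeff += (int64_t)coeff[i] * coeff[i]; /* sum of squared coeffs  */
;     }
;     *ssz = sqcoeff;     /* written through the ssz pointer               */
;     return error;       /* the squared-error sum is the return value     */
;   }
;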

INIT_XMM avx
cglobal highbd_block_error_8bit, 4, 5, 8, uqc, dqc, size, ssz
  vzeroupper

  ; If only one iteration is required, then handle this as a special case.
  ; It is the most frequent case, so we can have a significant gain here
  ; by not setting up a loop and accumulators.
  cmp       sizeq, 16
  jne       .generic

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;; Common case of size == 16
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

  ; Load input vectors
  mova      xm0, [dqcq]
  packssdw  xm0, [dqcq+16]
  mova      xm2, [uqcq]
  packssdw  xm2, [uqcq+16]

  mova      xm1, [dqcq+32]
  packssdw  xm1, [dqcq+48]
  mova      xm3, [uqcq+32]
  packssdw  xm3, [uqcq+48]

  ; Compute the errors.
  psubw     xm0, xm2
  psubw     xm1, xm3

  ; Individual errors are at most 15 bits + sign, so squares are at most
  ; 30 bits, and thus the sum of 2 fits in a 31-bit integer (+ unused sign
  ; bit).
  pmaddwd   xm2, xm2
  pmaddwd   xm3, xm3

  pmaddwd   xm0, xm0
  pmaddwd   xm1, xm1

  ; Squares are always positive, so we can use unsigned arithmetic after
  ; squaring. As mentioned earlier, 2 sums fit in 31 bits, so 4 sums will
  ; fit in 32 bits.
  paddd     xm2, xm3
  paddd     xm0, xm1

  ; Accumulate horizontally in 64 bits; there is no chance of overflow here.
  pxor      xm5, xm5

  pblendw   xm3, xm5, xm2, 0x33 ; Zero-extended low 32 bits of each 64-bit lane
  psrlq     xm2, 32             ; Zero-extended high 32 bits of each 64-bit lane

  pblendw   xm1, xm5, xm0, 0x33 ; Zero-extended low 32 bits of each 64-bit lane
  psrlq     xm0, 32             ; Zero-extended high 32 bits of each 64-bit lane

  paddq     xm2, xm3
  paddq     xm0, xm1

  psrldq    xm3, xm2, 8
  psrldq    xm1, xm0, 8

  paddq     xm2, xm3
  paddq     xm0, xm1

  ; Store the return value
%if ARCH_X86_64
  movq      rax, xm0
  movq      [sszq], xm2
%else
  movd      eax, xm0
  pextrd    edx, xm0, 1
  movq      [sszd], xm2
%endif
  RET

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;; Generic case of size != 16, speculative low precision
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ALIGN 16
.generic:
  pxor      xm4, xm4            ; sse accumulator
  pxor      xm5, xm5            ; overflow detection register for xm4
  pxor      xm6, xm6            ; ssz accumulator
  pxor      xm7, xm7            ; overflow detection register for xm6
  lea       uqcq, [uqcq+sizeq*4]
  lea       dqcq, [dqcq+sizeq*4]
  neg       sizeq

  ; Push the negative size, as the high-precision code might need it.
  push      sizeq

.loop:
  ; Load input vectors
  mova      xm0, [dqcq+sizeq*4]
  packssdw  xm0, [dqcq+sizeq*4+16]
  mova      xm2, [uqcq+sizeq*4]
  packssdw  xm2, [uqcq+sizeq*4+16]

  mova      xm1, [dqcq+sizeq*4+32]
  packssdw  xm1, [dqcq+sizeq*4+48]
  mova      xm3, [uqcq+sizeq*4+32]
  packssdw  xm3, [uqcq+sizeq*4+48]

  add       sizeq, 16

  ; Compute the squared errors.
  ; Individual errors are at most 15 bits + sign, so squares are at most
  ; 30 bits, and thus the sum of 2 fits in a 31-bit integer (+ unused sign
  ; bit).
  psubw     xm0, xm2
  pmaddwd   xm2, xm2
  pmaddwd   xm0, xm0

  psubw     xm1, xm3
  pmaddwd   xm3, xm3
  pmaddwd   xm1, xm1

  ; Squares are always positive, so we can use unsigned arithmetic after
  ; squaring. As mentioned earlier, 2 sums fit in 31 bits, so 4 sums will
  ; fit in 32 bits.
  paddd     xm2, xm3
  paddd     xm0, xm1

  ; We accumulate using 32-bit arithmetic, but detect potential overflow
  ; by checking whether the MSB of either accumulator has ever been set.
  ; If it has, we redo the whole computation at the end in higher precision,
  ; but this happens extremely rarely, so we still achieve a net gain.
  paddd     xm4, xm0
  paddd     xm6, xm2
  por       xm5, xm4            ; OR in the accumulator for overflow detection
  por       xm7, xm6            ; OR in the accumulator for overflow detection

  jnz       .loop               ; Flags are still set by the add above.

  ; Add pairs horizontally (still only on 32 bits).
  phaddd    xm4, xm4
  por       xm5, xm4            ; OR in the accumulator for overflow detection
  phaddd    xm6, xm6
  por       xm7, xm6            ; OR in the accumulator for overflow detection

  ; Check for the possibility of overflow by testing whether bit 31 (the MSB)
  ; of any dword lane has ever been set. If it has not, then there was no
  ; overflow and the final sum fits in 32 bits. If overflow happened, redo
  ; the whole computation in higher precision.
  por       xm7, xm5
  pmovmskb  r4, xm7
  test      r4, 0x8888
  jnz       .highprec

  phaddd    xm4, xm4
  phaddd    xm6, xm6
  pmovzxdq  xm4, xm4
  pmovzxdq  xm6, xm6

  ; Restore the stack.
  pop       sizeq

  ; Store the return value
%if ARCH_X86_64
  movq      rax, xm4
  movq      [sszq], xm6
%else
  movd      eax, xm4
  pextrd    edx, xm4, 1
  movq      [sszd], xm6
%endif
  RET

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;; Generic case of size != 16, high precision case
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.highprec:
  pxor      xm4, xm4            ; sse accumulator
  pxor      xm5, xm5            ; dedicated zero register
  pxor      xm6, xm6            ; ssz accumulator
  pop       sizeq

.loophp:
  mova      xm0, [dqcq+sizeq*4]
  packssdw  xm0, [dqcq+sizeq*4+16]
  mova      xm2, [uqcq+sizeq*4]
  packssdw  xm2, [uqcq+sizeq*4+16]

  mova      xm1, [dqcq+sizeq*4+32]
  packssdw  xm1, [dqcq+sizeq*4+48]
  mova      xm3, [uqcq+sizeq*4+32]
  packssdw  xm3, [uqcq+sizeq*4+48]

  add       sizeq, 16

  ; Individual errors are at most 15 bits + sign, so squares are at most
  ; 30 bits, and thus the sum of 2 fits in a 31-bit integer (+ unused sign
  ; bit).
  psubw     xm0, xm2
  pmaddwd   xm2, xm2
  pmaddwd   xm0, xm0

  psubw     xm1, xm3
  pmaddwd   xm3, xm3
  pmaddwd   xm1, xm1

  ; Accumulate in 64 bits.
  punpckldq xm7, xm0, xm5
  punpckhdq xm0, xm5
  paddq     xm4, xm7

  punpckldq xm7, xm2, xm5
  punpckhdq xm2, xm5
  paddq     xm6, xm7

  punpckldq xm7, xm1, xm5
  punpckhdq xm1, xm5
  paddq     xm4, xm7

  punpckldq xm7, xm3, xm5
  punpckhdq xm3, xm5
  paddq     xm6, xm7

  paddq     xm4, xm0
  paddq     xm4, xm1
  paddq     xm6, xm2
  paddq     xm6, xm3

  jnz       .loophp             ; Flags are still set by the add above.

  ; Accumulate horizontally.
  movhlps   xm5, xm4
  movhlps   xm7, xm6
  paddq     xm4, xm5
  paddq     xm6, xm7

  ; Store the return value
%if ARCH_X86_64
  movq      rax, xm4
  movq      [sszq], xm6
%else
  movd      eax, xm4
  pextrd    edx, xm4, 1
  movq      [sszd], xm6
%endif
  RET

END