/*
 * Copyright 2012 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/basic_types.h"

#include "libyuv/compare_row.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

__declspec(naked) uint32
    SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
  __asm {
    mov        eax, [esp + 4]   // src_a
    mov        edx, [esp + 8]   // src_b
    mov        ecx, [esp + 12]  // count
    pxor       xmm0, xmm0       // running sum of squared differences
    pxor       xmm5, xmm5       // constant 0 for unpck

  wloop:
    movdqu     xmm1, [eax]
    lea        eax,  [eax + 16]
    movdqu     xmm2, [edx]
    lea        edx,  [edx + 16]
    movdqa     xmm3, xmm1  // abs trick
    psubusb    xmm1, xmm2
    psubusb    xmm2, xmm3
    por        xmm1, xmm2
    movdqa     xmm2, xmm1
    punpcklbw  xmm1, xmm5
    punpckhbw  xmm2, xmm5
    pmaddwd    xmm1, xmm1
    pmaddwd    xmm2, xmm2
    paddd      xmm0, xmm1
    paddd      xmm0, xmm2
    sub        ecx, 16
    jg         wloop

    pshufd     xmm1, xmm0, 0xee
    paddd      xmm0, xmm1
    pshufd     xmm1, xmm0, 0x01
    paddd      xmm0, xmm1
    movd       eax, xmm0
    ret
  }
}
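
// For reference, a portable sketch of what both SumSquareError versions in
// this file compute: the sum of squared byte differences over count bytes.
// The asm loops assume count is a multiple of 16 (SSE2) or 32 (AVX2); this
// scalar form has no such restriction. Illustrative only; the helper name is
// ours and is not a libyuv entry point.
static uint32 SumSquareError_Reference(const uint8* src_a,
                                       const uint8* src_b,
                                       int count) {
  uint32 sse = 0u;
  int i;
  for (i = 0; i < count; ++i) {
    int diff = src_a[i] - src_b[i];  // psubusb both ways + por == |a - b|
    sse += (uint32)(diff * diff);    // pmaddwd squares and pair-sums to u32
  }
  return sse;
}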

// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
#pragma warning(disable : 4752)
__declspec(naked) uint32
    SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
  __asm {
    mov        eax, [esp + 4]    // src_a
    mov        edx, [esp + 8]    // src_b
    mov        ecx, [esp + 12]   // count
    vpxor      ymm0, ymm0, ymm0  // sum
    vpxor      ymm5, ymm5, ymm5  // constant 0 for unpck
    sub        edx, eax

  wloop:
    vmovdqu    ymm1, [eax]
    vmovdqu    ymm2, [eax + edx]
    lea        eax,  [eax + 32]
    vpsubusb   ymm3, ymm1, ymm2  // abs difference trick
    vpsubusb   ymm2, ymm2, ymm1
    vpor       ymm1, ymm2, ymm3
    vpunpcklbw ymm2, ymm1, ymm5  // u16. mutates order.
    vpunpckhbw ymm1, ymm1, ymm5
    vpmaddwd   ymm2, ymm2, ymm2  // square + hadd to u32.
    vpmaddwd   ymm1, ymm1, ymm1
    vpaddd     ymm0, ymm0, ymm1
    vpaddd     ymm0, ymm0, ymm2
    sub        ecx, 32
    jg         wloop

    vpshufd    ymm1, ymm0, 0xee  // 3, 2 + 1, 0 both lanes.
    vpaddd     ymm0, ymm0, ymm1
    vpshufd    ymm1, ymm0, 0x01  // 1 + 0 both lanes.
    vpaddd     ymm0, ymm0, ymm1
    vpermq     ymm1, ymm0, 0x02  // high + low lane.
    vpaddd     ymm0, ymm0, ymm1
    vmovd      eax, xmm0
    vzeroupper
    ret
  }
}
#endif  // _MSC_VER >= 1700

uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0};  // 33 ^ 16
uvec32 kHashMul0 = {
    0x0c3525e1,  // 33 ^ 15
    0xa3476dc1,  // 33 ^ 14
    0x3b4039a1,  // 33 ^ 13
    0x4f5f0981,  // 33 ^ 12
};
uvec32 kHashMul1 = {
    0x30f35d61,  // 33 ^ 11
    0x855cb541,  // 33 ^ 10
    0x040a9121,  // 33 ^ 9
    0x747c7101,  // 33 ^ 8
};
uvec32 kHashMul2 = {
    0xec41d4e1,  // 33 ^ 7
    0x4cfa3cc1,  // 33 ^ 6
    0x025528a1,  // 33 ^ 5
    0x00121881,  // 33 ^ 4
};
uvec32 kHashMul3 = {
    0x00008c61,  // 33 ^ 3
    0x00000441,  // 33 ^ 2
    0x00000021,  // 33 ^ 1
    0x00000001,  // 33 ^ 0
};

__declspec(naked) uint32
    HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
  __asm {
    mov        eax, [esp + 4]    // src
    mov        ecx, [esp + 8]    // count
    movd       xmm0, [esp + 12]  // seed

    pxor       xmm7, xmm7  // constant 0 for unpck
    movdqa     xmm6, xmmword ptr kHash16x33

  wloop:
    movdqu     xmm1, [eax]  // src[0-15]
    lea        eax, [eax + 16]
    pmulld     xmm0, xmm6  // hash *= 33 ^ 16
    movdqa     xmm5, xmmword ptr kHashMul0
    movdqa     xmm2, xmm1
    punpcklbw  xmm2, xmm7  // src[0-7]
    movdqa     xmm3, xmm2
    punpcklwd  xmm3, xmm7  // src[0-3]
    pmulld     xmm3, xmm5
    movdqa     xmm5, xmmword ptr kHashMul1
    movdqa     xmm4, xmm2
    punpckhwd  xmm4, xmm7  // src[4-7]
    pmulld     xmm4, xmm5
    movdqa     xmm5, xmmword ptr kHashMul2
    punpckhbw  xmm1, xmm7  // src[8-15]
    movdqa     xmm2, xmm1
    punpcklwd  xmm2, xmm7  // src[8-11]
    pmulld     xmm2, xmm5
    movdqa     xmm5, xmmword ptr kHashMul3
    punpckhwd  xmm1, xmm7  // src[12-15]
    pmulld     xmm1, xmm5
    paddd      xmm3, xmm4  // add 16 results
    paddd      xmm1, xmm2
    paddd      xmm1, xmm3

    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
    paddd      xmm1, xmm2
    pshufd     xmm2, xmm1, 0x01
    paddd      xmm1, xmm2
    paddd      xmm0, xmm1
    sub        ecx, 16
    jg         wloop

    movd       eax, xmm0  // return hash
    ret
  }
}

// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
__declspec(naked) uint32
    HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
  __asm {
    mov        eax, [esp + 4]    // src
    mov        ecx, [esp + 8]    // count
    vmovd      xmm0, [esp + 12]  // seed

  wloop:
    vpmovzxbd  xmm3, [eax]  // src[0-3]
    vpmulld    xmm0, xmm0, xmmword ptr kHash16x33  // hash *= 33 ^ 16
    vpmovzxbd  xmm4, [eax + 4]  // src[4-7]
    vpmulld    xmm3, xmm3, xmmword ptr kHashMul0
    vpmovzxbd  xmm2, [eax + 8]  // src[8-11]
    vpmulld    xmm4, xmm4, xmmword ptr kHashMul1
    vpmovzxbd  xmm1, [eax + 12]  // src[12-15]
    vpmulld    xmm2, xmm2, xmmword ptr kHashMul2
    lea        eax, [eax + 16]
    vpmulld    xmm1, xmm1, xmmword ptr kHashMul3
    vpaddd     xmm3, xmm3, xmm4  // add 16 results
    vpaddd     xmm1, xmm1, xmm2
    vpaddd     xmm1, xmm1, xmm3
    vpshufd    xmm2, xmm1, 0x0e  // upper 2 dwords
    vpaddd     xmm1, xmm1, xmm2
    vpshufd    xmm2, xmm1, 0x01
    vpaddd     xmm1, xmm1, xmm2
    vpaddd     xmm0, xmm0, xmm1
    sub        ecx, 16
    jg         wloop

    vmovd      eax, xmm0  // return hash
    vzeroupper
    ret
  }
}
#endif  // _MSC_VER >= 1700

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
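
// Reference note: HashDjb2_SSE41 and HashDjb2_AVX2 vectorize Bernstein's djb2
// recurrence, hash = hash * 33 + src[i], modulo 2^32. Unrolled 16 bytes at a
// time the recurrence becomes
//   hash' = hash * 33^16 + src[0] * 33^15 + src[1] * 33^14 + ... + src[15],
// which is exactly what kHash16x33 and kHashMul0..kHashMul3 encode. A scalar
// sketch (illustrative only; it matches the SIMD versions whenever count is a
// multiple of 16, as the asm loops require):
//
//   uint32 hash = seed;
//   int i;
//   for (i = 0; i < count; ++i) {
//     hash = hash * 33u + src[i];
//   }
//   return hash;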