1 /*
2  *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "libyuv/basic_types.h"
12 
13 #include "libyuv/compare_row.h"
14 #include "libyuv/row.h"
15 
16 #ifdef __cplusplus
17 namespace libyuv {
18 extern "C" {
19 #endif
20 
21 // This module is for GCC x86 and x64.
22 #if !defined(LIBYUV_DISABLE_X86) && \
23     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
24 
25 #if defined(__x86_64__)
// Returns the number of bit positions at which src_a and src_b differ
// (x86-64 variant). Each pass XORs 32 bytes of the two inputs as four 64-bit
// words and runs popcnt on every word, keeping four independent running
// totals that are combined after the loop. The 64-bit total is truncated to
// 32 bits on return.
// The loop tests count only after a pass has run, so at least 32 bytes are
// always read and count is consumed in 32-byte steps; callers presumably pass
// a positive multiple of 32 -- TODO confirm.
HammingDistance_SSE42(const uint8_t * src_a,const uint8_t * src_b,int count)26 uint32_t HammingDistance_SSE42(const uint8_t* src_a,
27                                const uint8_t* src_b,
28                                int count) {
29   uint64_t diff = 0u;
30 
31   asm volatile(
      // Clear the four accumulators: %3 (diff), r8, r9 and r10.
32       "xor        %3,%3                          \n"
33       "xor        %%r8,%%r8                      \n"
34       "xor        %%r9,%%r9                      \n"
35       "xor        %%r10,%%r10                    \n"
36 
37       // Process 32 bytes per loop.
38       LABELALIGN
39       "1:                                        \n"
      // Bytes 0..15: XOR the two inputs, then count the set bits per word.
40       "mov        (%0),%%rcx                     \n"
41       "mov        0x8(%0),%%rdx                  \n"
42       "xor        (%1),%%rcx                     \n"
43       "xor        0x8(%1),%%rdx                  \n"
44       "popcnt     %%rcx,%%rcx                    \n"
45       "popcnt     %%rdx,%%rdx                    \n"
      // Bytes 16..31: same treatment.
46       "mov        0x10(%0),%%rsi                 \n"
47       "mov        0x18(%0),%%rdi                 \n"
48       "xor        0x10(%1),%%rsi                 \n"
49       "xor        0x18(%1),%%rdi                 \n"
50       "popcnt     %%rsi,%%rsi                    \n"
51       "popcnt     %%rdi,%%rdi                    \n"
      // Advance both source pointers by 32 bytes.
52       "add        $0x20,%0                       \n"
53       "add        $0x20,%1                       \n"
      // Each word's bit count goes to its own accumulator.
54       "add        %%rcx,%3                       \n"
55       "add        %%rdx,%%r8                     \n"
56       "add        %%rsi,%%r9                     \n"
57       "add        %%rdi,%%r10                    \n"
      // Loop again while count - 32 is still greater than zero.
58       "sub        $0x20,%2                       \n"
59       "jg         1b                             \n"
60 
      // Combine the partial sums into diff (%3).
61       "add        %%r8, %3                       \n"
62       "add        %%r9, %3                       \n"
63       "add        %%r10, %3                      \n"
64       : "+r"(src_a),  // %0
65         "+r"(src_b),  // %1
66         "+r"(count),  // %2
67         "=r"(diff)    // %3
68       :
69       : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10");
70 
71   return static_cast<uint32_t>(diff);
72 }
73 #else
// Returns the number of bit positions at which src_a and src_b differ
// (32-bit x86 variant). Each pass XORs 16 bytes of the two inputs as four
// 32-bit words and adds the popcnt of every word to diff. diff is zeroed in
// C and passed as a read-write operand, so the asm only ever adds to it.
// The loop tests count only after a pass has run, so at least 16 bytes are
// always read and count is consumed in 16-byte steps; callers presumably pass
// a positive multiple of 16 -- TODO confirm.
74 uint32_t HammingDistance_SSE42(const uint8_t* src_a,
75                                const uint8_t* src_b,
76                                int count) {
77   uint32_t diff = 0u;
78 
79   asm volatile(
80       // Process 16 bytes per loop.
81       LABELALIGN
82       "1:                                        \n"
      // Bytes 0..7: XOR, count bits, accumulate into diff (%3).
83       "mov        (%0),%%ecx                     \n"
84       "mov        0x4(%0),%%edx                  \n"
85       "xor        (%1),%%ecx                     \n"
86       "xor        0x4(%1),%%edx                  \n"
87       "popcnt     %%ecx,%%ecx                    \n"
88       "add        %%ecx,%3                       \n"
89       "popcnt     %%edx,%%edx                    \n"
90       "add        %%edx,%3                       \n"
      // Bytes 8..15: same treatment, reusing ecx and edx.
91       "mov        0x8(%0),%%ecx                  \n"
92       "mov        0xc(%0),%%edx                  \n"
93       "xor        0x8(%1),%%ecx                  \n"
94       "xor        0xc(%1),%%edx                  \n"
95       "popcnt     %%ecx,%%ecx                    \n"
96       "add        %%ecx,%3                       \n"
97       "popcnt     %%edx,%%edx                    \n"
98       "add        %%edx,%3                       \n"
      // Advance both pointers by 16 bytes and loop while count - 16 > 0.
99       "add        $0x10,%0                       \n"
100       "add        $0x10,%1                       \n"
101       "sub        $0x10,%2                       \n"
102       "jg         1b                             \n"
103       : "+r"(src_a),  // %0
104         "+r"(src_b),  // %1
105         "+r"(count),  // %2
106         "+r"(diff)    // %3
107       :
108       : "memory", "cc", "ecx", "edx");
109 
110   return diff;
111 }
112 #endif
113 
// Sixteen bytes of 15 (0x0f): ANDed with a vector to keep only the low nibble
// of every byte before it is used as a pshufb/vpshufb index.
114 static const vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15,
115                                  15, 15, 15, 15, 15, 15, 15, 15};
// Lookup table for pshufb/vpshufb: entry i is the number of set bits in the
// 4-bit value i (0..15).
116 static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
117 
HammingDistance_SSSE3(const uint8_t * src_a,const uint8_t * src_b,int count)118 uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
119                                const uint8_t* src_b,
120                                int count) {
121   uint32_t diff = 0u;
122 
123   asm volatile(
124       "movdqa     %4,%%xmm2                      \n"
125       "movdqa     %5,%%xmm3                      \n"
126       "pxor       %%xmm0,%%xmm0                  \n"
127       "pxor       %%xmm1,%%xmm1                  \n"
128       "sub        %0,%1                          \n"
129 
130       LABELALIGN
131       "1:                                        \n"
132       "movdqa     (%0),%%xmm4                    \n"
133       "movdqa     0x10(%0), %%xmm5               \n"
134       "pxor       (%0,%1), %%xmm4                \n"
135       "movdqa     %%xmm4,%%xmm6                  \n"
136       "pand       %%xmm2,%%xmm6                  \n"
137       "psrlw      $0x4,%%xmm4                    \n"
138       "movdqa     %%xmm3,%%xmm7                  \n"
139       "pshufb     %%xmm6,%%xmm7                  \n"
140       "pand       %%xmm2,%%xmm4                  \n"
141       "movdqa     %%xmm3,%%xmm6                  \n"
142       "pshufb     %%xmm4,%%xmm6                  \n"
143       "paddb      %%xmm7,%%xmm6                  \n"
144       "pxor       0x10(%0,%1),%%xmm5             \n"
145       "add        $0x20,%0                       \n"
146       "movdqa     %%xmm5,%%xmm4                  \n"
147       "pand       %%xmm2,%%xmm5                  \n"
148       "psrlw      $0x4,%%xmm4                    \n"
149       "movdqa     %%xmm3,%%xmm7                  \n"
150       "pshufb     %%xmm5,%%xmm7                  \n"
151       "pand       %%xmm2,%%xmm4                  \n"
152       "movdqa     %%xmm3,%%xmm5                  \n"
153       "pshufb     %%xmm4,%%xmm5                  \n"
154       "paddb      %%xmm7,%%xmm5                  \n"
155       "paddb      %%xmm5,%%xmm6                  \n"
156       "psadbw     %%xmm1,%%xmm6                  \n"
157       "paddd      %%xmm6,%%xmm0                  \n"
158       "sub        $0x20,%2                       \n"
159       "jg         1b                             \n"
160 
161       "pshufd     $0xaa,%%xmm0,%%xmm1            \n"
162       "paddd      %%xmm1,%%xmm0                  \n"
163       "movd       %%xmm0, %3                     \n"
164       : "+r"(src_a),       // %0
165         "+r"(src_b),       // %1
166         "+r"(count),       // %2
167         "=r"(diff)         // %3
168       : "m"(kNibbleMask),  // %4
169         "m"(kBitCount)     // %5
170       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
171         "xmm7");
172 
173   return diff;
174 }
175 
176 #ifdef HAS_HAMMINGDISTANCE_AVX2
HammingDistance_AVX2(const uint8_t * src_a,const uint8_t * src_b,int count)177 uint32_t HammingDistance_AVX2(const uint8_t* src_a,
178                               const uint8_t* src_b,
179                               int count) {
180   uint32_t diff = 0u;
181 
182   asm volatile(
183       "vbroadcastf128 %4,%%ymm2                  \n"
184       "vbroadcastf128 %5,%%ymm3                  \n"
185       "vpxor      %%ymm0,%%ymm0,%%ymm0           \n"
186       "vpxor      %%ymm1,%%ymm1,%%ymm1           \n"
187       "sub        %0,%1                          \n"
188 
189       LABELALIGN
190       "1:                                        \n"
191       "vmovdqa    (%0),%%ymm4                    \n"
192       "vmovdqa    0x20(%0), %%ymm5               \n"
193       "vpxor      (%0,%1), %%ymm4, %%ymm4        \n"
194       "vpand      %%ymm2,%%ymm4,%%ymm6           \n"
195       "vpsrlw     $0x4,%%ymm4,%%ymm4             \n"
196       "vpshufb    %%ymm6,%%ymm3,%%ymm6           \n"
197       "vpand      %%ymm2,%%ymm4,%%ymm4           \n"
198       "vpshufb    %%ymm4,%%ymm3,%%ymm4           \n"
199       "vpaddb     %%ymm4,%%ymm6,%%ymm6           \n"
200       "vpxor      0x20(%0,%1),%%ymm5,%%ymm4      \n"
201       "add        $0x40,%0                       \n"
202       "vpand      %%ymm2,%%ymm4,%%ymm5           \n"
203       "vpsrlw     $0x4,%%ymm4,%%ymm4             \n"
204       "vpshufb    %%ymm5,%%ymm3,%%ymm5           \n"
205       "vpand      %%ymm2,%%ymm4,%%ymm4           \n"
206       "vpshufb    %%ymm4,%%ymm3,%%ymm4           \n"
207       "vpaddb     %%ymm5,%%ymm4,%%ymm4           \n"
208       "vpaddb     %%ymm6,%%ymm4,%%ymm4           \n"
209       "vpsadbw    %%ymm1,%%ymm4,%%ymm4           \n"
210       "vpaddd     %%ymm0,%%ymm4,%%ymm0           \n"
211       "sub        $0x40,%2                       \n"
212       "jg         1b                             \n"
213 
214       "vpermq     $0xb1,%%ymm0,%%ymm1            \n"
215       "vpaddd     %%ymm1,%%ymm0,%%ymm0           \n"
216       "vpermq     $0xaa,%%ymm0,%%ymm1            \n"
217       "vpaddd     %%ymm1,%%ymm0,%%ymm0           \n"
218       "vmovd      %%xmm0, %3                     \n"
219       "vzeroupper                                \n"
220       : "+r"(src_a),       // %0
221         "+r"(src_b),       // %1
222         "+r"(count),       // %2
223         "=r"(diff)         // %3
224       : "m"(kNibbleMask),  // %4
225         "m"(kBitCount)     // %5
226       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
227 
228   return diff;
229 }
230 #endif  // HAS_HAMMINGDISTANCE_AVX2
231 
// Returns the sum of squared byte differences between src_a and src_b, using
// SSE2. Each pass loads 16 bytes from each input with unaligned loads,
// forms the absolute difference of every byte pair, widens it to 16 bits,
// squares it and accumulates the squares in the four 32-bit lanes of xmm0,
// which are added together after the loop.
// The loop tests count only after a pass has run, so at least 16 bytes are
// always read and count is consumed in 16-byte steps; callers presumably pass
// a positive multiple of 16 -- TODO confirm.
SumSquareError_SSE2(const uint8_t * src_a,const uint8_t * src_b,int count)232 uint32_t SumSquareError_SSE2(const uint8_t* src_a,
233                              const uint8_t* src_b,
234                              int count) {
235   uint32_t sse;
236   asm volatile(
      // xmm0 = running sum, xmm5 = zero (used to widen bytes to words).
237       "pxor      %%xmm0,%%xmm0                   \n"
238       "pxor      %%xmm5,%%xmm5                   \n"
239 
240       LABELALIGN
241       "1:                                        \n"
      // Load 16 bytes from each input and advance both pointers.
242       "movdqu    (%0),%%xmm1                     \n"
243       "lea       0x10(%0),%0                     \n"
244       "movdqu    (%1),%%xmm2                     \n"
245       "lea       0x10(%1),%1                     \n"
      // |a - b| per byte: saturating a-b and b-a, one of which is zero, ORed.
246       "movdqa    %%xmm1,%%xmm3                   \n"
247       "psubusb   %%xmm2,%%xmm1                   \n"
248       "psubusb   %%xmm3,%%xmm2                   \n"
249       "por       %%xmm2,%%xmm1                   \n"
      // Widen the low and high 8 differences to 16-bit words.
250       "movdqa    %%xmm1,%%xmm2                   \n"
251       "punpcklbw %%xmm5,%%xmm1                   \n"
252       "punpckhbw %%xmm5,%%xmm2                   \n"
      // pmaddwd of a register with itself squares each word and adds
      // adjacent pairs, giving four 32-bit partial sums per register.
253       "pmaddwd   %%xmm1,%%xmm1                   \n"
254       "pmaddwd   %%xmm2,%%xmm2                   \n"
255       "paddd     %%xmm1,%%xmm0                   \n"
256       "paddd     %%xmm2,%%xmm0                   \n"
      // Loop again while count - 16 is still greater than zero.
257       "sub       $0x10,%2                        \n"
258       "jg        1b                              \n"
259 
      // Horizontal sum of the four 32-bit lanes into lane 0, then store it.
260       "pshufd    $0xee,%%xmm0,%%xmm1             \n"
261       "paddd     %%xmm1,%%xmm0                   \n"
262       "pshufd    $0x1,%%xmm0,%%xmm1              \n"
263       "paddd     %%xmm1,%%xmm0                   \n"
264       "movd      %%xmm0,%3                       \n"
265 
266       : "+r"(src_a),  // %0
267         "+r"(src_b),  // %1
268         "+r"(count),  // %2
269         "=g"(sse)     // %3
270         ::"memory",
271         "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
272   return sse;
273 }
274 
// Multiplier tables for HashDjb2_SSE41. In the comments below "33 ^ n" means
// 33 raised to the power n (not XOR); each value is that power truncated to
// 32 bits.
//
// kHash16x33 scales the running hash by 33^16 once per 16-byte block. Only
// lane 0 is non-zero, so the multiply also clears the other three lanes.
275 static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0};  // 33 ^ 16
// Weights applied to bytes 0..3 of a 16-byte block.
276 static const uvec32 kHashMul0 = {
277     0x0c3525e1,  // 33 ^ 15
278     0xa3476dc1,  // 33 ^ 14
279     0x3b4039a1,  // 33 ^ 13
280     0x4f5f0981,  // 33 ^ 12
281 };
// Weights applied to bytes 4..7 of a 16-byte block.
282 static const uvec32 kHashMul1 = {
283     0x30f35d61,  // 33 ^ 11
284     0x855cb541,  // 33 ^ 10
285     0x040a9121,  // 33 ^ 9
286     0x747c7101,  // 33 ^ 8
287 };
// Weights applied to bytes 8..11 of a 16-byte block.
288 static const uvec32 kHashMul2 = {
289     0xec41d4e1,  // 33 ^ 7
290     0x4cfa3cc1,  // 33 ^ 6
291     0x025528a1,  // 33 ^ 5
292     0x00121881,  // 33 ^ 4
293 };
// Weights applied to bytes 12..15 of a 16-byte block.
294 static const uvec32 kHashMul3 = {
295     0x00008c61,  // 33 ^ 3
296     0x00000441,  // 33 ^ 2
297     0x00000021,  // 33 ^ 1
298     0x00000001,  // 33 ^ 0
299 };
300 
// Computes a multiply-by-33 hash of src (DJB2, going by the function name)
// with SSE4.1, starting from seed. For every 16-byte block the running hash
// is updated as
//   hash = hash * 33^16 + src[0] * 33^15 + src[1] * 33^14 + ... + src[15]
// with all arithmetic truncated to 32 bits.
// The loop tests count only after a pass has run, so at least 16 bytes are
// always read and count is consumed in 16-byte steps; callers presumably pass
// a positive multiple of 16 -- TODO confirm.
HashDjb2_SSE41(const uint8_t * src,int count,uint32_t seed)301 uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
302   uint32_t hash;
303   asm volatile(
      // xmm0 lane 0 = running hash (seed), xmm7 = zero for widening,
      // xmm6 = {33^16, 0, 0, 0}.
304       "movd      %2,%%xmm0                       \n"
305       "pxor      %%xmm7,%%xmm7                   \n"
306       "movdqa    %4,%%xmm6                       \n"
307 
308       LABELALIGN
309       "1:                                        \n"
      // Load 16 source bytes (unaligned) and advance the pointer.
310       "movdqu    (%0),%%xmm1                     \n"
311       "lea       0x10(%0),%0                     \n"
      // hash *= 33^16; lanes 1..3 of xmm0 are multiplied by zero.
312       "pmulld    %%xmm6,%%xmm0                   \n"
      // Bytes 0..3 widened to 32 bits and weighted by kHashMul0 -> xmm3.
313       "movdqa    %5,%%xmm5                       \n"
314       "movdqa    %%xmm1,%%xmm2                   \n"
315       "punpcklbw %%xmm7,%%xmm2                   \n"
316       "movdqa    %%xmm2,%%xmm3                   \n"
317       "punpcklwd %%xmm7,%%xmm3                   \n"
318       "pmulld    %%xmm5,%%xmm3                   \n"
      // Bytes 4..7 weighted by kHashMul1 -> xmm4.
319       "movdqa    %6,%%xmm5                       \n"
320       "movdqa    %%xmm2,%%xmm4                   \n"
321       "punpckhwd %%xmm7,%%xmm4                   \n"
322       "pmulld    %%xmm5,%%xmm4                   \n"
      // Bytes 8..11 weighted by kHashMul2 -> xmm2.
323       "movdqa    %7,%%xmm5                       \n"
324       "punpckhbw %%xmm7,%%xmm1                   \n"
325       "movdqa    %%xmm1,%%xmm2                   \n"
326       "punpcklwd %%xmm7,%%xmm2                   \n"
327       "pmulld    %%xmm5,%%xmm2                   \n"
      // Bytes 12..15 weighted by kHashMul3 -> xmm1.
328       "movdqa    %8,%%xmm5                       \n"
329       "punpckhwd %%xmm7,%%xmm1                   \n"
330       "pmulld    %%xmm5,%%xmm1                   \n"
      // Add the four weighted vectors, then reduce the four lanes to lane 0.
331       "paddd     %%xmm4,%%xmm3                   \n"
332       "paddd     %%xmm2,%%xmm1                   \n"
333       "paddd     %%xmm3,%%xmm1                   \n"
334       "pshufd    $0xe,%%xmm1,%%xmm2              \n"
335       "paddd     %%xmm2,%%xmm1                   \n"
336       "pshufd    $0x1,%%xmm1,%%xmm2              \n"
337       "paddd     %%xmm2,%%xmm1                   \n"
      // Fold the block's contribution into the running hash (lane 0).
338       "paddd     %%xmm1,%%xmm0                   \n"
      // Loop again while count - 16 is still greater than zero.
339       "sub       $0x10,%1                        \n"
340       "jg        1b                              \n"
341       "movd      %%xmm0,%3                       \n"
342       : "+r"(src),        // %0
343         "+r"(count),      // %1
344         "+rm"(seed),      // %2
345         "=g"(hash)        // %3
346       : "m"(kHash16x33),  // %4
347         "m"(kHashMul0),   // %5
348         "m"(kHashMul1),   // %6
349         "m"(kHashMul2),   // %7
350         "m"(kHashMul3)    // %8
351       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
352         "xmm7");
353   return hash;
354 }
355 #endif  // !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
356 
357 #ifdef __cplusplus
358 }  // extern "C"
359 }  // namespace libyuv
360 #endif
361