1 /*
2  *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "libyuv/basic_types.h"
12 
13 #include "libyuv/compare_row.h"
14 #include "libyuv/row.h"
15 
16 #ifdef __cplusplus
17 namespace libyuv {
18 extern "C" {
19 #endif
20 
21 // This module is for 32 bit Visual C x86 and clangcl
22 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
23 
24 __declspec(naked) uint32
SumSquareError_SSE2(const uint8 * src_a,const uint8 * src_b,int count)25     SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
26   __asm {
27     mov        eax, [esp + 4]  // src_a
28     mov        edx, [esp + 8]  // src_b
29     mov        ecx, [esp + 12]  // count
30     pxor       xmm0, xmm0
31     pxor       xmm5, xmm5
32 
33   wloop:
34     movdqu     xmm1, [eax]
35     lea        eax,  [eax + 16]
36     movdqu     xmm2, [edx]
37     lea        edx,  [edx + 16]
38     movdqa     xmm3, xmm1  // abs trick
39     psubusb    xmm1, xmm2
40     psubusb    xmm2, xmm3
41     por        xmm1, xmm2
42     movdqa     xmm2, xmm1
43     punpcklbw  xmm1, xmm5
44     punpckhbw  xmm2, xmm5
45     pmaddwd    xmm1, xmm1
46     pmaddwd    xmm2, xmm2
47     paddd      xmm0, xmm1
48     paddd      xmm0, xmm2
49     sub        ecx, 16
50     jg         wloop
51 
52     pshufd     xmm1, xmm0, 0xee
53     paddd      xmm0, xmm1
54     pshufd     xmm1, xmm0, 0x01
55     paddd      xmm0, xmm1
56     movd       eax, xmm0
57     ret
58   }
59 }
60 
61 // Visual C 2012 required for AVX2.
62 #if _MSC_VER >= 1700
63 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
64 #pragma warning(disable : 4752)
65 __declspec(naked) uint32
SumSquareError_AVX2(const uint8 * src_a,const uint8 * src_b,int count)66     SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
67   __asm {
68     mov        eax, [esp + 4]  // src_a
69     mov        edx, [esp + 8]  // src_b
70     mov        ecx, [esp + 12]  // count
71     vpxor      ymm0, ymm0, ymm0  // sum
72     vpxor      ymm5, ymm5, ymm5  // constant 0 for unpck
73     sub        edx, eax
74 
75   wloop:
76     vmovdqu    ymm1, [eax]
77     vmovdqu    ymm2, [eax + edx]
78     lea        eax,  [eax + 32]
79     vpsubusb   ymm3, ymm1, ymm2  // abs difference trick
80     vpsubusb   ymm2, ymm2, ymm1
81     vpor       ymm1, ymm2, ymm3
82     vpunpcklbw ymm2, ymm1, ymm5  // u16.  mutates order.
83     vpunpckhbw ymm1, ymm1, ymm5
84     vpmaddwd   ymm2, ymm2, ymm2  // square + hadd to u32.
85     vpmaddwd   ymm1, ymm1, ymm1
86     vpaddd     ymm0, ymm0, ymm1
87     vpaddd     ymm0, ymm0, ymm2
88     sub        ecx, 32
89     jg         wloop
90 
91     vpshufd    ymm1, ymm0, 0xee  // 3, 2 + 1, 0 both lanes.
92     vpaddd     ymm0, ymm0, ymm1
93     vpshufd    ymm1, ymm0, 0x01  // 1 + 0 both lanes.
94     vpaddd     ymm0, ymm0, ymm1
95     vpermq     ymm1, ymm0, 0x02  // high + low lane.
96     vpaddd     ymm0, ymm0, ymm1
97     vmovd      eax, xmm0
98     vzeroupper
99     ret
100   }
101 }
102 #endif  // _MSC_VER >= 1700
103 
// Precomputed powers of 33 for the djb2 hash (hash = hash * 33 + byte).
// kHash16x33 advances the running hash past 16 consumed bytes; kHashMul0..3
// weight src[0..15] by 33^15 down to 33^0 so 16 bytes fold in per iteration.
uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0};  // 33 ^ 16
uvec32 kHashMul0 = {
    0x0c3525e1,  // 33 ^ 15
    0xa3476dc1,  // 33 ^ 14
    0x3b4039a1,  // 33 ^ 13
    0x4f5f0981,  // 33 ^ 12
};
uvec32 kHashMul1 = {
    0x30f35d61,  // 33 ^ 11
    0x855cb541,  // 33 ^ 10
    0x040a9121,  // 33 ^ 9
    0x747c7101,  // 33 ^ 8
};
uvec32 kHashMul2 = {
    0xec41d4e1,  // 33 ^ 7
    0x4cfa3cc1,  // 33 ^ 6
    0x025528a1,  // 33 ^ 5
    0x00121881,  // 33 ^ 4
};
uvec32 kHashMul3 = {
    0x00008c61,  // 33 ^ 3
    0x00000441,  // 33 ^ 2
    0x00000021,  // 33 ^ 1
    0x00000001,  // 33 ^ 0
};
129 
130 __declspec(naked) uint32
HashDjb2_SSE41(const uint8 * src,int count,uint32 seed)131     HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
132   __asm {
133     mov        eax, [esp + 4]  // src
134     mov        ecx, [esp + 8]  // count
135     movd       xmm0, [esp + 12]  // seed
136 
137     pxor       xmm7, xmm7  // constant 0 for unpck
138     movdqa     xmm6, xmmword ptr kHash16x33
139 
140   wloop:
141     movdqu     xmm1, [eax]  // src[0-15]
142     lea        eax, [eax + 16]
143     pmulld     xmm0, xmm6  // hash *= 33 ^ 16
144     movdqa     xmm5, xmmword ptr kHashMul0
145     movdqa     xmm2, xmm1
146     punpcklbw  xmm2, xmm7  // src[0-7]
147     movdqa     xmm3, xmm2
148     punpcklwd  xmm3, xmm7  // src[0-3]
149     pmulld     xmm3, xmm5
150     movdqa     xmm5, xmmword ptr kHashMul1
151     movdqa     xmm4, xmm2
152     punpckhwd  xmm4, xmm7  // src[4-7]
153     pmulld     xmm4, xmm5
154     movdqa     xmm5, xmmword ptr kHashMul2
155     punpckhbw  xmm1, xmm7  // src[8-15]
156     movdqa     xmm2, xmm1
157     punpcklwd  xmm2, xmm7  // src[8-11]
158     pmulld     xmm2, xmm5
159     movdqa     xmm5, xmmword ptr kHashMul3
160     punpckhwd  xmm1, xmm7  // src[12-15]
161     pmulld     xmm1, xmm5
162     paddd      xmm3, xmm4  // add 16 results
163     paddd      xmm1, xmm2
164     paddd      xmm1, xmm3
165 
166     pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
167     paddd      xmm1, xmm2
168     pshufd     xmm2, xmm1, 0x01
169     paddd      xmm1, xmm2
170     paddd      xmm0, xmm1
171     sub        ecx, 16
172     jg         wloop
173 
174     movd       eax, xmm0  // return hash
175     ret
176   }
177 }
178 
179 // Visual C 2012 required for AVX2.
180 #if _MSC_VER >= 1700
181 __declspec(naked) uint32
HashDjb2_AVX2(const uint8 * src,int count,uint32 seed)182     HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
183   __asm {
184     mov        eax, [esp + 4]  // src
185     mov        ecx, [esp + 8]  // count
186     vmovd      xmm0, [esp + 12]  // seed
187 
188   wloop:
189     vpmovzxbd  xmm3, [eax]  // src[0-3]
190     vpmulld    xmm0, xmm0, xmmword ptr kHash16x33  // hash *= 33 ^ 16
191     vpmovzxbd  xmm4, [eax + 4]  // src[4-7]
192     vpmulld    xmm3, xmm3, xmmword ptr kHashMul0
193     vpmovzxbd  xmm2, [eax + 8]  // src[8-11]
194     vpmulld    xmm4, xmm4, xmmword ptr kHashMul1
195     vpmovzxbd  xmm1, [eax + 12]  // src[12-15]
196     vpmulld    xmm2, xmm2, xmmword ptr kHashMul2
197     lea        eax, [eax + 16]
198     vpmulld    xmm1, xmm1, xmmword ptr kHashMul3
199     vpaddd     xmm3, xmm3, xmm4  // add 16 results
200     vpaddd     xmm1, xmm1, xmm2
201     vpaddd     xmm1, xmm1, xmm3
202     vpshufd    xmm2, xmm1, 0x0e  // upper 2 dwords
203     vpaddd     xmm1, xmm1,xmm2
204     vpshufd    xmm2, xmm1, 0x01
205     vpaddd     xmm1, xmm1, xmm2
206     vpaddd     xmm0, xmm0, xmm1
207     sub        ecx, 16
208     jg         wloop
209 
210     vmovd      eax, xmm0  // return hash
211     vzeroupper
212     ret
213   }
214 }
215 #endif  // _MSC_VER >= 1700
216 
217 #endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
218 
219 #ifdef __cplusplus
220 }  // extern "C"
221 }  // namespace libyuv
222 #endif
223