1 /*
2  * Copyright (C) 2008 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 /* ---- includes ----------------------------------------------------------- */
18 
19 #include "b_BasicEm/Basic.h" /* to disable some warnings in VC++ */
20 
21 #if ( defined( WIN64 ) || defined( HW_SSE2 ) )
22 
23 #include "emmintrin.h"
24 
25 /* disable warning "local variable 'x' used without having been initialized" */
26 #pragma warning( disable : 4700 )
27 
28 
29 /** Using half register (64-bit) in SSE2 to calculate dot product.
30  *  This is a SSE2 reimplementation of bbs_dotProduct_intelMMX16 in Math.c.
31  *  Dependencies: input vectors need to be 16-bit aligned
32  *  Return Value: int32 containing resultL of dot product
33  */
bbs_dotProduct_64SSE2(const int16 * vec1A,const int16 * vec2A,uint32 sizeA)34 int32 bbs_dotProduct_64SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA )
35 {
36 	__m128i m_XMM0, m_XMM1, m_XMM2, m_XMM3, m_XMM4, m_XMM5, m_XMM6, m_XMM7, m_XMM8;
37 	int16* vec1L = ( int16* )vec1A;
38 	int16* vec2L = ( int16* )vec2A;
39 
40 	int32 resultL = 0;
41 	uint32 alignOffSetL = 0;
42 
43 	/* initialize registers to 0 */
44 	m_XMM4 = _mm_xor_si128( m_XMM4, m_XMM4 );
45 	m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 );
46 	m_XMM7 = _mm_xor_si128( m_XMM7, m_XMM7 );
47 
48 	alignOffSetL = sizeA % 16;
49 	sizeA >>= 4;
50 
51 	if( sizeA )
52 	{
53 		while( sizeA > 0 )
54 		{
55 			m_XMM0 = _mm_loadl_epi64( (__m128i *)&0[vec1L] );
56 			m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM4 );
57 
58 			m_XMM1 = _mm_loadl_epi64( (__m128i *)&0[vec2L] );
59 			m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM6 );
60 
61 			m_XMM2 = _mm_loadl_epi64( (__m128i *)&4[vec1L] );
62 
63 			m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM1 );
64 
65 			m_XMM3 = _mm_loadl_epi64( (__m128i *)&4[vec2L] );
66 			m_XMM4 = _mm_loadl_epi64( (__m128i *)&8[vec1L] );
67 
68 			m_XMM2 = _mm_madd_epi16( m_XMM2, m_XMM3 );
69 
70 			m_XMM5 = _mm_loadl_epi64( (__m128i *)&8[vec2L] );
71 
72 			m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM0 );
73 
74 			m_XMM6 = _mm_loadl_epi64( (__m128i *)&12[vec1L] );
75 
76 			m_XMM4 = _mm_madd_epi16( m_XMM4, m_XMM5 );
77 
78 			m_XMM8 = _mm_loadl_epi64( (__m128i *)&12[vec2L] );
79 			m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM8 );
80 
81 			m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM2 );
82 
83 			vec1L += 16;
84 			vec2L += 16;
85 			sizeA--;
86 		}
87 
88 		/* sum up accumulators */
89 		m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM4 );
90 
91 		m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM6 );
92 
93 		m_XMM0 = _mm_loadl_epi64( (__m128i *)&m_XMM7 );
94 
95 		m_XMM0 = _mm_srli_epi64( m_XMM0, 32 );
96 
97 		m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM0 );
98 
99 		resultL = _mm_cvtsi128_si32( m_XMM7 );
100 	}
101 
102 	/* switch statements produces faster code than loop */
103 	switch( alignOffSetL )
104 	{
105 		case 15:
106 			resultL += ( int32 )*vec1L++ * *vec2L++;
107 		case 14:
108 			resultL += ( int32 )*vec1L++ * *vec2L++;
109 		case 13:
110 			resultL += ( int32 )*vec1L++ * *vec2L++;
111 		case 12:
112 			resultL += ( int32 )*vec1L++ * *vec2L++;
113 		case 11:
114 			resultL += ( int32 )*vec1L++ * *vec2L++;
115 		case 10:
116 			resultL += ( int32 )*vec1L++ * *vec2L++;
117 		case 9:
118 			resultL += ( int32 )*vec1L++ * *vec2L++;
119 		case 8:
120 			resultL += ( int32 )*vec1L++ * *vec2L++;
121 		case 7:
122 			resultL += ( int32 )*vec1L++ * *vec2L++;
123 		case 6:
124 			resultL += ( int32 )*vec1L++ * *vec2L++;
125 		case 5:
126 			resultL += ( int32 )*vec1L++ * *vec2L++;
127 		case 4:
128 			resultL += ( int32 )*vec1L++ * *vec2L++;
129 		case 3:
130 			resultL += ( int32 )*vec1L++ * *vec2L++;
131 		case 2:
132 			resultL += ( int32 )*vec1L++ * *vec2L++;
133 		case 1:
134 			resultL += ( int32 )*vec1L++ * *vec2L++;
135 	}
136 
137 	return resultL;
138 }
139 
140 /* ------------------------------------------------------------------------- */
141 
142 /** Using full register (128-bit) in SSE2 to calculate dot Product.
143  *  Dependencies: 16-bit aligned
144  *  Return Value: int32 containing dot Product
145  */
bbs_dotProduct_128SSE2(const int16 * vec1A,const int16 * vec2A,uint32 sizeA)146 int32 bbs_dotProduct_128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA )
147 {
148 	__m128i m_XMM0, m_XMM2, m_XMM3, m_XMM5, m_XMM6;
149 	int16* vec1L = ( int16* )vec1A;
150 	int16* vec2L = ( int16* )vec2A;
151 
152 	int32 resultL = 0;
153 	uint32 alignOffSetL = 0;
154 
155 	m_XMM5 = _mm_xor_si128( m_XMM5, m_XMM5 );
156 	m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 );
157 
158 	alignOffSetL = sizeA % 16;
159 	sizeA >>= 4;
160 
161 	if( sizeA )
162 	{
163 		while( sizeA > 0 )
164 		{
165 			m_XMM0 = _mm_load_si128( (__m128i *)&0[vec1L] );
166 			m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );
167 
168 			m_XMM2 = _mm_load_si128( (__m128i *)&0[vec2L] );
169 
170 			m_XMM6 = _mm_load_si128( (__m128i *)&8[vec1L] );
171 
172 			m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM2 );
173 
174 			m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM0 );
175 
176 			m_XMM3 = _mm_load_si128( (__m128i *)&8[vec2L] );
177 
178 			m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM3 );
179 
180 			vec1L += 16;
181 			vec2L += 16;
182 			sizeA--;
183 		}
184 
185 		/* sum up accumulators */
186 		m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );
187 
188 		m_XMM0 = _mm_load_si128( (__m128i *)&m_XMM5 );
189 
190 		resultL = _mm_cvtsi128_si32( m_XMM0 );	/* 1st 32bits */
191 
192 		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );
193 
194 		resultL += _mm_cvtsi128_si32( m_XMM0 );	/* 2nd 32bits */
195 
196 		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );
197 
198 		resultL += _mm_cvtsi128_si32( m_XMM0 );	/* 3rd 32bits */
199 
200 		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );
201 
202 		resultL += _mm_cvtsi128_si32( m_XMM0 );	/* 4th 32bits */
203 	}
204 
205 	switch( alignOffSetL )
206 	{
207 		case 15:
208 			resultL += ( int32 )*vec1L++ * *vec2L++;
209 		case 14:
210 			resultL += ( int32 )*vec1L++ * *vec2L++;
211 		case 13:
212 			resultL += ( int32 )*vec1L++ * *vec2L++;
213 		case 12:
214 			resultL += ( int32 )*vec1L++ * *vec2L++;
215 		case 11:
216 			resultL += ( int32 )*vec1L++ * *vec2L++;
217 		case 10:
218 			resultL += ( int32 )*vec1L++ * *vec2L++;
219 		case 9:
220 			resultL += ( int32 )*vec1L++ * *vec2L++;
221 		case 8:
222 			resultL += ( int32 )*vec1L++ * *vec2L++;
223 		case 7:
224 			resultL += ( int32 )*vec1L++ * *vec2L++;
225 		case 6:
226 			resultL += ( int32 )*vec1L++ * *vec2L++;
227 		case 5:
228 			resultL += ( int32 )*vec1L++ * *vec2L++;
229 		case 4:
230 			resultL += ( int32 )*vec1L++ * *vec2L++;
231 		case 3:
232 			resultL += ( int32 )*vec1L++ * *vec2L++;
233 		case 2:
234 			resultL += ( int32 )*vec1L++ * *vec2L++;
235 		case 1:
236 			resultL += ( int32 )*vec1L++ * *vec2L++;
237 	}
238 
239 	return resultL;
240 }
241 
242 /* ------------------------------------------------------------------------- */
243 
244 
245 /** Using full register (128-bit) in SSE2 to calculate dot product (non aligned version).
246  *  Dependencies: memory does not need to be 16-bit aligned
247  *  Return Value: int32 containing dot product
248  */
bbs_dotProduct_u128SSE2(const int16 * vec1A,const int16 * vec2A,uint32 sizeA)249 int32 bbs_dotProduct_u128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA )
250 {
251 	__m128i m_XMM0, m_XMM2, m_XMM3, m_XMM5, m_XMM6;
252 	int16* vec1L = ( int16* )vec1A;
253 	int16* vec2L = ( int16* )vec2A;
254 	int32 resultL = 0;
255 	uint32 alignOffSetL = 0;
256 
257 	/* initialize registers to 0 */
258 	m_XMM5 = _mm_xor_si128( m_XMM5, m_XMM5 );
259 	m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 );
260 
261 
262 	alignOffSetL = sizeA % 16;
263 	sizeA >>= 4;
264 
265 	if( sizeA )
266 	{
267 		while( sizeA > 0 )
268 		{
269 			m_XMM0 = _mm_loadu_si128( (__m128i *)&0[vec1L] );
270 			m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );
271 
272 			m_XMM2 = _mm_loadu_si128( (__m128i *)&0[vec2L] );
273 
274 			m_XMM6 = _mm_loadu_si128( (__m128i *)&8[vec1L] );
275 
276 			m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM2 );
277 
278 			m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM0 );
279 
280 			m_XMM3 = _mm_loadu_si128( (__m128i *)&8[vec2L] );
281 
282 			m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM3 );
283 
284 			vec1L += 16;
285 			vec2L += 16;
286 			sizeA--;
287 		}
288 
289 		/* sum up accumulators */
290 		m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );
291 
292 		m_XMM0 = _mm_loadu_si128( (__m128i *)&m_XMM5 );
293 
294 		resultL = _mm_cvtsi128_si32( m_XMM0 );	/* 1st 32bits */
295 
296 		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );
297 
298 		resultL += _mm_cvtsi128_si32( m_XMM0 );	/* 2nd 32bits */
299 
300 		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );
301 
302 		resultL += _mm_cvtsi128_si32( m_XMM0 );	/* 3rd 32bits */
303 
304 		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );
305 
306 		resultL += _mm_cvtsi128_si32( m_XMM0 );	/* 4th 32bits */
307 	}
308 
309 
310 	switch( alignOffSetL )
311 	{
312 		case 15:
313 			resultL += ( int32 )*vec1L++ * *vec2L++;
314 		case 14:
315 			resultL += ( int32 )*vec1L++ * *vec2L++;
316 		case 13:
317 			resultL += ( int32 )*vec1L++ * *vec2L++;
318 		case 12:
319 			resultL += ( int32 )*vec1L++ * *vec2L++;
320 		case 11:
321 			resultL += ( int32 )*vec1L++ * *vec2L++;
322 		case 10:
323 			resultL += ( int32 )*vec1L++ * *vec2L++;
324 		case 9:
325 			resultL += ( int32 )*vec1L++ * *vec2L++;
326 		case 8:
327 			resultL += ( int32 )*vec1L++ * *vec2L++;
328 		case 7:
329 			resultL += ( int32 )*vec1L++ * *vec2L++;
330 		case 6:
331 			resultL += ( int32 )*vec1L++ * *vec2L++;
332 		case 5:
333 			resultL += ( int32 )*vec1L++ * *vec2L++;
334 		case 4:
335 			resultL += ( int32 )*vec1L++ * *vec2L++;
336 		case 3:
337 			resultL += ( int32 )*vec1L++ * *vec2L++;
338 		case 2:
339 			resultL += ( int32 )*vec1L++ * *vec2L++;
340 		case 1:
341 			resultL += ( int32 )*vec1L++ * *vec2L++;
342 	}
343 
344 	return resultL;
345 }
346 
347 /* ------------------------------------------------------------------------- */
348 
349 #endif /* HW_SSE2 */
350