1 /*
2  * Copyright (C) 2008 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 /* ---- includes ----------------------------------------------------------- */
18 
19 #include "b_BasicEm/Math.h"
20 #include "b_BasicEm/Functions.h"
21 
22 /* ---- related objects  --------------------------------------------------- */
23 
24 /* ---- typedefs ----------------------------------------------------------- */
25 
26 /* ---- constants ---------------------------------------------------------- */
27 
28 /* ------------------------------------------------------------------------- */
29 
30 /* ========================================================================= */
31 /*                                                                           */
32 /* ---- \ghd{ external functions } ----------------------------------------- */
33 /*                                                                           */
34 /* ========================================================================= */
35 
36 #if defined( HW_SSE2 )
37 	extern int32 bbs_dotProduct_128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA );
38 	extern int32 bbs_dotProduct_u128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA );
39 #endif
40 
41 #if defined( HW_FR71 )
42 	int32 bbs_dotProduct_fr71( const int16* vec1A, const int16* vec2A, uint32 sizeA );
43 #endif
44 
45 /* ------------------------------------------------------------------------- */
46 
bbs_sqrt32(uint32 valA)47 uint16 bbs_sqrt32( uint32 valA )
48 {
49 	uint32 rootL = 0;
50 	uint32 expL = 0;
51 	expL += ( ( valA >> ( expL + 0x10 ) ) != 0 ) << 4;
52 	expL += ( ( valA >> ( expL + 0x08 ) ) != 0 ) << 3;
53 	expL += ( ( valA >> ( expL + 0x04 ) ) != 0 ) << 2;
54 	expL += ( ( valA >> ( expL + 0x02 ) ) != 0 ) << 1;
55 	switch( expL >> 1 )
56 	{
57 		case 15: rootL += ( ( rootL + 0x8000 ) * ( rootL + 0x8000 ) <= valA ) << 15;
58 		case 14: rootL += ( ( rootL + 0x4000 ) * ( rootL + 0x4000 ) <= valA ) << 14;
59 		case 13: rootL += ( ( rootL + 0x2000 ) * ( rootL + 0x2000 ) <= valA ) << 13;
60 		case 12: rootL += ( ( rootL + 0x1000 ) * ( rootL + 0x1000 ) <= valA ) << 12;
61 		case 11: rootL += ( ( rootL + 0x0800 ) * ( rootL + 0x0800 ) <= valA ) << 11;
62 		case 10: rootL += ( ( rootL + 0x0400 ) * ( rootL + 0x0400 ) <= valA ) << 10;
63 		case 9:  rootL += ( ( rootL + 0x0200 ) * ( rootL + 0x0200 ) <= valA ) << 9;
64 		case 8:  rootL += ( ( rootL + 0x0100 ) * ( rootL + 0x0100 ) <= valA ) << 8;
65 		case 7:  rootL += ( ( rootL + 0x0080 ) * ( rootL + 0x0080 ) <= valA ) << 7;
66 		case 6:  rootL += ( ( rootL + 0x0040 ) * ( rootL + 0x0040 ) <= valA ) << 6;
67 		case 5:  rootL += ( ( rootL + 0x0020 ) * ( rootL + 0x0020 ) <= valA ) << 5;
68 		case 4:  rootL += ( ( rootL + 0x0010 ) * ( rootL + 0x0010 ) <= valA ) << 4;
69 		case 3:  rootL += ( ( rootL + 0x0008 ) * ( rootL + 0x0008 ) <= valA ) << 3;
70 		case 2:  rootL += ( ( rootL + 0x0004 ) * ( rootL + 0x0004 ) <= valA ) << 2;
71 		case 1:  rootL += ( ( rootL + 0x0002 ) * ( rootL + 0x0002 ) <= valA ) << 1;
72 		case 0:  rootL += ( ( rootL + 0x0001 ) * ( rootL + 0x0001 ) <= valA ) << 0;
73 	}
74 
75 	return ( uint16 )rootL;
76 }
77 
78 /* ------------------------------------------------------------------------- */
79 
bbs_sqrt16(uint16 valA)80 uint8 bbs_sqrt16( uint16 valA )
81 {
82 	uint16 rootL = 0;
83 	rootL += ( ( rootL + 0x80 ) * ( rootL + 0x80 ) <= valA ) << 7;
84 	rootL += ( ( rootL + 0x40 ) * ( rootL + 0x40 ) <= valA ) << 6;
85 	rootL += ( ( rootL + 0x20 ) * ( rootL + 0x20 ) <= valA ) << 5;
86 	rootL += ( ( rootL + 0x10 ) * ( rootL + 0x10 ) <= valA ) << 4;
87 	rootL += ( ( rootL + 0x08 ) * ( rootL + 0x08 ) <= valA ) << 3;
88 	rootL += ( ( rootL + 0x04 ) * ( rootL + 0x04 ) <= valA ) << 2;
89 	rootL += ( ( rootL + 0x02 ) * ( rootL + 0x02 ) <= valA ) << 1;
90 	rootL += ( ( rootL + 0x01 ) * ( rootL + 0x01 ) <= valA ) << 0;
91 	return ( uint8 )rootL;
92 }
93 
94 /* ------------------------------------------------------------------------- */
95 
96 /* table of sqrt and slope values */
97 const uint32 bbs_fastSqrt32_tableG[] =
98 {
99 	268435456, 1016, 272596992, 1000, 276692992, 987, 280735744, 972,
100 	284717056, 959, 288645120, 946, 292519936, 933, 296341504, 922,
101 	300118016, 910, 303845376, 899, 307527680, 889, 311169024, 878,
102 	314765312, 869, 318324736, 858, 321839104, 850, 325320704, 840,
103 	328761344, 832, 332169216, 824, 335544320, 815, 338882560, 807,
104 	342188032, 799, 345460736, 792, 348704768, 785, 351920128, 777,
105 	355102720, 771, 358260736, 764, 361390080, 757, 364490752, 751,
106 	367566848, 745, 370618368, 739, 373645312, 732, 376643584, 727,
107 	379621376, 722, 382578688, 715, 385507328, 711, 388419584, 705,
108 	391307264, 700, 394174464, 695, 397021184, 689, 399843328, 686,
109 	402653184, 680, 405438464, 675, 408203264, 672, 410955776, 666,
110 	413683712, 663, 416399360, 658, 419094528, 653, 421769216, 650,
111 	424431616, 646, 427077632, 641, 429703168, 638, 432316416, 634,
112 	434913280, 630, 437493760, 627, 440061952, 622, 442609664, 620,
113 	445149184, 615, 447668224, 613, 450179072, 609, 452673536, 605,
114 	455151616, 602, 457617408, 600, 460075008, 595, 462512128, 593,
115 	464941056, 590, 467357696, 587, 469762048, 583, 472150016, 581,
116 	474529792, 578, 476897280, 575, 479252480, 572, 481595392, 569,
117 	483926016, 567, 486248448, 564, 488558592, 561, 490856448, 559,
118 	493146112, 556, 495423488, 553, 497688576, 552, 499949568, 548,
119 	502194176, 546, 504430592, 544, 506658816, 541, 508874752, 539,
120 	511082496, 537, 513282048, 534, 515469312, 533, 517652480, 529,
121 	519819264, 528, 521981952, 526, 524136448, 523, 526278656, 521,
122 	528412672, 519, 530538496, 517, 532656128, 515, 534765568, 514
123 };
124 
bbs_fastSqrt32(uint32 valA)125 uint16 bbs_fastSqrt32( uint32 valA )
126 {
127 	uint32 expL = 0;
128 	uint32 valL;
129 	uint32 offsL;
130 	uint32 indexL = 0;
131 
132 	if( valA == 0 ) return 0;
133 
134 	/* compute closest even size exponent of valA */
135 	expL += ( ( valA >> ( expL + 0x10 ) ) != 0 ) << 4;
136 	expL += ( ( valA >> ( expL + 0x08 ) ) != 0 ) << 3;
137 	expL += ( ( valA >> ( expL + 0x04 ) ) != 0 ) << 2;
138 	expL += ( ( valA >> ( expL + 0x02 ) ) != 0 ) << 1;
139 
140 	valL = ( ( valA << ( 30 - expL ) ) - 1073741824 ); /* ( 1 << 30 ) */
141 	offsL = ( ( valL & 0x01FFFFFF ) + ( 1 << 12 ) ) >> 13;
142 	indexL = ( valL >> 24 ) & 0xFE;
143 
144 	return ( bbs_fastSqrt32_tableG[ indexL ] + offsL * bbs_fastSqrt32_tableG[ indexL + 1 ] ) >> ( 28 - ( expL >> 1 ) );
145 }
146 
147 /* ------------------------------------------------------------------------- */
148 
149 /* table of 1/sqrt (1.31) and negative slope (.15) values
150    referenced in b_GaborCueEm/focusDispAsm.s55, do not rename or remove ! */
151 const uint32 bbs_invSqrt32_tableG[] =
152 {
153 	2147483648u, 1001, 2114682880, 956, 2083356672, 915, 2053373952, 877,
154 	2024636416, 840, 1997111296, 808, 1970634752, 776, 1945206784, 746,
155 	1920761856, 720, 1897168896, 693, 1874460672, 669, 1852538880, 646,
156 	1831370752, 625, 1810890752, 604, 1791098880, 584, 1771962368, 567,
157 	1753382912, 548, 1735426048, 533, 1717960704, 516, 1701052416, 502,
158 	1684602880, 487, 1668644864, 474, 1653112832, 461, 1638006784, 448,
159 	1623326720, 436, 1609039872, 426, 1595080704, 414, 1581514752, 404,
160 	1568276480, 394, 1555365888, 384, 1542782976, 375, 1530494976, 367,
161 	1518469120, 357, 1506770944, 350, 1495302144, 342, 1484095488, 334,
162 	1473150976, 327, 1462435840, 320, 1451950080, 313, 1441693696, 307,
163 	1431633920, 300, 1421803520, 294, 1412169728, 289, 1402699776, 282,
164 	1393459200, 277, 1384382464, 272, 1375469568, 266, 1366753280, 262,
165 	1358168064, 257, 1349746688, 251, 1341521920, 248, 1333395456, 243,
166 	1325432832, 238, 1317634048, 235, 1309933568, 230, 1302396928, 227,
167 	1294958592, 222, 1287684096, 219, 1280507904, 216, 1273430016, 211,
168 	1266515968, 209, 1259667456, 205, 1252950016, 202, 1246330880, 198,
169 	1239842816, 196, 1233420288, 192, 1227128832, 190, 1220902912, 187,
170 	1214775296, 184, 1208745984, 181, 1202814976, 179, 1196949504, 176,
171 	1191182336, 173, 1185513472, 171, 1179910144, 169, 1174372352, 166,
172 	1168932864, 164, 1163558912, 162, 1158250496, 160, 1153007616, 157,
173 	1147863040, 155, 1142784000, 154, 1137737728, 151, 1132789760, 149,
174 	1127907328, 148, 1123057664, 145, 1118306304, 144, 1113587712, 142,
175 	1108934656, 140, 1104347136, 138, 1099825152, 137, 1095335936, 135,
176 	1090912256, 134, 1086521344, 131, 1082228736, 131, 1077936128, 128
177 };
178 
bbs_invSqrt32(uint32 valA)179 uint32 bbs_invSqrt32( uint32 valA )
180 {
181 
182 	uint32 expL = 0;
183 	uint32 valL;
184 	uint32 offsL;
185 	uint32 indexL = 0;
186 
187 	if( valA == 0U ) return 0x80000000;
188 
189 	/* compute closest even size exponent of valA */
190 	expL += ( ( valA >> ( expL + 0x10 ) ) != 0 ) << 4;
191 	expL += ( ( valA >> ( expL + 0x08 ) ) != 0 ) << 3;
192 	expL += ( ( valA >> ( expL + 0x04 ) ) != 0 ) << 2;
193 	expL += ( ( valA >> ( expL + 0x02 ) ) != 0 ) << 1;
194 
195 	valL = ( ( valA << ( 30 - expL ) ) - 1073741824 ); /* ( 1 << 30 ) */
196 	offsL = ( ( valL & 0x01FFFFFF ) + ( 1 << 9 ) ) >> 10;
197 	indexL = ( valL >> 24 ) & 0xFE;
198 
199 	return ( bbs_invSqrt32_tableG[ indexL ] - offsL * bbs_invSqrt32_tableG[ indexL + 1 ] ) >> ( expL >> 1 );
200 }
201 
202 /* ------------------------------------------------------------------------- */
203 
204 /* table of 1/( x + 1 ) (2.30) and negative slope (.14) values
205    referenced in b_GaborCueEm/focusDispAsm.s55, do not rename or remove ! */
206 const int32 bbs_inv32_tableG[] =
207 {
208 	1073741824, 1986, 1041203200, 1870, 1010565120, 1762, 981696512, 1664,
209 	954433536,  1575, 928628736,  1491, 904200192,  1415, 881016832, 1345,
210 	858980352,  1278, 838041600,  1218, 818085888,  1162, 799047680, 1108,
211 	780894208,  1059, 763543552,  1013, 746946560,  970,  731054080, 930,
212 	715816960,  891,  701218816,  856,  687194112,  823,  673710080, 791,
213 	660750336,  761,  648282112,  732,  636289024,  706,  624721920, 681,
214 	613564416,  657,  602800128,  635,  592396288,  613,  582352896, 592,
215 	572653568,  573,  563265536,  554,  554188800,  537,  545390592, 520,
216 };
217 
bbs_inv32(int32 valA)218 int32 bbs_inv32( int32 valA )
219 {
220 
221 	uint32 expL = 0;
222 	int32 signL = ( ( valA >> 30 ) & 0xFFFFFFFE ) + 1;
223 	int32 valL = signL * valA;
224 	int32 offsL;
225 	uint32 indexL = 0;
226 
227 	if( valL <= ( int32 ) 1 ) return 0x40000000 * signL;
228 
229 	/* compute size exponent of valL */
230 	expL += ( ( valL >> ( expL + 0x10 ) ) != 0 ) << 4;
231 	expL += ( ( valL >> ( expL + 0x08 ) ) != 0 ) << 3;
232 	expL += ( ( valL >> ( expL + 0x04 ) ) != 0 ) << 2;
233 	expL += ( ( valL >> ( expL + 0x02 ) ) != 0 ) << 1;
234 	expL += ( ( valL >> ( expL + 0x01 ) ) != 0 );
235 
236 	valL = ( ( valL << ( 30 - expL ) ) - 1073741824 ); /*( 1U << 30 )*/
237 	offsL = ( ( valL & 0x01FFFFFF ) + ( 1 << 10 ) ) >> 11;
238 	indexL = ( valL >> 24 ) & 0xFE;
239 
240 	return signL * ( ( ( ( bbs_inv32_tableG[ indexL ] - offsL * bbs_inv32_tableG[ indexL + 1 ] ) >> ( expL - 1 ) ) + 1 ) >> 1 );
241 }
242 
243 /* ------------------------------------------------------------------------- */
244 
bbs_intLog2(uint32 valA)245 uint32 bbs_intLog2( uint32 valA )
246 {
247 	uint32 expL = 0;
248 	expL += 0x10 * ( ( valA >> ( expL + 0x10 ) ) != 0 );
249 	expL += 0x08 * ( ( valA >> ( expL + 0x08 ) ) != 0 );
250 	expL += 0x04 * ( ( valA >> ( expL + 0x04 ) ) != 0 );
251 	expL += 0x02 * ( ( valA >> ( expL + 0x02 ) ) != 0 );
252 	expL += 0x01 * ( ( valA >> ( expL + 0x01 ) ) != 0 );
253 	return expL;
254 }
255 
256 /* ------------------------------------------------------------------------- */
257 
258 const uint32 bbs_pow2M1_tableG[] =
259 {
260 	0,			713,	46769127,	721,	94047537,	729,	141840775,	737,
261 	190154447,	745,	238994221,	753,	288365825,	761,	338275050,	769,
262 	388727751,	778,	439729846,	786,	491287318,	795,	543406214,	803,
263 	596092647,	812,	649352798,	821,	703192913,	830,	757619310,	839,
264 	812638371,	848,	868256550,	857,	924480371,	867,	981316430,	876,
265 	1038771393, 886,	1096851999, 895,	1155565062, 905,	1214917468, 915,
266 	1274916179, 925,	1335568233, 935,	1396880745, 945,	1458860907, 956,
267 	1521515988, 966,	1584853338, 976,	1648880387, 987,	1713604645, 998,
268 	1779033703, 1009,	1845175238, 1020,	1912037006, 1031,	1979626852, 1042,
269 	2047952702, 1053,	2117022572, 1065,	2186844564u, 1077,	2257426868u, 1088,
270 	2328777762u, 1100,	2400905617u, 1112,	2473818892u, 1124,	2547526141u, 1136,
271 	2622036010u, 1149,	2697357237u, 1161,	2773498659u, 1174,	2850469207u, 1187,
272 	2928277909u, 1200,	3006933892u, 1213,	3086446383u, 1226,	3166824708u, 1239,
273 	3248078296u, 1253,	3330216677u, 1266,	3413249486u, 1280,	3497186464u, 1294,
274 	3582037455u, 1308,	3667812413u, 1323,	3754521399u, 1337,	3842174584u, 1352,
275 	3930782250u, 1366,	4020354790u, 1381,	4110902710u, 1396,	4202436633u, 1411
276 };
277 
bbs_pow2M1(uint32 valA)278 uint32 bbs_pow2M1( uint32 valA )
279 {
280 	uint32 offsL = ( valA & 0x03FFFFFF ) >> 10;
281 	uint16 indexL = ( ( valA & 0xFC000000 ) >> 26 ) << 1;
282 	return bbs_pow2M1_tableG[ indexL ] + offsL * bbs_pow2M1_tableG[ indexL + 1 ];
283 }
284 
285 /* ------------------------------------------------------------------------- */
286 
bbs_pow2(int32 valA)287 uint32 bbs_pow2( int32 valA )
288 {
289 	int32 shiftL = 16 - ( valA >> 27 );
290 	uint32 offsL  = ( uint32 )( valA << 5 );
291 	if( shiftL == 32 ) return 1;
292 	return ( 1 << ( 32 - shiftL ) ) + ( bbs_pow2M1( offsL ) >> shiftL );
293 }
294 
295 /* ------------------------------------------------------------------------- */
296 
bbs_exp(int32 valA)297 uint32 bbs_exp( int32 valA )
298 {
299 	int32 adjustedL;
300 	int32 shiftL;
301 	int32 offsL;
302 
303 	/* check boundaries to avoid overflow */
304 	if( valA < -1488522236 )
305 	{
306 		return 0;
307 	}
308 	else if( valA > 1488522236 )
309 	{
310 		return 0xFFFFFFFF;
311 	}
312 
313 	/* multily valA with 1/ln(2) in order to use function 2^x instead of e^x */
314 	adjustedL = ( valA >> 16 ) * 94548 + ( ( ( ( ( uint32 )valA ) & 0x0FFFF ) * 47274 ) >> 15 );
315 
316 	shiftL = 16 - ( adjustedL >> 27 );
317 	if( shiftL == 32 ) return 1;
318 	offsL  = ( uint32 )( adjustedL << 5 );
319 	return ( ( int32 ) 1 << ( 32 - shiftL ) ) + ( bbs_pow2M1( offsL ) >> shiftL );
320 }
321 
322 /* ------------------------------------------------------------------------- */
323 
bbs_satS16(int32 valA)324 int16 bbs_satS16( int32 valA )
325 {
326 	if( valA > 32767 ) return 32767;
327 	if( valA < -32768 ) return -32768;
328 	return valA;
329 }
330 
331 /* ------------------------------------------------------------------------- */
332 
333 #if defined( HW_i586 ) || defined( HW_i686 )
334 
335 /* Windows section */
336 #if defined( WIN32 ) && !defined( WIN64 )
337 
338 /* disable warning "no return value"*/
339 #pragma warning( disable : 4035 )
340 
341 /**
342  * computes a fast dot product using intel MMX, sizeA must be multiple of 16 and >0
343  */
bbs_dotProduct_intelMMX16(const int16 * vec1A,const int16 * vec2A,uint32 sizeA)344 int32 bbs_dotProduct_intelMMX16( const int16* vec1A, const int16* vec2A, uint32 sizeA )
345 {
346 	__asm
347 	{
348 			push    esi
349 			push    edi
350 
351 			mov     eax, vec1A
352 			mov     ebx, vec2A
353 
354 			mov     ecx, sizeA
355 
356 			pxor    mm4, mm4
357 			pxor    mm6, mm6
358 
359 			pxor    mm7, mm7
360 			shr		ecx, 4
361 
362 		inner_loop_start:
363 			movq    mm0, 0[eax]
364 			paddd   mm7, mm4
365 
366 			movq    mm1, 0[ebx]
367 			paddd   mm7, mm6
368 
369 			movq    mm2, 8[eax]
370 			pmaddwd mm0, mm1
371 
372 			movq    mm3, 8[ebx]
373 
374 			movq    mm4, 16[eax]
375 			pmaddwd mm2, mm3
376 
377 			movq    mm5, 16[ebx]
378 			paddd   mm7, mm0
379 
380 			movq    mm6, 24[eax]
381 			pmaddwd mm4, mm5
382 
383 			pmaddwd mm6, 24[ebx]
384 			paddd   mm7, mm2
385 
386 			add     eax, 32
387 			add     ebx, 32
388 
389 			dec     ecx
390 			jnz     inner_loop_start
391 
392 			paddd   mm7, mm4
393 
394 			paddd   mm7, mm6
395 
396 			movq    mm0, mm7
397 
398 			psrlq   mm0, 32
399 
400 			paddd   mm7, mm0
401 
402 			movd    eax, mm7
403 
404 			emms
405 			pop     edi
406 			pop     esi
407 	}
408 }
409 
410 #pragma warning( default : 4035 )
411 
412 /* gcc compiler section */
413 #elif defined( epl_LINUX ) || defined( CYGWIN )
414 
415 /**
416  * computes a fast dot product using intel MMX, sizeA must be multiple of 16 and >0
417  */
bbs_dotProduct_intelMMX16(const int16 * vec1A,const int16 * vec2A,uint32 sizeA)418 int32 bbs_dotProduct_intelMMX16( const int16* vec1A, const int16* vec2A, uint32 sizeA )
419 {
420 	int32 resultL;
421 
422 	__asm__ __volatile__(
423 
424 			"movl %1,%%eax\n\t"
425 			"movl %2,%%ebx\n\t"
426 
427 			"movl %3,%%ecx\n\t"
428 
429 			"pxor %%mm4,%%mm4\n\t"
430 			"pxor %%mm6,%%mm6\n\t"
431 
432 			"pxor %%mm7, %%mm7\n\t"
433 			"shrl $4, %%ecx\n\t"
434 
435 			"\n1:\t"
436 			"movq 0(%%eax),%%mm0\n\t"
437 			"paddd %%mm4,%%mm7\n\t"
438 
439 			"movq 0( %%ebx ),%%mm1\n\t"
440 			"paddd %%mm6,%%mm7\n\t"
441 
442 			"movq 8( %%eax ),%%mm2\n\t"
443 			"pmaddwd %%mm1,%%mm0\n\t"
444 
445 			"movq 8( %%ebx ),%%mm3\n\t"
446 
447 			"movq 16( %%eax ),%%mm4\n\t"
448 			"pmaddwd %%mm3,%%mm2\n\t"
449 
450 			"movq 16( %%ebx ),%%mm5\n\t"
451 			"paddd %%mm0,%%mm7\n\t"
452 
453 			"movq 24( %%eax ),%%mm6\n\t"
454 			"pmaddwd %%mm5,%%mm4\n\t"
455 
456 			"pmaddwd 24( %%ebx ),%%mm6\n\t"
457 			"paddd %%mm2,%%mm7\n\t"
458 
459 			"addl $32,%%eax\n\t"
460 			"addl $32,%%ebx\n\t"
461 
462 			"decl %%ecx\n\t"
463 			"jnz 1b\n\t"
464 
465 			"paddd %%mm4,%%mm7\n\t"
466 			"paddd %%mm6,%%mm7\n\t"
467 
468 			"movq  %%mm7,%%mm0\n\t"
469 
470 			"psrlq $32,%%mm0\n\t"
471 
472 			"paddd %%mm0,%%mm7\n\t"
473 
474 			"movd %%mm7,%0\n\t"
475 
476 			"emms\n\t"
477 
478 		: "=&g" ( resultL )
479 		: "g" ( vec1A ), "g" ( vec2A ), "g" ( sizeA )
480 		: "si", "di", "ax", "bx", "cx", "st", "memory" );
481 
482 	return resultL;
483 }
484 
485 #endif /* epl_LINUX, CYGWIN */
486 
487 #endif /* HW_i586 || HW_i686 */
488 
489 /* ------------------------------------------------------------------------- */
490 
491 #ifdef HW_TMS320C6x
492 /**
493  * Calls fast assembler version of dotproduct for DSP.
494  * dotProduct_C62x is implemented in file dotprod.asm and expects input vectors
495  * of even length.
496  */
bbs_dotProduct_dsp(const int16 * vec1A,const int16 * vec2A,uint32 sizeA)497 int32 bbs_dotProduct_dsp( const int16* vec1A, const int16* vec2A, uint32 sizeA )
498 {
499 	if( sizeA & 1 )
500 	{
501 		int32 resultL;
502 		resultL = dotProduct_C62x( vec1A, vec2A, sizeA - 1 );
503 		return resultL + ( int32 ) *( vec1A + sizeA - 1 ) * *( vec2A + sizeA - 1 );
504 	}
505 	else
506 	{
507 		return dotProduct_C62x( vec1A, vec2A, sizeA );
508 	}
509 }
510 #endif /* HW_TMS320C6x */
511 
512 /* ------------------------------------------------------------------------- */
513 
514 /* 16 dot product for the PS2/EE processor */
515 /* input vectors MUST be 128 bit aligned ! */
516 
517 #if defined( epl_LINUX ) && defined( HW_EE )
518 
bbs_dotProduct_EE(const int16 * vec1A,const int16 * vec2A,uint32 sizeA)519 int32 bbs_dotProduct_EE( const int16* vec1A, const int16* vec2A, uint32 sizeA )
520 {
521 	int32 resultL = 0,
522 	      iL = sizeA >> 3,
523 	      jL = sizeA - ( iL << 3 );
524 
525 	if( iL > 0 )
526 	{
527 		/* multiply-add elements of input vectors in sets of 8 */
528 		int32 accL[ 4 ], t1L, t2L, t3L;
529 		asm volatile (
530 			"pxor %4, %2, %2\n\t"			/* reset 8 accumulators (LO and HI register) to 0 */
531 			"pmtlo %4\n\t"
532 			"pmthi %4\n\t"
533 
534 			"\n__begin_loop:\t"
535 
536 			"lq %2,0(%0)\n\t"				/* load 8 pairs of int16 */
537 			"lq %3,0(%1)\n\t"
538 
539 			"addi %0,%0,16\n\t"				/* vec1L += 16 */
540 			"addi %1,%1,16\n\t"				/* vec2L += 16 */
541 			"addi %7,%7,-1\n\t"				/* iL-- */
542 
543 			"pmaddh %4,%2,%3\n\t"			/* parallel multiply-add of 8 pairs of int16 */
544 
545 			"bgtzl %7,__begin_loop\n\t"		/* if iL > 0 goto _begin_loop */
546 
547 			"pmflo %2\n\t"					/* parallel add 8 accumulators , store remaining 4 accumulators in tmpL */
548 			"pmfhi %3\n\t"
549 			"paddw %4,%2,%3\n\t"
550 			"sq %4,0(%8)\n\t"
551 		: "=r" ( vec1A ), "=r" ( vec2A ), "=r" ( t1L ), "=r" ( t2L ), "=r" ( t3L )
552 		: "0" ( vec1A ), "1" ( vec2A ), "r" ( iL ), "r" ( accL )
553 		: "memory" );
554 
555 		/* add 4 parallel accumulators */
556 		resultL += accL[ 0 ] + accL[ 1 ] + accL[ 2 ] + accL[ 3 ];
557 	}
558 
559 	/* multiply-add remaining elements of input vectors */
560 	for( ; jL--; ) resultL += ( int32 ) *vec1A++ * *vec2A++;
561 
562 	return resultL;
563 }
564 
565 #endif
566 
567 /* ------------------------------------------------------------------------- */
568 
569 #if defined( HW_ARMv5TE )
570 
571 /* fast 16 dot product for ARM9E cores (DSP extensions).
572  * input vectors must be 32 bit aligned
573  */
bbs_dotProduct_arm9e(const int16 * vec1A,const int16 * vec2A,uint32 sizeA)574 int32 bbs_dotProduct_arm9e( const int16* vec1A, const int16* vec2A, uint32 sizeA )
575 {
576 	int32 accuL = 0;
577 
578 	int32* v1PtrL = ( int32* )vec1A;
579 	int32* v2PtrL = ( int32* )vec2A;
580 
581 	for( ; sizeA >= 4; sizeA -= 4 )
582 	{
583 		__asm {
584 		    smlabb accuL, *v1PtrL, *v2PtrL, accuL;
585 		    smlatt accuL, *v1PtrL, *v2PtrL, accuL;
586 		}
587 		v1PtrL++; v2PtrL++;
588 	    __asm {
589 		    smlabb accuL, *v1PtrL, *v2PtrL, accuL;
590 		    smlatt accuL, *v1PtrL, *v2PtrL, accuL;
591 		}
592 		v1PtrL++; v2PtrL++;
593 	}
594 
595 	vec1A = ( int16* )v1PtrL;
596 	vec2A = ( int16* )v2PtrL;
597 
598 	/* multiply-add remaining elements of input vectors */
599 	for( ; sizeA > 0; sizeA-- ) accuL += ( int32 )*vec1A++ * *vec2A++;
600 
601 	return accuL;
602 }
603 
604 #endif
605 
606 /* ------------------------------------------------------------------------- */
607 
608 /**
609  * Computes a fast dot product using standard C
610  */
bbs_dotProduct_stdc(const int16 * vec1A,const int16 * vec2A,uint32 sizeA)611 int32 bbs_dotProduct_stdc( const int16* vec1A, const int16* vec2A, uint32 sizeA )
612 {
613 	int32 accuL = 0;
614 
615 	for( ; sizeA >= 8; sizeA -= 8 )
616 	{
617 		accuL += ( int32 ) *vec1A * *vec2A;
618 		accuL += ( int32 ) *( vec1A + 1 ) * *( vec2A + 1 );
619 		accuL += ( int32 ) *( vec1A + 2 ) * *( vec2A + 2 );
620 		accuL += ( int32 ) *( vec1A + 3 ) * *( vec2A + 3 );
621 
622 		accuL += ( int32 ) *( vec1A + 4 ) * *( vec2A + 4 );
623 		accuL += ( int32 ) *( vec1A + 5 ) * *( vec2A + 5 );
624 		accuL += ( int32 ) *( vec1A + 6 ) * *( vec2A + 6 );
625 		accuL += ( int32 ) *( vec1A + 7 ) * *( vec2A + 7 );
626 
627 		vec1A += 8;
628 		vec2A += 8;
629 	}
630 
631 	for( ; sizeA; sizeA-- ) accuL += ( int32 ) *vec1A++ * *vec2A++;
632 
633 	return accuL;
634 }
635 
636 /* ------------------------------------------------------------------------- */
637 
bbs_dotProductInt16(const int16 * vec1A,const int16 * vec2A,uint32 sizeA)638 int32 bbs_dotProductInt16( const int16* vec1A, const int16* vec2A, uint32 sizeA )
639 {
640 /* PC */
641 #if ( defined( HW_i586 ) || defined( HW_i686 ) )
642 
643 	#if defined( HW_SSE2 )
644 		uint32 size16L = sizeA & 0xfffffff0;
645 		if( size16L )
646 		{
647 			if( ( (uint32)vec1A & 0xF ) == 0 && ( (uint32)vec2A & 0xF ) == 0 )
648 			{
649 				return bbs_dotProduct_128SSE2( vec1A, vec2A, sizeA );
650 			}
651 			else
652 			{
653 				return bbs_dotProduct_u128SSE2( vec1A, vec2A, sizeA );
654 			}
655 		}
656 	#elif !defined( WIN64 )
657 		/* MMX version (not supported by 64-bit compiler) */
658 		uint32 size16L = sizeA & 0xfffffff0;
659 		if( size16L )
660 		{
661 			if( sizeA == size16L )
662 			{
663 				return bbs_dotProduct_intelMMX16( vec1A, vec2A, size16L );
664 			}
665 			return bbs_dotProduct_intelMMX16( vec1A, vec2A, size16L )
666 					+ bbs_dotProduct_stdc( vec1A + size16L, vec2A + size16L, sizeA - size16L );
667 		} /* if( size16L ) */
668 	#endif
669 
670 	return bbs_dotProduct_stdc( vec1A, vec2A, sizeA );
671 
672 /* Playstation 2 */
673 #elif defined( HW_EE ) && defined( epl_LINUX )
674 
675 	if( ( (uint32)vec1A & 0xF ) == 0 && ( (uint32)vec2A & 0xF ) == 0 )
676 	{
677 		return bbs_dotProduct_EE( vec1A, vec2A, sizeA );
678 	}
679 	return bbs_dotProduct_stdc( vec1A, vec2A, sizeA );
680 
681 /* ARM9E */
682 #elif defined( HW_ARMv5TE )
683 
684 	return bbs_dotProduct_arm9e( vec1A, vec2A, sizeA );
685 
686 /* TI C6000 */
687 #elif defined( HW_TMS320C6x )
688 
689 	return bbs_dotProduct_dsp( vec1A, vec2A, sizeA );
690 
691 #elif defined( HW_FR71 )
692 
693 	uint32 size16L = sizeA & 0xfffffff0;
694 	if( size16L )
695 	{
696 		if( sizeA == size16L )
697 		{
698 			return bbs_dotProduct_fr71( vec1A, vec2A, size16L );
699 		}
700 		return bbs_dotProduct_fr71( vec1A, vec2A, size16L )
701 				+ bbs_dotProduct_stdc( vec1A + size16L, vec2A + size16L, sizeA - size16L );
702 	}
703 
704 	return bbs_dotProduct_stdc( vec1A, vec2A, sizeA );
705 
706 #endif
707 
708 	return bbs_dotProduct_stdc( vec1A, vec2A, sizeA );
709 }
710 
711 /* ------------------------------------------------------------------------- */
712 
713 /* table of fermi and slope values (result: 2.30; offset: .12)
714    referenced in b_NeuralNetEm/FastMlpNet.c, not not rename or remove */
715 const uint32 bbs_fermi_tableG[] =
716 {
717 	45056,      8,     77824,      13,    131072,     21,    217088,     34,
718 	356352,     57,    589824,     94,    974848,     155,   1609728,    255,
719 	2654208,    418,   4366336,    688,   7184384,    1126,  11796480,   1834,
720 	19308544,   2970,  31473664,   4748,  50921472,   7453,  81448960,   11363,
721 	127991808,  16573, 195874816,  22680, 288772096,  28469, 405381120,  32102,
722 	536870912,  32101, 668356608,  28469, 784965632,  22680, 877862912,  16573,
723 	945745920,  11363, 992288768,  7453,  1022816256, 4748,  1042264064, 2970,
724 	1054429184, 1834,  1061941248, 1126,  1066553344, 688,   1069371392, 418,
725 	1071083520, 255,   1072128000, 155,   1072762880, 94,    1073147904, 57,
726 	1073381376, 34,    1073520640, 21,    1073606656, 13,    1073659904, 8,
727 };
728 
bbs_fermi(int32 valA)729 int32 bbs_fermi( int32 valA )
730 {
731 	uint32 indexL = ( ( valA >> 15 ) + 20 ) << 1;
732 	uint32 offsL  = ( ( valA & 0x00007FFF ) + 4 ) >> 3;
733 	if( valA <  -655360 ) return 1;
734 	if( valA >=  655360 ) return 1073741824 - 1; /* ( 1 << 30 ) */
735 	return ( bbs_fermi_tableG[ indexL ] + offsL * bbs_fermi_tableG[ indexL + 1 ] );
736 }
737 
738 /* ------------------------------------------------------------------------- */
739 
bbs_uint32ReduceToNBits(uint32 * argPtrA,int32 * bbpPtrA,uint32 nBitsA)740 void bbs_uint32ReduceToNBits( uint32* argPtrA, int32* bbpPtrA, uint32 nBitsA )
741 {
742 	int32 posHighestBitL = bbs_intLog2( *argPtrA ) + 1;
743 	int32 shiftL = posHighestBitL - nBitsA;
744 	if( shiftL > 0 )
745 	{
746 		( *argPtrA ) >>= shiftL;
747 		( *bbpPtrA ) -= shiftL;
748 	}
749 }
750 
751 /* ------------------------------------------------------------------------- */
752 
bbs_int32ReduceToNBits(int32 * argPtrA,int32 * bbpPtrA,uint32 nBitsA)753 void bbs_int32ReduceToNBits( int32* argPtrA, int32* bbpPtrA, uint32 nBitsA )
754 {
755 	int32 posHighestBitL = bbs_intLog2( bbs_abs( *argPtrA ) ) + 1;
756 	int32 shiftL = posHighestBitL - nBitsA;
757 	if( shiftL > 0 )
758 	{
759 		( *argPtrA ) >>= shiftL;
760 		( *bbpPtrA ) -= shiftL;
761 	}
762 }
763 
764 /* ------------------------------------------------------------------------- */
765 
bbs_convertU32(uint32 srcA,int32 srcBbpA,int32 dstBbpA)766 uint32 bbs_convertU32( uint32 srcA, int32 srcBbpA, int32 dstBbpA )
767 {
768 	if( dstBbpA >= srcBbpA )
769 	{
770 		uint32 shiftL = dstBbpA - srcBbpA;
771 		if( srcA > ( ( uint32 )0xFFFFFFFF >> shiftL ) )
772 		{
773 			/* overflow */
774 			return ( uint32 )0xFFFFFFFF;
775 		}
776 		else
777 		{
778 			return srcA << shiftL;
779 		}
780 	}
781 	else
782 	{
783 		uint32 shiftL = srcBbpA - dstBbpA;
784 		uint32 addL = 1L << ( shiftL - 1 );
785 		if( srcA + addL < addL )
786 		{
787 			/* rounding would cause overflow */
788 			return srcA >> shiftL;
789 		}
790 		else
791 		{
792 			return ( srcA + addL ) >> shiftL;
793 		}
794 	}
795 }
796 
797 /* ------------------------------------------------------------------------- */
798 
bbs_convertS32(int32 srcA,int32 srcBbpA,int32 dstBbpA)799 int32 bbs_convertS32( int32 srcA, int32 srcBbpA, int32 dstBbpA )
800 {
801 	if( dstBbpA >= srcBbpA )
802 	{
803 		uint32 shiftL = ( uint32 )( dstBbpA - srcBbpA );
804 		if( srcA > ( ( int32 )0x7FFFFFFF >> shiftL ) )
805 		{
806 			/* overflow */
807 			return ( uint32 )0x7FFFFFFF;
808 		}
809 		else if( srcA < ( ( int32 )0x80000000 >> shiftL ) )
810 		{
811 			/* underflow */
812 			return ( int32 )0x80000000;
813 		}
814 		else
815 		{
816 			return srcA << shiftL;
817 		}
818 	}
819 	else
820 	{
821 		uint32 shiftL = ( uint32 )( srcBbpA - dstBbpA );
822 		int32 addL = 1L << ( shiftL - 1 );
823 		if( srcA + addL < addL )
824 		{
825 			/* rounding would cause overflow */
826 			return srcA >> shiftL;
827 		}
828 		else
829 		{
830 			return ( srcA + addL ) >> shiftL;
831 		}
832 	}
833 }
834 
835 /* ------------------------------------------------------------------------- */
836 
bbs_vecPowerFlt16(const int16 * xA,int16 nxA)837 int32 bbs_vecPowerFlt16( const int16 *xA, int16 nxA )
838 {
839 /*	#if defined( HW_TMS320C5x )
840 		uint32 rL;
841 		power( ( int16* ) xA, ( int32* ) &rL, nxA );  // does not work properly in DSPLib version 2.20.02
842 		return ( rL >> 1 );
843 	#else*/
844 		/* needs to be optimized */
845 		int32 rL = 0;
846 		for( ; nxA--; )
847 		{
848 			rL += ( int32 ) *xA * *xA;
849 			xA++;
850 		}
851 		return rL;
852 /*	#endif */
853 }
854 
855 /* ------------------------------------------------------------------------- */
856 
bbs_mulU32(uint32 v1A,uint32 v2A,uint32 * manPtrA,int32 * expPtrA)857 void bbs_mulU32( uint32 v1A, uint32 v2A, uint32* manPtrA, int32* expPtrA )
858 {
859 	uint32 log1L = bbs_intLog2( v1A );
860 	uint32 log2L = bbs_intLog2( v2A );
861 
862 	if( log1L + log2L < 32 )
863 	{
864 		*manPtrA = v1A * v2A;
865 		*expPtrA = 0;
866 	}
867 	else
868 	{
869 		uint32 v1L = v1A;
870 		uint32 v2L = v2A;
871 		uint32 exp1L = 0;
872 		uint32 exp2L = 0;
873 		if( log1L > 15 && log2L > 15 )
874 		{
875 			exp1L = log1L - 15;
876 			exp2L = log2L - 15;
877 			v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1;
878 			v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1;
879 		}
880 		else if( log1L > 15 )
881 		{
882 			exp1L = log1L + log2L - 31;
883 			v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1;
884 		}
885 		else
886 		{
887 			exp2L = log1L + log2L - 31;
888 			v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1;
889 		}
890 
891 		*manPtrA = v1L * v2L;
892 		*expPtrA = exp1L + exp2L;
893 	}
894 }
895 
896 /* ------------------------------------------------------------------------- */
897 
bbs_mulS32(int32 v1A,int32 v2A,int32 * manPtrA,int32 * expPtrA)898 void bbs_mulS32( int32 v1A, int32 v2A, int32* manPtrA, int32* expPtrA )
899 {
900 	uint32 log1L = bbs_intLog2( v1A > 0 ? v1A : -v1A );
901 	uint32 log2L = bbs_intLog2( v2A > 0 ? v2A : -v2A );
902 
903 	if( log1L + log2L < 30 )
904 	{
905 		*manPtrA = v1A * v2A;
906 		*expPtrA = 0;
907 	}
908 	else
909 	{
910 		int32 v1L = v1A;
911 		int32 v2L = v2A;
912 		int32 exp1L = 0;
913 		int32 exp2L = 0;
914 		if( log1L > 14 && log2L > 14 )
915 		{
916 			exp1L = log1L - 14;
917 			exp2L = log2L - 14;
918 			v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1;
919 			v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1;
920 		}
921 		else if( log1L > 14 )
922 		{
923 			exp1L = log1L + log2L - 29;
924 			v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1;
925 		}
926 		else
927 		{
928 			exp2L = log1L + log2L - 29;
929 			v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1;
930 		}
931 
932 		*manPtrA = v1L * v2L;
933 		*expPtrA = exp1L + exp2L;
934 	}
935 }
936 
937 /* ------------------------------------------------------------------------- */
938 
bbs_vecSqrNorm32(const int32 * vecA,uint32 sizeA,uint32 * manPtrA,uint32 * expPtrA)939 void bbs_vecSqrNorm32( const int32* vecA, uint32 sizeA, uint32* manPtrA, uint32* expPtrA )
940 {
941 	uint32 sumL = 0;
942 	int32 sumExpL = 0;
943 
944 	uint32 iL;
945 	for( iL = 0; iL < sizeA; iL++ )
946 	{
947 		int32 vL = vecA[ iL ];
948 		int32 logL = bbs_intLog2( vL > 0 ? vL : -vL );
949 		int32 expL = ( logL > 14 ) ? logL - 14 : 0;
950 		uint32 prdL;
951 
952 		if( expL >= 1 )
953 		{
954 			vL = ( ( vL >> ( expL - 1 ) ) + 1 ) >> 1;
955 		}
956 		else
957 		{
958 			vL = vL >> expL;
959 		}
960 
961 		prdL = vL * vL;
962 		expL <<= 1; /* now exponent of product */
963 
964 		if( sumExpL > expL )
965 		{
966 			uint32 shrL = sumExpL - expL;
967 			prdL = ( ( prdL >> ( shrL - 1 ) ) + 1 ) >> 1;
968 		}
969 		else if( expL > sumExpL )
970 		{
971 			uint32 shrL = expL - sumExpL;
972 			sumL = ( ( sumL >> ( shrL - 1 ) ) + 1 ) >> 1;
973 			sumExpL += shrL;
974 		}
975 
976 		sumL += prdL;
977 
978 		if( sumL > 0x80000000 )
979 		{
980 			sumL = ( sumL + 1 ) >> 1;
981 			sumExpL++;
982 		}
983 	}
984 
985 	/* make exponent even */
986 	if( ( sumExpL & 1 ) != 0 )
987 	{
988 		sumL = ( sumL + 1 ) >> 1;
989 		sumExpL++;
990 	}
991 
992 	if( manPtrA != NULL ) *manPtrA = sumL;
993 	if( expPtrA != NULL ) *expPtrA = sumExpL;
994 }
995 
996 /* ------------------------------------------------------------------------- */
997 
bbs_vecSqrNorm16(const int16 * vecA,uint32 sizeA,uint32 * manPtrA,uint32 * expPtrA)998 void bbs_vecSqrNorm16( const int16* vecA, uint32 sizeA, uint32* manPtrA, uint32* expPtrA )
999 {
1000 	uint32 sumL = 0;
1001 	int32 sumExpL = 0;
1002 
1003 	uint32 iL;
1004 	for( iL = 0; iL < sizeA; iL++ )
1005 	{
1006 		int32 vL = vecA[ iL ];
1007 		uint32 prdL = vL * vL;
1008 
1009 		if( sumExpL > 0 )
1010 		{
1011 			uint32 shrL = sumExpL;
1012 			prdL = ( ( prdL >> ( shrL - 1 ) ) + 1 ) >> 1;
1013 		}
1014 
1015 		sumL += prdL;
1016 
1017 		if( sumL > 0x80000000 )
1018 		{
1019 			sumL = ( sumL + 1 ) >> 1;
1020 			sumExpL++;
1021 		}
1022 	}
1023 
1024 	/* make exponent even */
1025 	if( ( sumExpL & 1 ) != 0 )
1026 	{
1027 		sumL = ( sumL + 1 ) >> 1;
1028 		sumExpL++;
1029 	}
1030 
1031 	if( manPtrA != NULL ) *manPtrA = sumL;
1032 	if( expPtrA != NULL ) *expPtrA = sumExpL;
1033 }
1034 
1035 /* ------------------------------------------------------------------------- */
1036 
bbs_vecNorm16(const int16 * vecA,uint32 sizeA)1037 uint32 bbs_vecNorm16( const int16* vecA, uint32 sizeA )
1038 {
1039 	uint32 manL;
1040 	uint32 expL;
1041 	bbs_vecSqrNorm16( vecA, sizeA, &manL, &expL );
1042 	manL = bbs_sqrt32( manL );
1043 	return manL << ( expL >> 1 );
1044 }
1045 
1046 /* ------------------------------------------------------------------------- */
1047 
bbs_matMultiplyFlt16(const int16 * x1A,int16 row1A,int16 col1A,const int16 * x2A,int16 col2A,int16 * rA)1048 void bbs_matMultiplyFlt16( const int16 *x1A, int16 row1A, int16 col1A, const int16 *x2A, int16 col2A, int16 *rA )
1049 {
1050 	#if defined( HW_TMS320C5x )
1051 		/* operands need to be in internal memory for mmul*/
1052 		if( x1A > ( int16* ) bbs_C5X_INTERNAL_MEMORY_SIZE ||
1053 			x2A > ( int16* ) bbs_C5X_INTERNAL_MEMORY_SIZE )
1054 		{
1055 			int16 iL,jL,kL;
1056 			int16 *ptr1L, *ptr2L;
1057 			int32 sumL;
1058 
1059 			for( iL = 0; iL < row1A; iL++ )
1060 			{
1061 				for( jL = 0; jL < col2A; jL++ )
1062 				{
1063 					ptr1L = ( int16* ) x1A + iL * col1A;
1064 					ptr2L = ( int16* ) x2A + jL;
1065 					sumL = 0;
1066 					for( kL = 0; kL < col1A; kL++ )
1067 					{
1068 						sumL += ( ( int32 ) *ptr1L++ * *ptr2L );
1069 						ptr2L += col2A;
1070 					}
1071 					*rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */
1072 				}
1073 			}
1074 		}
1075 		else mmul( ( int16* ) x1A, row1A, col1A, ( int16* ) x2A, col1A, col2A, rA );
1076 
1077 	#elif defined( HW_ARMv4 ) || defined( HW_ARMv5TE )
1078 
1079 		int32 iL, jL, kL;
1080 		int16 *ptr1L, *ptr2L;
1081 		int32 sumL;
1082 		for( iL = 0; iL < row1A; iL++ )
1083 		{
1084 			for( jL = 0; jL < col2A; jL++ )
1085 			{
1086 				ptr1L = ( int16* ) x1A + iL * col1A;
1087 				ptr2L = ( int16* ) x2A + jL;
1088 				sumL = 0;
1089 				for( kL = col1A; kL >= 4; kL -= 4 )
1090 				{
1091 					sumL += ( ( int32 ) *ptr1L++ * *ptr2L );
1092 					sumL += ( ( int32 ) *ptr1L++ * *( ptr2L += col2A ) );
1093 					sumL += ( ( int32 ) *ptr1L++ * *( ptr2L += col2A ) );
1094 					sumL += ( ( int32 ) *ptr1L++ * *( ptr2L += col2A ) );
1095 					ptr2L += col2A;
1096 				}
1097 				for( ; kL > 0; kL-- )
1098 				{
1099 					sumL += ( ( int32 ) *ptr1L++ * *ptr2L );
1100 					ptr2L += col2A;
1101 				}
1102 				*rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */
1103 			}
1104 		}
1105 	#else
1106 		/* needs to be optimized */
1107 		int16 iL,jL,kL;
1108 		int16 *ptr1L, *ptr2L;
1109 		int32 sumL;
1110 
1111 		for( iL = 0; iL < row1A; iL++ )
1112 		{
1113 			for( jL = 0; jL < col2A; jL++ )
1114 			{
1115 				ptr1L = ( int16* ) x1A + iL * col1A;
1116 				ptr2L = ( int16* ) x2A + jL;
1117 				sumL = 0;
1118 				for( kL = 0; kL < col1A; kL++ )
1119 				{
1120 					sumL += ( ( int32 ) *ptr1L++ * *ptr2L );
1121 					ptr2L += col2A;
1122 				}
1123 				*rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */
1124 			}
1125 		}
1126 	#endif
1127 }
1128 
1129 /* ------------------------------------------------------------------------- */
1130 
bbs_matMultiplyTranspFlt16(const int16 * x1A,int16 row1A,int16 col1A,const int16 * x2A,int16 col2A,int16 * rA)1131 void bbs_matMultiplyTranspFlt16( const int16 *x1A, int16 row1A, int16 col1A,
1132 								 const int16 *x2A, int16 col2A, int16 *rA )
1133 {
1134 	const int16* ptr1L = x1A;
1135 
1136 	int32 iL;
1137 	for( iL = row1A; iL > 0 ; iL-- )
1138 	{
1139 		int32 jL;
1140 		const int16* ptr2L = x2A;
1141 		for( jL = col2A; jL > 0 ; jL-- )
1142 		{
1143 			int32 kL;
1144 			int32 sumL = 0;
1145 			for( kL = col1A >> 2; kL > 0; kL-- )
1146 			{
1147 				sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ );
1148 				sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ );
1149 				sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ );
1150 				sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ );
1151 			}
1152 			for( kL = col1A & 3; kL > 0; kL-- )
1153 			{
1154 				sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ );
1155 			}
1156 			*rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */
1157 			ptr1L -= col1A;
1158 		}
1159 		ptr1L += col1A;
1160 	}
1161 }
1162 
1163 /* ------------------------------------------------------------------------- */
1164 
1165 
1166 #ifndef mtrans
bbs_matTrans(int16 * xA,int16 rowA,int16 colA,int16 * rA)1167 uint16 bbs_matTrans( int16 *xA, int16 rowA, int16 colA, int16 *rA )
1168 {
1169 	/* needs to be optimized */
1170 	int16 iL;
1171 	for( iL = colA; iL--; )
1172 	{
1173 		int16* sL = xA++;
1174 		int16 jL;
1175 		for( jL = rowA; jL--; )
1176 		{
1177 			*rA++ = *sL;
1178 			sL += colA;
1179 		}
1180 	}
1181 	return 0;
1182 }
1183 #endif
1184 
1185 /* ------------------------------------------------------------------------- */
1186 #ifndef atan2_16
bbs_atan2(int16 nomA,int16 denomA)1187 int16 bbs_atan2( int16 nomA, int16 denomA )
1188 {
1189 	int16 phL, argL;
1190 
1191 	if( nomA == denomA ) return 8192;
1192 	argL = ( ( int32 ) nomA << 15 ) / denomA;
1193 
1194 	/* 0.318253*2 x      20857 .15
1195 	  +0.003314*2 x^2      217 .15
1196 	  -0.130908*2 x^3    -8580 .15
1197 	  +0.068542*2 x^4     4491 .15
1198 	  -0.009159*2 x^5     -600 .15 */
1199 
1200 	phL = -600;
1201 	phL = ( ( ( int32 ) phL * argL ) >> 15 ) + 4481;
1202 	phL = ( ( ( int32 ) phL * argL ) >> 15 ) - 8580;
1203 	phL = ( ( ( int32 ) phL * argL ) >> 15 ) + 217;
1204 	phL = ( ( ( int32 ) phL * argL ) >> 15 ) + 20857;
1205 	phL = ( ( int32 ) phL * argL ) >> 15;
1206 
1207 	return phL >> 1; /* /2 */
1208 }
1209 
1210 /* needs to be optimized */
bbs_vecPhase(int16 * reA,int16 * imA,int16 * phaseA,uint16 sizeA)1211 uint16 bbs_vecPhase( int16 *reA, int16 *imA, int16 *phaseA, uint16 sizeA )
1212 {
1213 	for( ; sizeA--; )
1214 	{
1215 		int16 reL = *reA++;
1216 		int16 imL = *imA++;
1217 		int16 phL = 0;
1218 
1219 		if( reL < 0 )
1220 		{
1221 			reL = -reL;
1222 			if( imL < 0 )
1223 			{
1224 				imL = -imL;
1225 				if( reL > imL )
1226 				{
1227 					phL = -32768 + bbs_atan2( imL, reL );
1228 				}
1229 				else
1230 				{
1231 					phL = -16384 - bbs_atan2( reL, imL );
1232 				}
1233 			}
1234 			else
1235 			{
1236 				if( reL > imL )
1237 				{
1238 					phL = -( -32768 + bbs_atan2( imL, reL ) );
1239 				}
1240 				else
1241 				{
1242 					if( imL == 0 ) phL = 0;
1243 					else phL = 16384 + bbs_atan2( reL, imL );
1244 				}
1245 			}
1246 		}
1247 		else
1248 		{
1249 			if( imL < 0 )
1250 			{
1251 				imL = -imL;
1252 				if( reL > imL )
1253 				{
1254 					phL = -bbs_atan2( imL, reL );
1255 				}
1256 				else
1257 				{
1258 					phL = -16384 + bbs_atan2( reL, imL );
1259 				}
1260 			}
1261 			else
1262 			{
1263 				if( reL > imL )
1264 				{
1265 					phL = bbs_atan2( imL, reL );
1266 				}
1267 				else
1268 				{
1269 					if( imL == 0 ) phL = 0;
1270 					else phL = 16384 - bbs_atan2( reL, imL );
1271 				}
1272 			}
1273 		}
1274 
1275 		*phaseA++ = phL;
1276 	}
1277 	return 0;
1278 }
1279 
1280 #endif
1281 
1282 /* ------------------------------------------------------------------------- */
1283