1/*
2 * Copyright (C) 2008 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/* ---- includes ----------------------------------------------------------- */
18
19#include "b_BasicEm/Math.h"
20#include "b_BasicEm/Functions.h"
21
22/* ---- related objects  --------------------------------------------------- */
23
24/* ---- typedefs ----------------------------------------------------------- */
25
26/* ---- constants ---------------------------------------------------------- */
27
28/* ------------------------------------------------------------------------- */
29
30/* ========================================================================= */
31/*                                                                           */
32/* ---- \ghd{ external functions } ----------------------------------------- */
33/*                                                                           */
34/* ========================================================================= */
35
36#if defined( HW_SSE2 )
37	extern int32 bbs_dotProduct_128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA );
38	extern int32 bbs_dotProduct_u128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA );
39#endif
40
41#if defined( HW_FR71 )
42	int32 bbs_dotProduct_fr71( const int16* vec1A, const int16* vec2A, uint32 sizeA );
43#endif
44
45/* ------------------------------------------------------------------------- */
46
47uint16 bbs_sqrt32( uint32 valA )
48{
49	uint32 rootL = 0;
50	uint32 expL = 0;
51	expL += ( ( valA >> ( expL + 0x10 ) ) != 0 ) << 4;
52	expL += ( ( valA >> ( expL + 0x08 ) ) != 0 ) << 3;
53	expL += ( ( valA >> ( expL + 0x04 ) ) != 0 ) << 2;
54	expL += ( ( valA >> ( expL + 0x02 ) ) != 0 ) << 1;
55	switch( expL >> 1 )
56	{
57		case 15: rootL += ( ( rootL + 0x8000 ) * ( rootL + 0x8000 ) <= valA ) << 15;
58		case 14: rootL += ( ( rootL + 0x4000 ) * ( rootL + 0x4000 ) <= valA ) << 14;
59		case 13: rootL += ( ( rootL + 0x2000 ) * ( rootL + 0x2000 ) <= valA ) << 13;
60		case 12: rootL += ( ( rootL + 0x1000 ) * ( rootL + 0x1000 ) <= valA ) << 12;
61		case 11: rootL += ( ( rootL + 0x0800 ) * ( rootL + 0x0800 ) <= valA ) << 11;
62		case 10: rootL += ( ( rootL + 0x0400 ) * ( rootL + 0x0400 ) <= valA ) << 10;
63		case 9:  rootL += ( ( rootL + 0x0200 ) * ( rootL + 0x0200 ) <= valA ) << 9;
64		case 8:  rootL += ( ( rootL + 0x0100 ) * ( rootL + 0x0100 ) <= valA ) << 8;
65		case 7:  rootL += ( ( rootL + 0x0080 ) * ( rootL + 0x0080 ) <= valA ) << 7;
66		case 6:  rootL += ( ( rootL + 0x0040 ) * ( rootL + 0x0040 ) <= valA ) << 6;
67		case 5:  rootL += ( ( rootL + 0x0020 ) * ( rootL + 0x0020 ) <= valA ) << 5;
68		case 4:  rootL += ( ( rootL + 0x0010 ) * ( rootL + 0x0010 ) <= valA ) << 4;
69		case 3:  rootL += ( ( rootL + 0x0008 ) * ( rootL + 0x0008 ) <= valA ) << 3;
70		case 2:  rootL += ( ( rootL + 0x0004 ) * ( rootL + 0x0004 ) <= valA ) << 2;
71		case 1:  rootL += ( ( rootL + 0x0002 ) * ( rootL + 0x0002 ) <= valA ) << 1;
72		case 0:  rootL += ( ( rootL + 0x0001 ) * ( rootL + 0x0001 ) <= valA ) << 0;
73	}
74
75	return ( uint16 )rootL;
76}
77
78/* ------------------------------------------------------------------------- */
79
80uint8 bbs_sqrt16( uint16 valA )
81{
82	uint16 rootL = 0;
83	rootL += ( ( rootL + 0x80 ) * ( rootL + 0x80 ) <= valA ) << 7;
84	rootL += ( ( rootL + 0x40 ) * ( rootL + 0x40 ) <= valA ) << 6;
85	rootL += ( ( rootL + 0x20 ) * ( rootL + 0x20 ) <= valA ) << 5;
86	rootL += ( ( rootL + 0x10 ) * ( rootL + 0x10 ) <= valA ) << 4;
87	rootL += ( ( rootL + 0x08 ) * ( rootL + 0x08 ) <= valA ) << 3;
88	rootL += ( ( rootL + 0x04 ) * ( rootL + 0x04 ) <= valA ) << 2;
89	rootL += ( ( rootL + 0x02 ) * ( rootL + 0x02 ) <= valA ) << 1;
90	rootL += ( ( rootL + 0x01 ) * ( rootL + 0x01 ) <= valA ) << 0;
91	return ( uint8 )rootL;
92}
93
94/* ------------------------------------------------------------------------- */
95
96/* table of sqrt and slope values */
97const uint32 bbs_fastSqrt32_tableG[] =
98{
99	268435456, 1016, 272596992, 1000, 276692992, 987, 280735744, 972,
100	284717056, 959, 288645120, 946, 292519936, 933, 296341504, 922,
101	300118016, 910, 303845376, 899, 307527680, 889, 311169024, 878,
102	314765312, 869, 318324736, 858, 321839104, 850, 325320704, 840,
103	328761344, 832, 332169216, 824, 335544320, 815, 338882560, 807,
104	342188032, 799, 345460736, 792, 348704768, 785, 351920128, 777,
105	355102720, 771, 358260736, 764, 361390080, 757, 364490752, 751,
106	367566848, 745, 370618368, 739, 373645312, 732, 376643584, 727,
107	379621376, 722, 382578688, 715, 385507328, 711, 388419584, 705,
108	391307264, 700, 394174464, 695, 397021184, 689, 399843328, 686,
109	402653184, 680, 405438464, 675, 408203264, 672, 410955776, 666,
110	413683712, 663, 416399360, 658, 419094528, 653, 421769216, 650,
111	424431616, 646, 427077632, 641, 429703168, 638, 432316416, 634,
112	434913280, 630, 437493760, 627, 440061952, 622, 442609664, 620,
113	445149184, 615, 447668224, 613, 450179072, 609, 452673536, 605,
114	455151616, 602, 457617408, 600, 460075008, 595, 462512128, 593,
115	464941056, 590, 467357696, 587, 469762048, 583, 472150016, 581,
116	474529792, 578, 476897280, 575, 479252480, 572, 481595392, 569,
117	483926016, 567, 486248448, 564, 488558592, 561, 490856448, 559,
118	493146112, 556, 495423488, 553, 497688576, 552, 499949568, 548,
119	502194176, 546, 504430592, 544, 506658816, 541, 508874752, 539,
120	511082496, 537, 513282048, 534, 515469312, 533, 517652480, 529,
121	519819264, 528, 521981952, 526, 524136448, 523, 526278656, 521,
122	528412672, 519, 530538496, 517, 532656128, 515, 534765568, 514
123};
124
125uint16 bbs_fastSqrt32( uint32 valA )
126{
127	uint32 expL = 0;
128	uint32 valL;
129	uint32 offsL;
130	uint32 indexL = 0;
131
132	if( valA == 0 ) return 0;
133
134	/* compute closest even size exponent of valA */
135	expL += ( ( valA >> ( expL + 0x10 ) ) != 0 ) << 4;
136	expL += ( ( valA >> ( expL + 0x08 ) ) != 0 ) << 3;
137	expL += ( ( valA >> ( expL + 0x04 ) ) != 0 ) << 2;
138	expL += ( ( valA >> ( expL + 0x02 ) ) != 0 ) << 1;
139
140	valL = ( ( valA << ( 30 - expL ) ) - 1073741824 ); /* ( 1 << 30 ) */
141	offsL = ( ( valL & 0x01FFFFFF ) + ( 1 << 12 ) ) >> 13;
142	indexL = ( valL >> 24 ) & 0xFE;
143
144	return ( bbs_fastSqrt32_tableG[ indexL ] + offsL * bbs_fastSqrt32_tableG[ indexL + 1 ] ) >> ( 28 - ( expL >> 1 ) );
145}
146
147/* ------------------------------------------------------------------------- */
148
149/* table of 1/sqrt (1.31) and negative slope (.15) values
150   referenced in b_GaborCueEm/focusDispAsm.s55, do not rename or remove ! */
151const uint32 bbs_invSqrt32_tableG[] =
152{
153	2147483648u, 1001, 2114682880, 956, 2083356672, 915, 2053373952, 877,
154	2024636416, 840, 1997111296, 808, 1970634752, 776, 1945206784, 746,
155	1920761856, 720, 1897168896, 693, 1874460672, 669, 1852538880, 646,
156	1831370752, 625, 1810890752, 604, 1791098880, 584, 1771962368, 567,
157	1753382912, 548, 1735426048, 533, 1717960704, 516, 1701052416, 502,
158	1684602880, 487, 1668644864, 474, 1653112832, 461, 1638006784, 448,
159	1623326720, 436, 1609039872, 426, 1595080704, 414, 1581514752, 404,
160	1568276480, 394, 1555365888, 384, 1542782976, 375, 1530494976, 367,
161	1518469120, 357, 1506770944, 350, 1495302144, 342, 1484095488, 334,
162	1473150976, 327, 1462435840, 320, 1451950080, 313, 1441693696, 307,
163	1431633920, 300, 1421803520, 294, 1412169728, 289, 1402699776, 282,
164	1393459200, 277, 1384382464, 272, 1375469568, 266, 1366753280, 262,
165	1358168064, 257, 1349746688, 251, 1341521920, 248, 1333395456, 243,
166	1325432832, 238, 1317634048, 235, 1309933568, 230, 1302396928, 227,
167	1294958592, 222, 1287684096, 219, 1280507904, 216, 1273430016, 211,
168	1266515968, 209, 1259667456, 205, 1252950016, 202, 1246330880, 198,
169	1239842816, 196, 1233420288, 192, 1227128832, 190, 1220902912, 187,
170	1214775296, 184, 1208745984, 181, 1202814976, 179, 1196949504, 176,
171	1191182336, 173, 1185513472, 171, 1179910144, 169, 1174372352, 166,
172	1168932864, 164, 1163558912, 162, 1158250496, 160, 1153007616, 157,
173	1147863040, 155, 1142784000, 154, 1137737728, 151, 1132789760, 149,
174	1127907328, 148, 1123057664, 145, 1118306304, 144, 1113587712, 142,
175	1108934656, 140, 1104347136, 138, 1099825152, 137, 1095335936, 135,
176	1090912256, 134, 1086521344, 131, 1082228736, 131, 1077936128, 128
177};
178
179uint32 bbs_invSqrt32( uint32 valA )
180{
181
182	uint32 expL = 0;
183	uint32 valL;
184	uint32 offsL;
185	uint32 indexL = 0;
186
187	if( valA == 0U ) return 0x80000000;
188
189	/* compute closest even size exponent of valA */
190	expL += ( ( valA >> ( expL + 0x10 ) ) != 0 ) << 4;
191	expL += ( ( valA >> ( expL + 0x08 ) ) != 0 ) << 3;
192	expL += ( ( valA >> ( expL + 0x04 ) ) != 0 ) << 2;
193	expL += ( ( valA >> ( expL + 0x02 ) ) != 0 ) << 1;
194
195	valL = ( ( valA << ( 30 - expL ) ) - 1073741824 ); /* ( 1 << 30 ) */
196	offsL = ( ( valL & 0x01FFFFFF ) + ( 1 << 9 ) ) >> 10;
197	indexL = ( valL >> 24 ) & 0xFE;
198
199	return ( bbs_invSqrt32_tableG[ indexL ] - offsL * bbs_invSqrt32_tableG[ indexL + 1 ] ) >> ( expL >> 1 );
200}
201
202/* ------------------------------------------------------------------------- */
203
204/* table of 1/( x + 1 ) (2.30) and negative slope (.14) values
205   referenced in b_GaborCueEm/focusDispAsm.s55, do not rename or remove ! */
206const int32 bbs_inv32_tableG[] =
207{
208	1073741824, 1986, 1041203200, 1870, 1010565120, 1762, 981696512, 1664,
209	954433536,  1575, 928628736,  1491, 904200192,  1415, 881016832, 1345,
210	858980352,  1278, 838041600,  1218, 818085888,  1162, 799047680, 1108,
211	780894208,  1059, 763543552,  1013, 746946560,  970,  731054080, 930,
212	715816960,  891,  701218816,  856,  687194112,  823,  673710080, 791,
213	660750336,  761,  648282112,  732,  636289024,  706,  624721920, 681,
214	613564416,  657,  602800128,  635,  592396288,  613,  582352896, 592,
215	572653568,  573,  563265536,  554,  554188800,  537,  545390592, 520,
216};
217
218int32 bbs_inv32( int32 valA )
219{
220
221	uint32 expL = 0;
222	int32 signL = ( ( valA >> 30 ) & 0xFFFFFFFE ) + 1;
223	int32 valL = signL * valA;
224	int32 offsL;
225	uint32 indexL = 0;
226
227	if( valL <= ( int32 ) 1 ) return 0x40000000 * signL;
228
229	/* compute size exponent of valL */
230	expL += ( ( valL >> ( expL + 0x10 ) ) != 0 ) << 4;
231	expL += ( ( valL >> ( expL + 0x08 ) ) != 0 ) << 3;
232	expL += ( ( valL >> ( expL + 0x04 ) ) != 0 ) << 2;
233	expL += ( ( valL >> ( expL + 0x02 ) ) != 0 ) << 1;
234	expL += ( ( valL >> ( expL + 0x01 ) ) != 0 );
235
236	valL = ( ( valL << ( 30 - expL ) ) - 1073741824 ); /*( 1U << 30 )*/
237	offsL = ( ( valL & 0x01FFFFFF ) + ( 1 << 10 ) ) >> 11;
238	indexL = ( valL >> 24 ) & 0xFE;
239
240	return signL * ( ( ( ( bbs_inv32_tableG[ indexL ] - offsL * bbs_inv32_tableG[ indexL + 1 ] ) >> ( expL - 1 ) ) + 1 ) >> 1 );
241}
242
243/* ------------------------------------------------------------------------- */
244
245uint32 bbs_intLog2( uint32 valA )
246{
247	uint32 expL = 0;
248	expL += 0x10 * ( ( valA >> ( expL + 0x10 ) ) != 0 );
249	expL += 0x08 * ( ( valA >> ( expL + 0x08 ) ) != 0 );
250	expL += 0x04 * ( ( valA >> ( expL + 0x04 ) ) != 0 );
251	expL += 0x02 * ( ( valA >> ( expL + 0x02 ) ) != 0 );
252	expL += 0x01 * ( ( valA >> ( expL + 0x01 ) ) != 0 );
253	return expL;
254}
255
256/* ------------------------------------------------------------------------- */
257
258const uint32 bbs_pow2M1_tableG[] =
259{
260	0,			713,	46769127,	721,	94047537,	729,	141840775,	737,
261	190154447,	745,	238994221,	753,	288365825,	761,	338275050,	769,
262	388727751,	778,	439729846,	786,	491287318,	795,	543406214,	803,
263	596092647,	812,	649352798,	821,	703192913,	830,	757619310,	839,
264	812638371,	848,	868256550,	857,	924480371,	867,	981316430,	876,
265	1038771393, 886,	1096851999, 895,	1155565062, 905,	1214917468, 915,
266	1274916179, 925,	1335568233, 935,	1396880745, 945,	1458860907, 956,
267	1521515988, 966,	1584853338, 976,	1648880387, 987,	1713604645, 998,
268	1779033703, 1009,	1845175238, 1020,	1912037006, 1031,	1979626852, 1042,
269	2047952702, 1053,	2117022572, 1065,	2186844564u, 1077,	2257426868u, 1088,
270	2328777762u, 1100,	2400905617u, 1112,	2473818892u, 1124,	2547526141u, 1136,
271	2622036010u, 1149,	2697357237u, 1161,	2773498659u, 1174,	2850469207u, 1187,
272	2928277909u, 1200,	3006933892u, 1213,	3086446383u, 1226,	3166824708u, 1239,
273	3248078296u, 1253,	3330216677u, 1266,	3413249486u, 1280,	3497186464u, 1294,
274	3582037455u, 1308,	3667812413u, 1323,	3754521399u, 1337,	3842174584u, 1352,
275	3930782250u, 1366,	4020354790u, 1381,	4110902710u, 1396,	4202436633u, 1411
276};
277
278uint32 bbs_pow2M1( uint32 valA )
279{
280	uint32 offsL = ( valA & 0x03FFFFFF ) >> 10;
281	uint16 indexL = ( ( valA & 0xFC000000 ) >> 26 ) << 1;
282	return bbs_pow2M1_tableG[ indexL ] + offsL * bbs_pow2M1_tableG[ indexL + 1 ];
283}
284
285/* ------------------------------------------------------------------------- */
286
287uint32 bbs_pow2( int32 valA )
288{
289	int32 shiftL = 16 - ( valA >> 27 );
290	uint32 offsL  = ( uint32 )( valA << 5 );
291	if( shiftL == 32 ) return 1;
292	return ( 1 << ( 32 - shiftL ) ) + ( bbs_pow2M1( offsL ) >> shiftL );
293}
294
295/* ------------------------------------------------------------------------- */
296
297uint32 bbs_exp( int32 valA )
298{
299	int32 adjustedL;
300	int32 shiftL;
301	int32 offsL;
302
303	/* check boundaries to avoid overflow */
304	if( valA < -1488522236 )
305	{
306		return 0;
307	}
308	else if( valA > 1488522236 )
309	{
310		return 0xFFFFFFFF;
311	}
312
313	/* multily valA with 1/ln(2) in order to use function 2^x instead of e^x */
314	adjustedL = ( valA >> 16 ) * 94548 + ( ( ( ( ( uint32 )valA ) & 0x0FFFF ) * 47274 ) >> 15 );
315
316	shiftL = 16 - ( adjustedL >> 27 );
317	if( shiftL == 32 ) return 1;
318	offsL  = ( uint32 )( adjustedL << 5 );
319	return ( ( int32 ) 1 << ( 32 - shiftL ) ) + ( bbs_pow2M1( offsL ) >> shiftL );
320}
321
322/* ------------------------------------------------------------------------- */
323
324int16 bbs_satS16( int32 valA )
325{
326	if( valA > 32767 ) return 32767;
327	if( valA < -32768 ) return -32768;
328	return valA;
329}
330
331/* ------------------------------------------------------------------------- */
332
333#if defined( HW_i586 ) || defined( HW_i686 )
334
335/* Windows section */
336#if defined( WIN32 ) && !defined( WIN64 )
337
338/* disable warning "no return value"*/
339#pragma warning( disable : 4035 )
340
341/**
342 * computes a fast dot product using intel MMX, sizeA must be multiple of 16 and >0
343 */
344int32 bbs_dotProduct_intelMMX16( const int16* vec1A, const int16* vec2A, uint32 sizeA )
345{
346	__asm
347	{
348			push    esi
349			push    edi
350
351			mov     eax, vec1A
352			mov     ebx, vec2A
353
354			mov     ecx, sizeA
355
356			pxor    mm4, mm4
357			pxor    mm6, mm6
358
359			pxor    mm7, mm7
360			shr		ecx, 4
361
362		inner_loop_start:
363			movq    mm0, 0[eax]
364			paddd   mm7, mm4
365
366			movq    mm1, 0[ebx]
367			paddd   mm7, mm6
368
369			movq    mm2, 8[eax]
370			pmaddwd mm0, mm1
371
372			movq    mm3, 8[ebx]
373
374			movq    mm4, 16[eax]
375			pmaddwd mm2, mm3
376
377			movq    mm5, 16[ebx]
378			paddd   mm7, mm0
379
380			movq    mm6, 24[eax]
381			pmaddwd mm4, mm5
382
383			pmaddwd mm6, 24[ebx]
384			paddd   mm7, mm2
385
386			add     eax, 32
387			add     ebx, 32
388
389			dec     ecx
390			jnz     inner_loop_start
391
392			paddd   mm7, mm4
393
394			paddd   mm7, mm6
395
396			movq    mm0, mm7
397
398			psrlq   mm0, 32
399
400			paddd   mm7, mm0
401
402			movd    eax, mm7
403
404			emms
405			pop     edi
406			pop     esi
407	}
408}
409
410#pragma warning( default : 4035 )
411
412/* gcc compiler section */
413#elif defined( epl_LINUX ) || defined( CYGWIN )
414
415/**
416 * computes a fast dot product using intel MMX, sizeA must be multiple of 16 and >0
417 */
418int32 bbs_dotProduct_intelMMX16( const int16* vec1A, const int16* vec2A, uint32 sizeA )
419{
420	int32 resultL;
421
422	__asm__ __volatile__(
423
424			"movl %1,%%eax\n\t"
425			"movl %2,%%ebx\n\t"
426
427			"movl %3,%%ecx\n\t"
428
429			"pxor %%mm4,%%mm4\n\t"
430			"pxor %%mm6,%%mm6\n\t"
431
432			"pxor %%mm7, %%mm7\n\t"
433			"shrl $4, %%ecx\n\t"
434
435			"\n1:\t"
436			"movq 0(%%eax),%%mm0\n\t"
437			"paddd %%mm4,%%mm7\n\t"
438
439			"movq 0( %%ebx ),%%mm1\n\t"
440			"paddd %%mm6,%%mm7\n\t"
441
442			"movq 8( %%eax ),%%mm2\n\t"
443			"pmaddwd %%mm1,%%mm0\n\t"
444
445			"movq 8( %%ebx ),%%mm3\n\t"
446
447			"movq 16( %%eax ),%%mm4\n\t"
448			"pmaddwd %%mm3,%%mm2\n\t"
449
450			"movq 16( %%ebx ),%%mm5\n\t"
451			"paddd %%mm0,%%mm7\n\t"
452
453			"movq 24( %%eax ),%%mm6\n\t"
454			"pmaddwd %%mm5,%%mm4\n\t"
455
456			"pmaddwd 24( %%ebx ),%%mm6\n\t"
457			"paddd %%mm2,%%mm7\n\t"
458
459			"addl $32,%%eax\n\t"
460			"addl $32,%%ebx\n\t"
461
462			"decl %%ecx\n\t"
463			"jnz 1b\n\t"
464
465			"paddd %%mm4,%%mm7\n\t"
466			"paddd %%mm6,%%mm7\n\t"
467
468			"movq  %%mm7,%%mm0\n\t"
469
470			"psrlq $32,%%mm0\n\t"
471
472			"paddd %%mm0,%%mm7\n\t"
473
474			"movd %%mm7,%0\n\t"
475
476			"emms\n\t"
477
478		: "=&g" ( resultL )
479		: "g" ( vec1A ), "g" ( vec2A ), "g" ( sizeA )
480		: "si", "di", "ax", "bx", "cx", "st", "memory" );
481
482	return resultL;
483}
484
485#endif /* epl_LINUX, CYGWIN */
486
487#endif /* HW_i586 || HW_i686 */
488
489/* ------------------------------------------------------------------------- */
490
491#ifdef HW_TMS320C6x
492/**
493 * Calls fast assembler version of dotproduct for DSP.
494 * dotProduct_C62x is implemented in file dotprod.asm and expects input vectors
495 * of even length.
496 */
497int32 bbs_dotProduct_dsp( const int16* vec1A, const int16* vec2A, uint32 sizeA )
498{
499	if( sizeA & 1 )
500	{
501		int32 resultL;
502		resultL = dotProduct_C62x( vec1A, vec2A, sizeA - 1 );
503		return resultL + ( int32 ) *( vec1A + sizeA - 1 ) * *( vec2A + sizeA - 1 );
504	}
505	else
506	{
507		return dotProduct_C62x( vec1A, vec2A, sizeA );
508	}
509}
510#endif /* HW_TMS320C6x */
511
512/* ------------------------------------------------------------------------- */
513
514/* 16 dot product for the PS2/EE processor */
515/* input vectors MUST be 128 bit aligned ! */
516
517#if defined( epl_LINUX ) && defined( HW_EE )
518
519int32 bbs_dotProduct_EE( const int16* vec1A, const int16* vec2A, uint32 sizeA )
520{
521	int32 resultL = 0,
522	      iL = sizeA >> 3,
523	      jL = sizeA - ( iL << 3 );
524
525	if( iL > 0 )
526	{
527		/* multiply-add elements of input vectors in sets of 8 */
528		int32 accL[ 4 ], t1L, t2L, t3L;
529		asm volatile (
530			"pxor %4, %2, %2\n\t"			/* reset 8 accumulators (LO and HI register) to 0 */
531			"pmtlo %4\n\t"
532			"pmthi %4\n\t"
533
534			"\n__begin_loop:\t"
535
536			"lq %2,0(%0)\n\t"				/* load 8 pairs of int16 */
537			"lq %3,0(%1)\n\t"
538
539			"addi %0,%0,16\n\t"				/* vec1L += 16 */
540			"addi %1,%1,16\n\t"				/* vec2L += 16 */
541			"addi %7,%7,-1\n\t"				/* iL-- */
542
543			"pmaddh %4,%2,%3\n\t"			/* parallel multiply-add of 8 pairs of int16 */
544
545			"bgtzl %7,__begin_loop\n\t"		/* if iL > 0 goto _begin_loop */
546
547			"pmflo %2\n\t"					/* parallel add 8 accumulators , store remaining 4 accumulators in tmpL */
548			"pmfhi %3\n\t"
549			"paddw %4,%2,%3\n\t"
550			"sq %4,0(%8)\n\t"
551		: "=r" ( vec1A ), "=r" ( vec2A ), "=r" ( t1L ), "=r" ( t2L ), "=r" ( t3L )
552		: "0" ( vec1A ), "1" ( vec2A ), "r" ( iL ), "r" ( accL )
553		: "memory" );
554
555		/* add 4 parallel accumulators */
556		resultL += accL[ 0 ] + accL[ 1 ] + accL[ 2 ] + accL[ 3 ];
557	}
558
559	/* multiply-add remaining elements of input vectors */
560	for( ; jL--; ) resultL += ( int32 ) *vec1A++ * *vec2A++;
561
562	return resultL;
563}
564
565#endif
566
567/* ------------------------------------------------------------------------- */
568
569#if defined( HW_ARMv5TE )
570
571/* fast 16 dot product for ARM9E cores (DSP extensions).
572 * input vectors must be 32 bit aligned
573 */
574int32 bbs_dotProduct_arm9e( const int16* vec1A, const int16* vec2A, uint32 sizeA )
575{
576	int32 accuL = 0;
577
578	int32* v1PtrL = ( int32* )vec1A;
579	int32* v2PtrL = ( int32* )vec2A;
580
581	for( ; sizeA >= 4; sizeA -= 4 )
582	{
583		__asm {
584		    smlabb accuL, *v1PtrL, *v2PtrL, accuL;
585		    smlatt accuL, *v1PtrL, *v2PtrL, accuL;
586		}
587		v1PtrL++; v2PtrL++;
588	    __asm {
589		    smlabb accuL, *v1PtrL, *v2PtrL, accuL;
590		    smlatt accuL, *v1PtrL, *v2PtrL, accuL;
591		}
592		v1PtrL++; v2PtrL++;
593	}
594
595	vec1A = ( int16* )v1PtrL;
596	vec2A = ( int16* )v2PtrL;
597
598	/* multiply-add remaining elements of input vectors */
599	for( ; sizeA > 0; sizeA-- ) accuL += ( int32 )*vec1A++ * *vec2A++;
600
601	return accuL;
602}
603
604#endif
605
606/* ------------------------------------------------------------------------- */
607
608/**
609 * Computes a fast dot product using standard C
610 */
611int32 bbs_dotProduct_stdc( const int16* vec1A, const int16* vec2A, uint32 sizeA )
612{
613	int32 accuL = 0;
614
615	for( ; sizeA >= 8; sizeA -= 8 )
616	{
617		accuL += ( int32 ) *vec1A * *vec2A;
618		accuL += ( int32 ) *( vec1A + 1 ) * *( vec2A + 1 );
619		accuL += ( int32 ) *( vec1A + 2 ) * *( vec2A + 2 );
620		accuL += ( int32 ) *( vec1A + 3 ) * *( vec2A + 3 );
621
622		accuL += ( int32 ) *( vec1A + 4 ) * *( vec2A + 4 );
623		accuL += ( int32 ) *( vec1A + 5 ) * *( vec2A + 5 );
624		accuL += ( int32 ) *( vec1A + 6 ) * *( vec2A + 6 );
625		accuL += ( int32 ) *( vec1A + 7 ) * *( vec2A + 7 );
626
627		vec1A += 8;
628		vec2A += 8;
629	}
630
631	for( ; sizeA; sizeA-- ) accuL += ( int32 ) *vec1A++ * *vec2A++;
632
633	return accuL;
634}
635
636/* ------------------------------------------------------------------------- */
637
638int32 bbs_dotProductInt16( const int16* vec1A, const int16* vec2A, uint32 sizeA )
639{
640/* PC */
641#if ( defined( HW_i586 ) || defined( HW_i686 ) )
642
643	#if defined( HW_SSE2 )
644		uint32 size16L = sizeA & 0xfffffff0;
645		if( size16L )
646		{
647			if( ( (uint32)vec1A & 0xF ) == 0 && ( (uint32)vec2A & 0xF ) == 0 )
648			{
649				return bbs_dotProduct_128SSE2( vec1A, vec2A, sizeA );
650			}
651			else
652			{
653				return bbs_dotProduct_u128SSE2( vec1A, vec2A, sizeA );
654			}
655		}
656	#elif !defined( WIN64 )
657		/* MMX version (not supported by 64-bit compiler) */
658		uint32 size16L = sizeA & 0xfffffff0;
659		if( size16L )
660		{
661			if( sizeA == size16L )
662			{
663				return bbs_dotProduct_intelMMX16( vec1A, vec2A, size16L );
664			}
665			return bbs_dotProduct_intelMMX16( vec1A, vec2A, size16L )
666					+ bbs_dotProduct_stdc( vec1A + size16L, vec2A + size16L, sizeA - size16L );
667		} /* if( size16L ) */
668	#endif
669
670	return bbs_dotProduct_stdc( vec1A, vec2A, sizeA );
671
672/* Playstation 2 */
673#elif defined( HW_EE ) && defined( epl_LINUX )
674
675	if( ( (uint32)vec1A & 0xF ) == 0 && ( (uint32)vec2A & 0xF ) == 0 )
676	{
677		return bbs_dotProduct_EE( vec1A, vec2A, sizeA );
678	}
679	return bbs_dotProduct_stdc( vec1A, vec2A, sizeA );
680
681/* ARM9E */
682#elif defined( HW_ARMv5TE )
683
684	return bbs_dotProduct_arm9e( vec1A, vec2A, sizeA );
685
686/* TI C6000 */
687#elif defined( HW_TMS320C6x )
688
689	return bbs_dotProduct_dsp( vec1A, vec2A, sizeA );
690
691#elif defined( HW_FR71 )
692
693	uint32 size16L = sizeA & 0xfffffff0;
694	if( size16L )
695	{
696		if( sizeA == size16L )
697		{
698			return bbs_dotProduct_fr71( vec1A, vec2A, size16L );
699		}
700		return bbs_dotProduct_fr71( vec1A, vec2A, size16L )
701				+ bbs_dotProduct_stdc( vec1A + size16L, vec2A + size16L, sizeA - size16L );
702	}
703
704	return bbs_dotProduct_stdc( vec1A, vec2A, sizeA );
705
706#endif
707
708	return bbs_dotProduct_stdc( vec1A, vec2A, sizeA );
709}
710
711/* ------------------------------------------------------------------------- */
712
713/* table of fermi and slope values (result: 2.30; offset: .12)
714   referenced in b_NeuralNetEm/FastMlpNet.c, not not rename or remove */
715const uint32 bbs_fermi_tableG[] =
716{
717	45056,      8,     77824,      13,    131072,     21,    217088,     34,
718	356352,     57,    589824,     94,    974848,     155,   1609728,    255,
719	2654208,    418,   4366336,    688,   7184384,    1126,  11796480,   1834,
720	19308544,   2970,  31473664,   4748,  50921472,   7453,  81448960,   11363,
721	127991808,  16573, 195874816,  22680, 288772096,  28469, 405381120,  32102,
722	536870912,  32101, 668356608,  28469, 784965632,  22680, 877862912,  16573,
723	945745920,  11363, 992288768,  7453,  1022816256, 4748,  1042264064, 2970,
724	1054429184, 1834,  1061941248, 1126,  1066553344, 688,   1069371392, 418,
725	1071083520, 255,   1072128000, 155,   1072762880, 94,    1073147904, 57,
726	1073381376, 34,    1073520640, 21,    1073606656, 13,    1073659904, 8,
727};
728
729int32 bbs_fermi( int32 valA )
730{
731	uint32 indexL = ( ( valA >> 15 ) + 20 ) << 1;
732	uint32 offsL  = ( ( valA & 0x00007FFF ) + 4 ) >> 3;
733	if( valA <  -655360 ) return 1;
734	if( valA >=  655360 ) return 1073741824 - 1; /* ( 1 << 30 ) */
735	return ( bbs_fermi_tableG[ indexL ] + offsL * bbs_fermi_tableG[ indexL + 1 ] );
736}
737
738/* ------------------------------------------------------------------------- */
739
740void bbs_uint32ReduceToNBits( uint32* argPtrA, int32* bbpPtrA, uint32 nBitsA )
741{
742	int32 posHighestBitL = bbs_intLog2( *argPtrA ) + 1;
743	int32 shiftL = posHighestBitL - nBitsA;
744	if( shiftL > 0 )
745	{
746		( *argPtrA ) >>= shiftL;
747		( *bbpPtrA ) -= shiftL;
748	}
749}
750
751/* ------------------------------------------------------------------------- */
752
753void bbs_int32ReduceToNBits( int32* argPtrA, int32* bbpPtrA, uint32 nBitsA )
754{
755	int32 posHighestBitL = bbs_intLog2( bbs_abs( *argPtrA ) ) + 1;
756	int32 shiftL = posHighestBitL - nBitsA;
757	if( shiftL > 0 )
758	{
759		( *argPtrA ) >>= shiftL;
760		( *bbpPtrA ) -= shiftL;
761	}
762}
763
764/* ------------------------------------------------------------------------- */
765
766uint32 bbs_convertU32( uint32 srcA, int32 srcBbpA, int32 dstBbpA )
767{
768	if( dstBbpA >= srcBbpA )
769	{
770		uint32 shiftL = dstBbpA - srcBbpA;
771		if( srcA > ( ( uint32 )0xFFFFFFFF >> shiftL ) )
772		{
773			/* overflow */
774			return ( uint32 )0xFFFFFFFF;
775		}
776		else
777		{
778			return srcA << shiftL;
779		}
780	}
781	else
782	{
783		uint32 shiftL = srcBbpA - dstBbpA;
784		uint32 addL = 1L << ( shiftL - 1 );
785		if( srcA + addL < addL )
786		{
787			/* rounding would cause overflow */
788			return srcA >> shiftL;
789		}
790		else
791		{
792			return ( srcA + addL ) >> shiftL;
793		}
794	}
795}
796
797/* ------------------------------------------------------------------------- */
798
799int32 bbs_convertS32( int32 srcA, int32 srcBbpA, int32 dstBbpA )
800{
801	if( dstBbpA >= srcBbpA )
802	{
803		uint32 shiftL = ( uint32 )( dstBbpA - srcBbpA );
804		if( srcA > ( ( int32 )0x7FFFFFFF >> shiftL ) )
805		{
806			/* overflow */
807			return ( uint32 )0x7FFFFFFF;
808		}
809		else if( srcA < ( ( int32 )0x80000000 >> shiftL ) )
810		{
811			/* underflow */
812			return ( int32 )0x80000000;
813		}
814		else
815		{
816			return srcA << shiftL;
817		}
818	}
819	else
820	{
821		uint32 shiftL = ( uint32 )( srcBbpA - dstBbpA );
822		int32 addL = 1L << ( shiftL - 1 );
823		if( srcA + addL < addL )
824		{
825			/* rounding would cause overflow */
826			return srcA >> shiftL;
827		}
828		else
829		{
830			return ( srcA + addL ) >> shiftL;
831		}
832	}
833}
834
835/* ------------------------------------------------------------------------- */
836
837int32 bbs_vecPowerFlt16( const int16 *xA, int16 nxA )
838{
839/*	#if defined( HW_TMS320C5x )
840		uint32 rL;
841		power( ( int16* ) xA, ( int32* ) &rL, nxA );  // does not work properly in DSPLib version 2.20.02
842		return ( rL >> 1 );
843	#else*/
844		/* needs to be optimized */
845		int32 rL = 0;
846		for( ; nxA--; )
847		{
848			rL += ( int32 ) *xA * *xA;
849			xA++;
850		}
851		return rL;
852/*	#endif */
853}
854
855/* ------------------------------------------------------------------------- */
856
857void bbs_mulU32( uint32 v1A, uint32 v2A, uint32* manPtrA, int32* expPtrA )
858{
859	uint32 log1L = bbs_intLog2( v1A );
860	uint32 log2L = bbs_intLog2( v2A );
861
862	if( log1L + log2L < 32 )
863	{
864		*manPtrA = v1A * v2A;
865		*expPtrA = 0;
866	}
867	else
868	{
869		uint32 v1L = v1A;
870		uint32 v2L = v2A;
871		uint32 exp1L = 0;
872		uint32 exp2L = 0;
873		if( log1L > 15 && log2L > 15 )
874		{
875			exp1L = log1L - 15;
876			exp2L = log2L - 15;
877			v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1;
878			v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1;
879		}
880		else if( log1L > 15 )
881		{
882			exp1L = log1L + log2L - 31;
883			v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1;
884		}
885		else
886		{
887			exp2L = log1L + log2L - 31;
888			v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1;
889		}
890
891		*manPtrA = v1L * v2L;
892		*expPtrA = exp1L + exp2L;
893	}
894}
895
896/* ------------------------------------------------------------------------- */
897
898void bbs_mulS32( int32 v1A, int32 v2A, int32* manPtrA, int32* expPtrA )
899{
900	uint32 log1L = bbs_intLog2( v1A > 0 ? v1A : -v1A );
901	uint32 log2L = bbs_intLog2( v2A > 0 ? v2A : -v2A );
902
903	if( log1L + log2L < 30 )
904	{
905		*manPtrA = v1A * v2A;
906		*expPtrA = 0;
907	}
908	else
909	{
910		int32 v1L = v1A;
911		int32 v2L = v2A;
912		int32 exp1L = 0;
913		int32 exp2L = 0;
914		if( log1L > 14 && log2L > 14 )
915		{
916			exp1L = log1L - 14;
917			exp2L = log2L - 14;
918			v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1;
919			v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1;
920		}
921		else if( log1L > 14 )
922		{
923			exp1L = log1L + log2L - 29;
924			v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1;
925		}
926		else
927		{
928			exp2L = log1L + log2L - 29;
929			v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1;
930		}
931
932		*manPtrA = v1L * v2L;
933		*expPtrA = exp1L + exp2L;
934	}
935}
936
937/* ------------------------------------------------------------------------- */
938
939void bbs_vecSqrNorm32( const int32* vecA, uint32 sizeA, uint32* manPtrA, uint32* expPtrA )
940{
941	uint32 sumL = 0;
942	int32 sumExpL = 0;
943
944	uint32 iL;
945	for( iL = 0; iL < sizeA; iL++ )
946	{
947		int32 vL = vecA[ iL ];
948		int32 logL = bbs_intLog2( vL > 0 ? vL : -vL );
949		int32 expL = ( logL > 14 ) ? logL - 14 : 0;
950		uint32 prdL;
951
952		if( expL >= 1 )
953		{
954			vL = ( ( vL >> ( expL - 1 ) ) + 1 ) >> 1;
955		}
956		else
957		{
958			vL = vL >> expL;
959		}
960
961		prdL = vL * vL;
962		expL <<= 1; /* now exponent of product */
963
964		if( sumExpL > expL )
965		{
966			uint32 shrL = sumExpL - expL;
967			prdL = ( ( prdL >> ( shrL - 1 ) ) + 1 ) >> 1;
968		}
969		else if( expL > sumExpL )
970		{
971			uint32 shrL = expL - sumExpL;
972			sumL = ( ( sumL >> ( shrL - 1 ) ) + 1 ) >> 1;
973			sumExpL += shrL;
974		}
975
976		sumL += prdL;
977
978		if( sumL > 0x80000000 )
979		{
980			sumL = ( sumL + 1 ) >> 1;
981			sumExpL++;
982		}
983	}
984
985	/* make exponent even */
986	if( ( sumExpL & 1 ) != 0 )
987	{
988		sumL = ( sumL + 1 ) >> 1;
989		sumExpL++;
990	}
991
992	if( manPtrA != NULL ) *manPtrA = sumL;
993	if( expPtrA != NULL ) *expPtrA = sumExpL;
994}
995
996/* ------------------------------------------------------------------------- */
997
998void bbs_vecSqrNorm16( const int16* vecA, uint32 sizeA, uint32* manPtrA, uint32* expPtrA )
999{
1000	uint32 sumL = 0;
1001	int32 sumExpL = 0;
1002
1003	uint32 iL;
1004	for( iL = 0; iL < sizeA; iL++ )
1005	{
1006		int32 vL = vecA[ iL ];
1007		uint32 prdL = vL * vL;
1008
1009		if( sumExpL > 0 )
1010		{
1011			uint32 shrL = sumExpL;
1012			prdL = ( ( prdL >> ( shrL - 1 ) ) + 1 ) >> 1;
1013		}
1014
1015		sumL += prdL;
1016
1017		if( sumL > 0x80000000 )
1018		{
1019			sumL = ( sumL + 1 ) >> 1;
1020			sumExpL++;
1021		}
1022	}
1023
1024	/* make exponent even */
1025	if( ( sumExpL & 1 ) != 0 )
1026	{
1027		sumL = ( sumL + 1 ) >> 1;
1028		sumExpL++;
1029	}
1030
1031	if( manPtrA != NULL ) *manPtrA = sumL;
1032	if( expPtrA != NULL ) *expPtrA = sumExpL;
1033}
1034
1035/* ------------------------------------------------------------------------- */
1036
1037uint32 bbs_vecNorm16( const int16* vecA, uint32 sizeA )
1038{
1039	uint32 manL;
1040	uint32 expL;
1041	bbs_vecSqrNorm16( vecA, sizeA, &manL, &expL );
1042	manL = bbs_sqrt32( manL );
1043	return manL << ( expL >> 1 );
1044}
1045
1046/* ------------------------------------------------------------------------- */
1047
1048void bbs_matMultiplyFlt16( const int16 *x1A, int16 row1A, int16 col1A, const int16 *x2A, int16 col2A, int16 *rA )
1049{
1050	#if defined( HW_TMS320C5x )
1051		/* operands need to be in internal memory for mmul*/
1052		if( x1A > ( int16* ) bbs_C5X_INTERNAL_MEMORY_SIZE ||
1053			x2A > ( int16* ) bbs_C5X_INTERNAL_MEMORY_SIZE )
1054		{
1055			int16 iL,jL,kL;
1056			int16 *ptr1L, *ptr2L;
1057			int32 sumL;
1058
1059			for( iL = 0; iL < row1A; iL++ )
1060			{
1061				for( jL = 0; jL < col2A; jL++ )
1062				{
1063					ptr1L = ( int16* ) x1A + iL * col1A;
1064					ptr2L = ( int16* ) x2A + jL;
1065					sumL = 0;
1066					for( kL = 0; kL < col1A; kL++ )
1067					{
1068						sumL += ( ( int32 ) *ptr1L++ * *ptr2L );
1069						ptr2L += col2A;
1070					}
1071					*rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */
1072				}
1073			}
1074		}
1075		else mmul( ( int16* ) x1A, row1A, col1A, ( int16* ) x2A, col1A, col2A, rA );
1076
1077	#elif defined( HW_ARMv4 ) || defined( HW_ARMv5TE )
1078
1079		int32 iL, jL, kL;
1080		int16 *ptr1L, *ptr2L;
1081		int32 sumL;
1082		for( iL = 0; iL < row1A; iL++ )
1083		{
1084			for( jL = 0; jL < col2A; jL++ )
1085			{
1086				ptr1L = ( int16* ) x1A + iL * col1A;
1087				ptr2L = ( int16* ) x2A + jL;
1088				sumL = 0;
1089				for( kL = col1A; kL >= 4; kL -= 4 )
1090				{
1091					sumL += ( ( int32 ) *ptr1L++ * *ptr2L );
1092					sumL += ( ( int32 ) *ptr1L++ * *( ptr2L += col2A ) );
1093					sumL += ( ( int32 ) *ptr1L++ * *( ptr2L += col2A ) );
1094					sumL += ( ( int32 ) *ptr1L++ * *( ptr2L += col2A ) );
1095					ptr2L += col2A;
1096				}
1097				for( ; kL > 0; kL-- )
1098				{
1099					sumL += ( ( int32 ) *ptr1L++ * *ptr2L );
1100					ptr2L += col2A;
1101				}
1102				*rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */
1103			}
1104		}
1105	#else
1106		/* needs to be optimized */
1107		int16 iL,jL,kL;
1108		int16 *ptr1L, *ptr2L;
1109		int32 sumL;
1110
1111		for( iL = 0; iL < row1A; iL++ )
1112		{
1113			for( jL = 0; jL < col2A; jL++ )
1114			{
1115				ptr1L = ( int16* ) x1A + iL * col1A;
1116				ptr2L = ( int16* ) x2A + jL;
1117				sumL = 0;
1118				for( kL = 0; kL < col1A; kL++ )
1119				{
1120					sumL += ( ( int32 ) *ptr1L++ * *ptr2L );
1121					ptr2L += col2A;
1122				}
1123				*rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */
1124			}
1125		}
1126	#endif
1127}
1128
1129/* ------------------------------------------------------------------------- */
1130
1131void bbs_matMultiplyTranspFlt16( const int16 *x1A, int16 row1A, int16 col1A,
1132								 const int16 *x2A, int16 col2A, int16 *rA )
1133{
1134	const int16* ptr1L = x1A;
1135
1136	int32 iL;
1137	for( iL = row1A; iL > 0 ; iL-- )
1138	{
1139		int32 jL;
1140		const int16* ptr2L = x2A;
1141		for( jL = col2A; jL > 0 ; jL-- )
1142		{
1143			int32 kL;
1144			int32 sumL = 0;
1145			for( kL = col1A >> 2; kL > 0; kL-- )
1146			{
1147				sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ );
1148				sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ );
1149				sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ );
1150				sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ );
1151			}
1152			for( kL = col1A & 3; kL > 0; kL-- )
1153			{
1154				sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ );
1155			}
1156			*rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */
1157			ptr1L -= col1A;
1158		}
1159		ptr1L += col1A;
1160	}
1161}
1162
1163/* ------------------------------------------------------------------------- */
1164
1165
1166#ifndef mtrans
1167uint16 bbs_matTrans( int16 *xA, int16 rowA, int16 colA, int16 *rA )
1168{
1169	/* needs to be optimized */
1170	int16 iL;
1171	for( iL = colA; iL--; )
1172	{
1173		int16* sL = xA++;
1174		int16 jL;
1175		for( jL = rowA; jL--; )
1176		{
1177			*rA++ = *sL;
1178			sL += colA;
1179		}
1180	}
1181	return 0;
1182}
1183#endif
1184
1185/* ------------------------------------------------------------------------- */
1186#ifndef atan2_16
1187int16 bbs_atan2( int16 nomA, int16 denomA )
1188{
1189	int16 phL, argL;
1190
1191	if( nomA == denomA ) return 8192;
1192	argL = ( ( int32 ) nomA << 15 ) / denomA;
1193
1194	/* 0.318253*2 x      20857 .15
1195	  +0.003314*2 x^2      217 .15
1196	  -0.130908*2 x^3    -8580 .15
1197	  +0.068542*2 x^4     4491 .15
1198	  -0.009159*2 x^5     -600 .15 */
1199
1200	phL = -600;
1201	phL = ( ( ( int32 ) phL * argL ) >> 15 ) + 4481;
1202	phL = ( ( ( int32 ) phL * argL ) >> 15 ) - 8580;
1203	phL = ( ( ( int32 ) phL * argL ) >> 15 ) + 217;
1204	phL = ( ( ( int32 ) phL * argL ) >> 15 ) + 20857;
1205	phL = ( ( int32 ) phL * argL ) >> 15;
1206
1207	return phL >> 1; /* /2 */
1208}
1209
1210/* needs to be optimized */
1211uint16 bbs_vecPhase( int16 *reA, int16 *imA, int16 *phaseA, uint16 sizeA )
1212{
1213	for( ; sizeA--; )
1214	{
1215		int16 reL = *reA++;
1216		int16 imL = *imA++;
1217		int16 phL = 0;
1218
1219		if( reL < 0 )
1220		{
1221			reL = -reL;
1222			if( imL < 0 )
1223			{
1224				imL = -imL;
1225				if( reL > imL )
1226				{
1227					phL = -32768 + bbs_atan2( imL, reL );
1228				}
1229				else
1230				{
1231					phL = -16384 - bbs_atan2( reL, imL );
1232				}
1233			}
1234			else
1235			{
1236				if( reL > imL )
1237				{
1238					phL = -( -32768 + bbs_atan2( imL, reL ) );
1239				}
1240				else
1241				{
1242					if( imL == 0 ) phL = 0;
1243					else phL = 16384 + bbs_atan2( reL, imL );
1244				}
1245			}
1246		}
1247		else
1248		{
1249			if( imL < 0 )
1250			{
1251				imL = -imL;
1252				if( reL > imL )
1253				{
1254					phL = -bbs_atan2( imL, reL );
1255				}
1256				else
1257				{
1258					phL = -16384 + bbs_atan2( reL, imL );
1259				}
1260			}
1261			else
1262			{
1263				if( reL > imL )
1264				{
1265					phL = bbs_atan2( imL, reL );
1266				}
1267				else
1268				{
1269					if( imL == 0 ) phL = 0;
1270					else phL = 16384 - bbs_atan2( reL, imL );
1271				}
1272			}
1273		}
1274
1275		*phaseA++ = phL;
1276	}
1277	return 0;
1278}
1279
1280#endif
1281
1282/* ------------------------------------------------------------------------- */
1283