1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
22 //
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
26 //
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42
43 #include "precomp.hpp"
44
45 #undef HAVE_IPP
46
47 namespace cv { namespace hal {
48
49 ///////////////////////////////////// ATAN2 ////////////////////////////////////
50 static const float atan2_p1 = 0.9997878412794807f*(float)(180/CV_PI);
51 static const float atan2_p3 = -0.3258083974640975f*(float)(180/CV_PI);
52 static const float atan2_p5 = 0.1555786518463281f*(float)(180/CV_PI);
53 static const float atan2_p7 = -0.04432655554792128f*(float)(180/CV_PI);
54
55 #if CV_NEON
cv_vrecpq_f32(float32x4_t val)56 static inline float32x4_t cv_vrecpq_f32(float32x4_t val)
57 {
58 float32x4_t reciprocal = vrecpeq_f32(val);
59 reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
60 reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
61 return reciprocal;
62 }
63 #endif
64
fastAtan2(const float * Y,const float * X,float * angle,int len,bool angleInDegrees)65 void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )
66 {
67 int i = 0;
68 float scale = angleInDegrees ? 1 : (float)(CV_PI/180);
69
70 #ifdef HAVE_TEGRA_OPTIMIZATION
71 if (tegra::useTegra() && tegra::FastAtan2_32f(Y, X, angle, len, scale))
72 return;
73 #endif
74
75 #if CV_SSE2
76 Cv32suf iabsmask; iabsmask.i = 0x7fffffff;
77 __m128 eps = _mm_set1_ps((float)DBL_EPSILON), absmask = _mm_set1_ps(iabsmask.f);
78 __m128 _90 = _mm_set1_ps(90.f), _180 = _mm_set1_ps(180.f), _360 = _mm_set1_ps(360.f);
79 __m128 z = _mm_setzero_ps(), scale4 = _mm_set1_ps(scale);
80 __m128 p1 = _mm_set1_ps(atan2_p1), p3 = _mm_set1_ps(atan2_p3);
81 __m128 p5 = _mm_set1_ps(atan2_p5), p7 = _mm_set1_ps(atan2_p7);
82
83 for( ; i <= len - 4; i += 4 )
84 {
85 __m128 x = _mm_loadu_ps(X + i), y = _mm_loadu_ps(Y + i);
86 __m128 ax = _mm_and_ps(x, absmask), ay = _mm_and_ps(y, absmask);
87 __m128 mask = _mm_cmplt_ps(ax, ay);
88 __m128 tmin = _mm_min_ps(ax, ay), tmax = _mm_max_ps(ax, ay);
89 __m128 c = _mm_div_ps(tmin, _mm_add_ps(tmax, eps));
90 __m128 c2 = _mm_mul_ps(c, c);
91 __m128 a = _mm_mul_ps(c2, p7);
92 a = _mm_mul_ps(_mm_add_ps(a, p5), c2);
93 a = _mm_mul_ps(_mm_add_ps(a, p3), c2);
94 a = _mm_mul_ps(_mm_add_ps(a, p1), c);
95
96 __m128 b = _mm_sub_ps(_90, a);
97 a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask));
98
99 b = _mm_sub_ps(_180, a);
100 mask = _mm_cmplt_ps(x, z);
101 a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask));
102
103 b = _mm_sub_ps(_360, a);
104 mask = _mm_cmplt_ps(y, z);
105 a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask));
106
107 a = _mm_mul_ps(a, scale4);
108 _mm_storeu_ps(angle + i, a);
109 }
110 #elif CV_NEON
111 float32x4_t eps = vdupq_n_f32((float)DBL_EPSILON);
112 float32x4_t _90 = vdupq_n_f32(90.f), _180 = vdupq_n_f32(180.f), _360 = vdupq_n_f32(360.f);
113 float32x4_t z = vdupq_n_f32(0.0f), scale4 = vdupq_n_f32(scale);
114 float32x4_t p1 = vdupq_n_f32(atan2_p1), p3 = vdupq_n_f32(atan2_p3);
115 float32x4_t p5 = vdupq_n_f32(atan2_p5), p7 = vdupq_n_f32(atan2_p7);
116
117 for( ; i <= len - 4; i += 4 )
118 {
119 float32x4_t x = vld1q_f32(X + i), y = vld1q_f32(Y + i);
120 float32x4_t ax = vabsq_f32(x), ay = vabsq_f32(y);
121 float32x4_t tmin = vminq_f32(ax, ay), tmax = vmaxq_f32(ax, ay);
122 float32x4_t c = vmulq_f32(tmin, cv_vrecpq_f32(vaddq_f32(tmax, eps)));
123 float32x4_t c2 = vmulq_f32(c, c);
124 float32x4_t a = vmulq_f32(c2, p7);
125 a = vmulq_f32(vaddq_f32(a, p5), c2);
126 a = vmulq_f32(vaddq_f32(a, p3), c2);
127 a = vmulq_f32(vaddq_f32(a, p1), c);
128
129 a = vbslq_f32(vcgeq_f32(ax, ay), a, vsubq_f32(_90, a));
130 a = vbslq_f32(vcltq_f32(x, z), vsubq_f32(_180, a), a);
131 a = vbslq_f32(vcltq_f32(y, z), vsubq_f32(_360, a), a);
132
133 vst1q_f32(angle + i, vmulq_f32(a, scale4));
134 }
135 #endif
136
137 for( ; i < len; i++ )
138 {
139 float x = X[i], y = Y[i];
140 float ax = std::abs(x), ay = std::abs(y);
141 float a, c, c2;
142 if( ax >= ay )
143 {
144 c = ay/(ax + (float)DBL_EPSILON);
145 c2 = c*c;
146 a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
147 }
148 else
149 {
150 c = ax/(ay + (float)DBL_EPSILON);
151 c2 = c*c;
152 a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
153 }
154 if( x < 0 )
155 a = 180.f - a;
156 if( y < 0 )
157 a = 360.f - a;
158 angle[i] = (float)(a*scale);
159 }
160 }
161
162
magnitude(const float * x,const float * y,float * mag,int len)163 void magnitude(const float* x, const float* y, float* mag, int len)
164 {
165 #if defined HAVE_IPP
166 CV_IPP_CHECK()
167 {
168 IppStatus status = ippsMagnitude_32f(x, y, mag, len);
169 if (status >= 0)
170 {
171 CV_IMPL_ADD(CV_IMPL_IPP);
172 return;
173 }
174 setIppErrorStatus();
175 }
176 #endif
177
178 int i = 0;
179
180 #if CV_SIMD128
181 for( ; i <= len - 8; i += 8 )
182 {
183 v_float32x4 x0 = v_load(x + i), x1 = v_load(x + i + 4);
184 v_float32x4 y0 = v_load(y + i), y1 = v_load(y + i + 4);
185 x0 = v_sqrt(v_muladd(x0, x0, y0*y0));
186 x1 = v_sqrt(v_muladd(x1, x1, y1*y1));
187 v_store(mag + i, x0);
188 v_store(mag + i + 4, x1);
189 }
190 #endif
191
192 for( ; i < len; i++ )
193 {
194 float x0 = x[i], y0 = y[i];
195 mag[i] = std::sqrt(x0*x0 + y0*y0);
196 }
197 }
198
magnitude(const double * x,const double * y,double * mag,int len)199 void magnitude(const double* x, const double* y, double* mag, int len)
200 {
201 #if defined(HAVE_IPP)
202 CV_IPP_CHECK()
203 {
204 IppStatus status = ippsMagnitude_64f(x, y, mag, len);
205 if (status >= 0)
206 {
207 CV_IMPL_ADD(CV_IMPL_IPP);
208 return;
209 }
210 setIppErrorStatus();
211 }
212 #endif
213
214 int i = 0;
215
216 #if CV_SIMD128_64F
217 for( ; i <= len - 4; i += 4 )
218 {
219 v_float64x2 x0 = v_load(x + i), x1 = v_load(x + i + 2);
220 v_float64x2 y0 = v_load(y + i), y1 = v_load(y + i + 2);
221 x0 = v_sqrt(v_muladd(x0, x0, y0*y0));
222 x1 = v_sqrt(v_muladd(x1, x1, y1*y1));
223 v_store(mag + i, x0);
224 v_store(mag + i + 2, x1);
225 }
226 #endif
227
228 for( ; i < len; i++ )
229 {
230 double x0 = x[i], y0 = y[i];
231 mag[i] = std::sqrt(x0*x0 + y0*y0);
232 }
233 }
234
235
invSqrt(const float * src,float * dst,int len)236 void invSqrt(const float* src, float* dst, int len)
237 {
238 #if defined(HAVE_IPP)
239 CV_IPP_CHECK()
240 {
241 if (ippsInvSqrt_32f_A21(src, dst, len) >= 0)
242 {
243 CV_IMPL_ADD(CV_IMPL_IPP);
244 return;
245 }
246 setIppErrorStatus();
247 }
248 #endif
249
250 int i = 0;
251
252 #if CV_SIMD128
253 for( ; i <= len - 8; i += 8 )
254 {
255 v_float32x4 t0 = v_load(src + i), t1 = v_load(src + i + 4);
256 t0 = v_invsqrt(t0);
257 t1 = v_invsqrt(t1);
258 v_store(dst + i, t0); v_store(dst + i + 4, t1);
259 }
260 #endif
261
262 for( ; i < len; i++ )
263 dst[i] = 1/std::sqrt(src[i]);
264 }
265
266
invSqrt(const double * src,double * dst,int len)267 void invSqrt(const double* src, double* dst, int len)
268 {
269 int i = 0;
270
271 #if CV_SSE2
272 __m128d v_1 = _mm_set1_pd(1.0);
273 for ( ; i <= len - 2; i += 2)
274 _mm_storeu_pd(dst + i, _mm_div_pd(v_1, _mm_sqrt_pd(_mm_loadu_pd(src + i))));
275 #endif
276
277 for( ; i < len; i++ )
278 dst[i] = 1/std::sqrt(src[i]);
279 }
280
281
sqrt(const float * src,float * dst,int len)282 void sqrt(const float* src, float* dst, int len)
283 {
284 #if defined(HAVE_IPP)
285 CV_IPP_CHECK()
286 {
287 if (ippsSqrt_32f_A21(src, dst, len) >= 0)
288 {
289 CV_IMPL_ADD(CV_IMPL_IPP);
290 return;
291 }
292 setIppErrorStatus();
293 }
294 #endif
295
296 int i = 0;
297
298 #if CV_SIMD128
299 for( ; i <= len - 8; i += 8 )
300 {
301 v_float32x4 t0 = v_load(src + i), t1 = v_load(src + i + 4);
302 t0 = v_sqrt(t0);
303 t1 = v_sqrt(t1);
304 v_store(dst + i, t0); v_store(dst + i + 4, t1);
305 }
306 #endif
307
308 for( ; i < len; i++ )
309 dst[i] = std::sqrt(src[i]);
310 }
311
312
sqrt(const double * src,double * dst,int len)313 void sqrt(const double* src, double* dst, int len)
314 {
315 #if defined(HAVE_IPP)
316 CV_IPP_CHECK()
317 {
318 if (ippsSqrt_64f_A50(src, dst, len) >= 0)
319 {
320 CV_IMPL_ADD(CV_IMPL_IPP);
321 return;
322 }
323 setIppErrorStatus();
324 }
325 #endif
326
327 int i = 0;
328
329 #if CV_SIMD128_64F
330 for( ; i <= len - 4; i += 4 )
331 {
332 v_float64x2 t0 = v_load(src + i), t1 = v_load(src + i + 2);
333 t0 = v_sqrt(t0);
334 t1 = v_sqrt(t1);
335 v_store(dst + i, t0); v_store(dst + i + 2, t1);
336 }
337 #endif
338
339 for( ; i < len; i++ )
340 dst[i] = std::sqrt(src[i]);
341 }
342
343 ////////////////////////////////////// EXP /////////////////////////////////////
344
345 typedef union
346 {
347 struct {
348 #if ( defined( WORDS_BIGENDIAN ) && !defined( OPENCV_UNIVERSAL_BUILD ) ) || defined( __BIG_ENDIAN__ )
349 int hi;
350 int lo;
351 #else
352 int lo;
353 int hi;
354 #endif
355 } i;
356 double d;
357 }
358 DBLINT;
359
360 #define EXPTAB_SCALE 6
361 #define EXPTAB_MASK ((1 << EXPTAB_SCALE) - 1)
362
363 #define EXPPOLY_32F_A0 .9670371139572337719125840413672004409288e-2
364
365 static const double expTab[] = {
366 1.0 * EXPPOLY_32F_A0,
367 1.0108892860517004600204097905619 * EXPPOLY_32F_A0,
368 1.0218971486541166782344801347833 * EXPPOLY_32F_A0,
369 1.0330248790212284225001082839705 * EXPPOLY_32F_A0,
370 1.0442737824274138403219664787399 * EXPPOLY_32F_A0,
371 1.0556451783605571588083413251529 * EXPPOLY_32F_A0,
372 1.0671404006768236181695211209928 * EXPPOLY_32F_A0,
373 1.0787607977571197937406800374385 * EXPPOLY_32F_A0,
374 1.0905077326652576592070106557607 * EXPPOLY_32F_A0,
375 1.1023825833078409435564142094256 * EXPPOLY_32F_A0,
376 1.1143867425958925363088129569196 * EXPPOLY_32F_A0,
377 1.126521618608241899794798643787 * EXPPOLY_32F_A0,
378 1.1387886347566916537038302838415 * EXPPOLY_32F_A0,
379 1.151189229952982705817759635202 * EXPPOLY_32F_A0,
380 1.1637248587775775138135735990922 * EXPPOLY_32F_A0,
381 1.1763969916502812762846457284838 * EXPPOLY_32F_A0,
382 1.1892071150027210667174999705605 * EXPPOLY_32F_A0,
383 1.2021567314527031420963969574978 * EXPPOLY_32F_A0,
384 1.2152473599804688781165202513388 * EXPPOLY_32F_A0,
385 1.2284805361068700056940089577928 * EXPPOLY_32F_A0,
386 1.2418578120734840485936774687266 * EXPPOLY_32F_A0,
387 1.2553807570246910895793906574423 * EXPPOLY_32F_A0,
388 1.2690509571917332225544190810323 * EXPPOLY_32F_A0,
389 1.2828700160787782807266697810215 * EXPPOLY_32F_A0,
390 1.2968395546510096659337541177925 * EXPPOLY_32F_A0,
391 1.3109612115247643419229917863308 * EXPPOLY_32F_A0,
392 1.3252366431597412946295370954987 * EXPPOLY_32F_A0,
393 1.3396675240533030053600306697244 * EXPPOLY_32F_A0,
394 1.3542555469368927282980147401407 * EXPPOLY_32F_A0,
395 1.3690024229745906119296011329822 * EXPPOLY_32F_A0,
396 1.3839098819638319548726595272652 * EXPPOLY_32F_A0,
397 1.3989796725383111402095281367152 * EXPPOLY_32F_A0,
398 1.4142135623730950488016887242097 * EXPPOLY_32F_A0,
399 1.4296133383919700112350657782751 * EXPPOLY_32F_A0,
400 1.4451808069770466200370062414717 * EXPPOLY_32F_A0,
401 1.4609177941806469886513028903106 * EXPPOLY_32F_A0,
402 1.476826145939499311386907480374 * EXPPOLY_32F_A0,
403 1.4929077282912648492006435314867 * EXPPOLY_32F_A0,
404 1.5091644275934227397660195510332 * EXPPOLY_32F_A0,
405 1.5255981507445383068512536895169 * EXPPOLY_32F_A0,
406 1.5422108254079408236122918620907 * EXPPOLY_32F_A0,
407 1.5590044002378369670337280894749 * EXPPOLY_32F_A0,
408 1.5759808451078864864552701601819 * EXPPOLY_32F_A0,
409 1.5931421513422668979372486431191 * EXPPOLY_32F_A0,
410 1.6104903319492543081795206673574 * EXPPOLY_32F_A0,
411 1.628027421857347766848218522014 * EXPPOLY_32F_A0,
412 1.6457554781539648445187567247258 * EXPPOLY_32F_A0,
413 1.6636765803267364350463364569764 * EXPPOLY_32F_A0,
414 1.6817928305074290860622509524664 * EXPPOLY_32F_A0,
415 1.7001063537185234695013625734975 * EXPPOLY_32F_A0,
416 1.7186192981224779156293443764563 * EXPPOLY_32F_A0,
417 1.7373338352737062489942020818722 * EXPPOLY_32F_A0,
418 1.7562521603732994831121606193753 * EXPPOLY_32F_A0,
419 1.7753764925265212525505592001993 * EXPPOLY_32F_A0,
420 1.7947090750031071864277032421278 * EXPPOLY_32F_A0,
421 1.8142521755003987562498346003623 * EXPPOLY_32F_A0,
422 1.8340080864093424634870831895883 * EXPPOLY_32F_A0,
423 1.8539791250833855683924530703377 * EXPPOLY_32F_A0,
424 1.8741676341102999013299989499544 * EXPPOLY_32F_A0,
425 1.8945759815869656413402186534269 * EXPPOLY_32F_A0,
426 1.9152065613971472938726112702958 * EXPPOLY_32F_A0,
427 1.9360617934922944505980559045667 * EXPPOLY_32F_A0,
428 1.9571441241754002690183222516269 * EXPPOLY_32F_A0,
429 1.9784560263879509682582499181312 * EXPPOLY_32F_A0,
430 };
431
432
433 // the code below uses _mm_cast* intrinsics, which are not avialable on VS2005
434 #if (defined _MSC_VER && _MSC_VER < 1500) || \
435 (!defined __APPLE__ && defined __GNUC__ && __GNUC__*100 + __GNUC_MINOR__ < 402)
436 #undef CV_SSE2
437 #define CV_SSE2 0
438 #endif
439
440 static const double exp_prescale = 1.4426950408889634073599246810019 * (1 << EXPTAB_SCALE);
441 static const double exp_postscale = 1./(1 << EXPTAB_SCALE);
442 static const double exp_max_val = 3000.*(1 << EXPTAB_SCALE); // log10(DBL_MAX) < 3000
443
exp(const float * _x,float * y,int n)444 void exp( const float *_x, float *y, int n )
445 {
446 static const float
447 A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0),
448 A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0),
449 A2 = (float)(.2402265109513301490103372422686535526573 / EXPPOLY_32F_A0),
450 A1 = (float)(.5550339366753125211915322047004666939128e-1 / EXPPOLY_32F_A0);
451
452 #undef EXPPOLY
453 #define EXPPOLY(x) \
454 (((((x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)
455
456 int i = 0;
457 const Cv32suf* x = (const Cv32suf*)_x;
458 Cv32suf buf[4];
459
460 #if CV_SSE2
461 if( n >= 8 )
462 {
463 static const __m128d prescale2 = _mm_set1_pd(exp_prescale);
464 static const __m128 postscale4 = _mm_set1_ps((float)exp_postscale);
465 static const __m128 maxval4 = _mm_set1_ps((float)(exp_max_val/exp_prescale));
466 static const __m128 minval4 = _mm_set1_ps((float)(-exp_max_val/exp_prescale));
467
468 static const __m128 mA1 = _mm_set1_ps(A1);
469 static const __m128 mA2 = _mm_set1_ps(A2);
470 static const __m128 mA3 = _mm_set1_ps(A3);
471 static const __m128 mA4 = _mm_set1_ps(A4);
472 bool y_aligned = (size_t)(void*)y % 16 == 0;
473
474 ushort CV_DECL_ALIGNED(16) tab_idx[8];
475
476 for( ; i <= n - 8; i += 8 )
477 {
478 __m128 xf0, xf1;
479 xf0 = _mm_loadu_ps(&x[i].f);
480 xf1 = _mm_loadu_ps(&x[i+4].f);
481 __m128i xi0, xi1, xi2, xi3;
482
483 xf0 = _mm_min_ps(_mm_max_ps(xf0, minval4), maxval4);
484 xf1 = _mm_min_ps(_mm_max_ps(xf1, minval4), maxval4);
485
486 __m128d xd0 = _mm_cvtps_pd(xf0);
487 __m128d xd2 = _mm_cvtps_pd(_mm_movehl_ps(xf0, xf0));
488 __m128d xd1 = _mm_cvtps_pd(xf1);
489 __m128d xd3 = _mm_cvtps_pd(_mm_movehl_ps(xf1, xf1));
490
491 xd0 = _mm_mul_pd(xd0, prescale2);
492 xd2 = _mm_mul_pd(xd2, prescale2);
493 xd1 = _mm_mul_pd(xd1, prescale2);
494 xd3 = _mm_mul_pd(xd3, prescale2);
495
496 xi0 = _mm_cvtpd_epi32(xd0);
497 xi2 = _mm_cvtpd_epi32(xd2);
498
499 xi1 = _mm_cvtpd_epi32(xd1);
500 xi3 = _mm_cvtpd_epi32(xd3);
501
502 xd0 = _mm_sub_pd(xd0, _mm_cvtepi32_pd(xi0));
503 xd2 = _mm_sub_pd(xd2, _mm_cvtepi32_pd(xi2));
504 xd1 = _mm_sub_pd(xd1, _mm_cvtepi32_pd(xi1));
505 xd3 = _mm_sub_pd(xd3, _mm_cvtepi32_pd(xi3));
506
507 xf0 = _mm_movelh_ps(_mm_cvtpd_ps(xd0), _mm_cvtpd_ps(xd2));
508 xf1 = _mm_movelh_ps(_mm_cvtpd_ps(xd1), _mm_cvtpd_ps(xd3));
509
510 xf0 = _mm_mul_ps(xf0, postscale4);
511 xf1 = _mm_mul_ps(xf1, postscale4);
512
513 xi0 = _mm_unpacklo_epi64(xi0, xi2);
514 xi1 = _mm_unpacklo_epi64(xi1, xi3);
515 xi0 = _mm_packs_epi32(xi0, xi1);
516
517 _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi16(EXPTAB_MASK)));
518
519 xi0 = _mm_add_epi16(_mm_srai_epi16(xi0, EXPTAB_SCALE), _mm_set1_epi16(127));
520 xi0 = _mm_max_epi16(xi0, _mm_setzero_si128());
521 xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(255));
522 xi1 = _mm_unpackhi_epi16(xi0, _mm_setzero_si128());
523 xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128());
524
525 __m128d yd0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1]));
526 __m128d yd1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3]));
527 __m128d yd2 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[4]), _mm_load_sd(expTab + tab_idx[5]));
528 __m128d yd3 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[6]), _mm_load_sd(expTab + tab_idx[7]));
529
530 __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1));
531 __m128 yf1 = _mm_movelh_ps(_mm_cvtpd_ps(yd2), _mm_cvtpd_ps(yd3));
532
533 yf0 = _mm_mul_ps(yf0, _mm_castsi128_ps(_mm_slli_epi32(xi0, 23)));
534 yf1 = _mm_mul_ps(yf1, _mm_castsi128_ps(_mm_slli_epi32(xi1, 23)));
535
536 __m128 zf0 = _mm_add_ps(xf0, mA1);
537 __m128 zf1 = _mm_add_ps(xf1, mA1);
538
539 zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA2);
540 zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA2);
541
542 zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA3);
543 zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA3);
544
545 zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA4);
546 zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA4);
547
548 zf0 = _mm_mul_ps(zf0, yf0);
549 zf1 = _mm_mul_ps(zf1, yf1);
550
551 if( y_aligned )
552 {
553 _mm_store_ps(y + i, zf0);
554 _mm_store_ps(y + i + 4, zf1);
555 }
556 else
557 {
558 _mm_storeu_ps(y + i, zf0);
559 _mm_storeu_ps(y + i + 4, zf1);
560 }
561 }
562 }
563 else
564 #endif
565 for( ; i <= n - 4; i += 4 )
566 {
567 double x0 = x[i].f * exp_prescale;
568 double x1 = x[i + 1].f * exp_prescale;
569 double x2 = x[i + 2].f * exp_prescale;
570 double x3 = x[i + 3].f * exp_prescale;
571 int val0, val1, val2, val3, t;
572
573 if( ((x[i].i >> 23) & 255) > 127 + 10 )
574 x0 = x[i].i < 0 ? -exp_max_val : exp_max_val;
575
576 if( ((x[i+1].i >> 23) & 255) > 127 + 10 )
577 x1 = x[i+1].i < 0 ? -exp_max_val : exp_max_val;
578
579 if( ((x[i+2].i >> 23) & 255) > 127 + 10 )
580 x2 = x[i+2].i < 0 ? -exp_max_val : exp_max_val;
581
582 if( ((x[i+3].i >> 23) & 255) > 127 + 10 )
583 x3 = x[i+3].i < 0 ? -exp_max_val : exp_max_val;
584
585 val0 = cvRound(x0);
586 val1 = cvRound(x1);
587 val2 = cvRound(x2);
588 val3 = cvRound(x3);
589
590 x0 = (x0 - val0)*exp_postscale;
591 x1 = (x1 - val1)*exp_postscale;
592 x2 = (x2 - val2)*exp_postscale;
593 x3 = (x3 - val3)*exp_postscale;
594
595 t = (val0 >> EXPTAB_SCALE) + 127;
596 t = !(t & ~255) ? t : t < 0 ? 0 : 255;
597 buf[0].i = t << 23;
598
599 t = (val1 >> EXPTAB_SCALE) + 127;
600 t = !(t & ~255) ? t : t < 0 ? 0 : 255;
601 buf[1].i = t << 23;
602
603 t = (val2 >> EXPTAB_SCALE) + 127;
604 t = !(t & ~255) ? t : t < 0 ? 0 : 255;
605 buf[2].i = t << 23;
606
607 t = (val3 >> EXPTAB_SCALE) + 127;
608 t = !(t & ~255) ? t : t < 0 ? 0 : 255;
609 buf[3].i = t << 23;
610
611 x0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
612 x1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 );
613
614 y[i] = (float)x0;
615 y[i + 1] = (float)x1;
616
617 x2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 );
618 x3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 );
619
620 y[i + 2] = (float)x2;
621 y[i + 3] = (float)x3;
622 }
623
624 for( ; i < n; i++ )
625 {
626 double x0 = x[i].f * exp_prescale;
627 int val0, t;
628
629 if( ((x[i].i >> 23) & 255) > 127 + 10 )
630 x0 = x[i].i < 0 ? -exp_max_val : exp_max_val;
631
632 val0 = cvRound(x0);
633 t = (val0 >> EXPTAB_SCALE) + 127;
634 t = !(t & ~255) ? t : t < 0 ? 0 : 255;
635
636 buf[0].i = t << 23;
637 x0 = (x0 - val0)*exp_postscale;
638
639 y[i] = (float)(buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY(x0));
640 }
641 }
642
exp(const double * _x,double * y,int n)643 void exp( const double *_x, double *y, int n )
644 {
645 static const double
646 A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0,
647 A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0,
648 A3 = .24022650695886477918181338054308 / EXPPOLY_32F_A0,
649 A2 = .55504108793649567998466049042729e-1 / EXPPOLY_32F_A0,
650 A1 = .96180973140732918010002372686186e-2 / EXPPOLY_32F_A0,
651 A0 = .13369713757180123244806654839424e-2 / EXPPOLY_32F_A0;
652
653 #undef EXPPOLY
654 #define EXPPOLY(x) (((((A0*(x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)*(x) + A5)
655
656 int i = 0;
657 Cv64suf buf[4];
658 const Cv64suf* x = (const Cv64suf*)_x;
659
660 #if CV_SSE2
661 static const __m128d prescale2 = _mm_set1_pd(exp_prescale);
662 static const __m128d postscale2 = _mm_set1_pd(exp_postscale);
663 static const __m128d maxval2 = _mm_set1_pd(exp_max_val);
664 static const __m128d minval2 = _mm_set1_pd(-exp_max_val);
665
666 static const __m128d mA0 = _mm_set1_pd(A0);
667 static const __m128d mA1 = _mm_set1_pd(A1);
668 static const __m128d mA2 = _mm_set1_pd(A2);
669 static const __m128d mA3 = _mm_set1_pd(A3);
670 static const __m128d mA4 = _mm_set1_pd(A4);
671 static const __m128d mA5 = _mm_set1_pd(A5);
672
673 int CV_DECL_ALIGNED(16) tab_idx[4];
674
675 for( ; i <= n - 4; i += 4 )
676 {
677 __m128d xf0 = _mm_loadu_pd(&x[i].f), xf1 = _mm_loadu_pd(&x[i+2].f);
678 __m128i xi0, xi1;
679 xf0 = _mm_min_pd(_mm_max_pd(xf0, minval2), maxval2);
680 xf1 = _mm_min_pd(_mm_max_pd(xf1, minval2), maxval2);
681 xf0 = _mm_mul_pd(xf0, prescale2);
682 xf1 = _mm_mul_pd(xf1, prescale2);
683
684 xi0 = _mm_cvtpd_epi32(xf0);
685 xi1 = _mm_cvtpd_epi32(xf1);
686 xf0 = _mm_mul_pd(_mm_sub_pd(xf0, _mm_cvtepi32_pd(xi0)), postscale2);
687 xf1 = _mm_mul_pd(_mm_sub_pd(xf1, _mm_cvtepi32_pd(xi1)), postscale2);
688
689 xi0 = _mm_unpacklo_epi64(xi0, xi1);
690 _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi32(EXPTAB_MASK)));
691
692 xi0 = _mm_add_epi32(_mm_srai_epi32(xi0, EXPTAB_SCALE), _mm_set1_epi32(1023));
693 xi0 = _mm_packs_epi32(xi0, xi0);
694 xi0 = _mm_max_epi16(xi0, _mm_setzero_si128());
695 xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(2047));
696 xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128());
697 xi1 = _mm_unpackhi_epi32(xi0, _mm_setzero_si128());
698 xi0 = _mm_unpacklo_epi32(xi0, _mm_setzero_si128());
699
700 __m128d yf0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1]));
701 __m128d yf1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3]));
702 yf0 = _mm_mul_pd(yf0, _mm_castsi128_pd(_mm_slli_epi64(xi0, 52)));
703 yf1 = _mm_mul_pd(yf1, _mm_castsi128_pd(_mm_slli_epi64(xi1, 52)));
704
705 __m128d zf0 = _mm_add_pd(_mm_mul_pd(mA0, xf0), mA1);
706 __m128d zf1 = _mm_add_pd(_mm_mul_pd(mA0, xf1), mA1);
707
708 zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA2);
709 zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA2);
710
711 zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA3);
712 zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA3);
713
714 zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA4);
715 zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA4);
716
717 zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA5);
718 zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA5);
719
720 zf0 = _mm_mul_pd(zf0, yf0);
721 zf1 = _mm_mul_pd(zf1, yf1);
722
723 _mm_storeu_pd(y + i, zf0);
724 _mm_storeu_pd(y + i + 2, zf1);
725 }
726 #endif
727 for( ; i <= n - 4; i += 4 )
728 {
729 double x0 = x[i].f * exp_prescale;
730 double x1 = x[i + 1].f * exp_prescale;
731 double x2 = x[i + 2].f * exp_prescale;
732 double x3 = x[i + 3].f * exp_prescale;
733
734 double y0, y1, y2, y3;
735 int val0, val1, val2, val3, t;
736
737 t = (int)(x[i].i >> 52);
738 if( (t & 2047) > 1023 + 10 )
739 x0 = t < 0 ? -exp_max_val : exp_max_val;
740
741 t = (int)(x[i+1].i >> 52);
742 if( (t & 2047) > 1023 + 10 )
743 x1 = t < 0 ? -exp_max_val : exp_max_val;
744
745 t = (int)(x[i+2].i >> 52);
746 if( (t & 2047) > 1023 + 10 )
747 x2 = t < 0 ? -exp_max_val : exp_max_val;
748
749 t = (int)(x[i+3].i >> 52);
750 if( (t & 2047) > 1023 + 10 )
751 x3 = t < 0 ? -exp_max_val : exp_max_val;
752
753 val0 = cvRound(x0);
754 val1 = cvRound(x1);
755 val2 = cvRound(x2);
756 val3 = cvRound(x3);
757
758 x0 = (x0 - val0)*exp_postscale;
759 x1 = (x1 - val1)*exp_postscale;
760 x2 = (x2 - val2)*exp_postscale;
761 x3 = (x3 - val3)*exp_postscale;
762
763 t = (val0 >> EXPTAB_SCALE) + 1023;
764 t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
765 buf[0].i = (int64)t << 52;
766
767 t = (val1 >> EXPTAB_SCALE) + 1023;
768 t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
769 buf[1].i = (int64)t << 52;
770
771 t = (val2 >> EXPTAB_SCALE) + 1023;
772 t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
773 buf[2].i = (int64)t << 52;
774
775 t = (val3 >> EXPTAB_SCALE) + 1023;
776 t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
777 buf[3].i = (int64)t << 52;
778
779 y0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
780 y1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 );
781
782 y[i] = y0;
783 y[i + 1] = y1;
784
785 y2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 );
786 y3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 );
787
788 y[i + 2] = y2;
789 y[i + 3] = y3;
790 }
791
792 for( ; i < n; i++ )
793 {
794 double x0 = x[i].f * exp_prescale;
795 int val0, t;
796
797 t = (int)(x[i].i >> 52);
798 if( (t & 2047) > 1023 + 10 )
799 x0 = t < 0 ? -exp_max_val : exp_max_val;
800
801 val0 = cvRound(x0);
802 t = (val0 >> EXPTAB_SCALE) + 1023;
803 t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
804
805 buf[0].i = (int64)t << 52;
806 x0 = (x0 - val0)*exp_postscale;
807
808 y[i] = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
809 }
810 }
811
812 #undef EXPTAB_SCALE
813 #undef EXPTAB_MASK
814 #undef EXPPOLY_32F_A0
815
816 /////////////////////////////////////////// LOG ///////////////////////////////////////
817
818 #define LOGTAB_SCALE 8
819 #define LOGTAB_MASK ((1 << LOGTAB_SCALE) - 1)
820 #define LOGTAB_MASK2 ((1 << (20 - LOGTAB_SCALE)) - 1)
821 #define LOGTAB_MASK2_32F ((1 << (23 - LOGTAB_SCALE)) - 1)
822
823 static const double CV_DECL_ALIGNED(16) icvLogTab[] = {
824 0.0000000000000000000000000000000000000000, 1.000000000000000000000000000000000000000,
825 .00389864041565732288852075271279318258166, .9961089494163424124513618677042801556420,
826 .00778214044205494809292034119607706088573, .9922480620155038759689922480620155038760,
827 .01165061721997527263705585198749759001657, .9884169884169884169884169884169884169884,
828 .01550418653596525274396267235488267033361, .9846153846153846153846153846153846153846,
829 .01934296284313093139406447562578250654042, .9808429118773946360153256704980842911877,
830 .02316705928153437593630670221500622574241, .9770992366412213740458015267175572519084,
831 .02697658769820207233514075539915211265906, .9733840304182509505703422053231939163498,
832 .03077165866675368732785500469617545604706, .9696969696969696969696969696969696969697,
833 .03455238150665972812758397481047722976656, .9660377358490566037735849056603773584906,
834 .03831886430213659461285757856785494368522, .9624060150375939849624060150375939849624,
835 .04207121392068705056921373852674150839447, .9588014981273408239700374531835205992509,
836 .04580953603129420126371940114040626212953, .9552238805970149253731343283582089552239,
837 .04953393512227662748292900118940451648088, .9516728624535315985130111524163568773234,
838 .05324451451881227759255210685296333394944, .9481481481481481481481481481481481481481,
839 .05694137640013842427411105973078520037234, .9446494464944649446494464944649446494465,
840 .06062462181643483993820353816772694699466, .9411764705882352941176470588235294117647,
841 .06429435070539725460836422143984236754475, .9377289377289377289377289377289377289377,
842 .06795066190850773679699159401934593915938, .9343065693430656934306569343065693430657,
843 .07159365318700880442825962290953611955044, .9309090909090909090909090909090909090909,
844 .07522342123758751775142172846244648098944, .9275362318840579710144927536231884057971,
845 .07884006170777602129362549021607264876369, .9241877256317689530685920577617328519856,
846 .08244366921107458556772229485432035289706, .9208633093525179856115107913669064748201,
847 .08603433734180314373940490213499288074675, .9175627240143369175627240143369175627240,
848 .08961215868968712416897659522874164395031, .9142857142857142857142857142857142857143,
849 .09317722485418328259854092721070628613231, .9110320284697508896797153024911032028470,
850 .09672962645855109897752299730200320482256, .9078014184397163120567375886524822695035,
851 .10026945316367513738597949668474029749630, .9045936395759717314487632508833922261484,
852 .10379679368164355934833764649738441221420, .9014084507042253521126760563380281690141,
853 .10731173578908805021914218968959175981580, .8982456140350877192982456140350877192982,
854 .11081436634029011301105782649756292812530, .8951048951048951048951048951048951048951,
855 .11430477128005862852422325204315711744130, .8919860627177700348432055749128919860627,
856 .11778303565638344185817487641543266363440, .8888888888888888888888888888888888888889,
857 .12124924363286967987640707633545389398930, .8858131487889273356401384083044982698962,
858 .12470347850095722663787967121606925502420, .8827586206896551724137931034482758620690,
859 .12814582269193003360996385708858724683530, .8797250859106529209621993127147766323024,
860 .13157635778871926146571524895989568904040, .8767123287671232876712328767123287671233,
861 .13499516453750481925766280255629681050780, .8737201365187713310580204778156996587031,
862 .13840232285911913123754857224412262439730, .8707482993197278911564625850340136054422,
863 .14179791186025733629172407290752744302150, .8677966101694915254237288135593220338983,
864 .14518200984449788903951628071808954700830, .8648648648648648648648648648648648648649,
865 .14855469432313711530824207329715136438610, .8619528619528619528619528619528619528620,
866 .15191604202584196858794030049466527998450, .8590604026845637583892617449664429530201,
867 .15526612891112392955683674244937719777230, .8561872909698996655518394648829431438127,
868 .15860503017663857283636730244325008243330, .8533333333333333333333333333333333333333,
869 .16193282026931324346641360989451641216880, .8504983388704318936877076411960132890365,
870 .16524957289530714521497145597095368430010, .8476821192052980132450331125827814569536,
871 .16855536102980664403538924034364754334090, .8448844884488448844884488448844884488449,
872 .17185025692665920060697715143760433420540, .8421052631578947368421052631578947368421,
873 .17513433212784912385018287750426679849630, .8393442622950819672131147540983606557377,
874 .17840765747281828179637841458315961062910, .8366013071895424836601307189542483660131,
875 .18167030310763465639212199675966985523700, .8338762214983713355048859934853420195440,
876 .18492233849401198964024217730184318497780, .8311688311688311688311688311688311688312,
877 .18816383241818296356839823602058459073300, .8284789644012944983818770226537216828479,
878 .19139485299962943898322009772527962923050, .8258064516129032258064516129032258064516,
879 .19461546769967164038916962454095482826240, .8231511254019292604501607717041800643087,
880 .19782574332991986754137769821682013571260, .8205128205128205128205128205128205128205,
881 .20102574606059073203390141770796617493040, .8178913738019169329073482428115015974441,
882 .20421554142869088876999228432396193966280, .8152866242038216560509554140127388535032,
883 .20739519434607056602715147164417430758480, .8126984126984126984126984126984126984127,
884 .21056476910734961416338251183333341032260, .8101265822784810126582278481012658227848,
885 .21372432939771812687723695489694364368910, .8075709779179810725552050473186119873817,
886 .21687393830061435506806333251006435602900, .8050314465408805031446540880503144654088,
887 .22001365830528207823135744547471404075630, .8025078369905956112852664576802507836991,
888 .22314355131420973710199007200571941211830, .8000000000000000000000000000000000000000,
889 .22626367865045338145790765338460914790630, .7975077881619937694704049844236760124611,
890 .22937410106484582006380890106811420992010, .7950310559006211180124223602484472049689,
891 .23247487874309405442296849741978803649550, .7925696594427244582043343653250773993808,
892 .23556607131276688371634975283086532726890, .7901234567901234567901234567901234567901,
893 .23864773785017498464178231643018079921600, .7876923076923076923076923076923076923077,
894 .24171993688714515924331749374687206000090, .7852760736196319018404907975460122699387,
895 .24478272641769091566565919038112042471760, .7828746177370030581039755351681957186544,
896 .24783616390458124145723672882013488560910, .7804878048780487804878048780487804878049,
897 .25088030628580937353433455427875742316250, .7781155015197568389057750759878419452888,
898 .25391520998096339667426946107298135757450, .7757575757575757575757575757575757575758,
899 .25694093089750041913887912414793390780680, .7734138972809667673716012084592145015106,
900 .25995752443692604627401010475296061486000, .7710843373493975903614457831325301204819,
901 .26296504550088134477547896494797896593800, .7687687687687687687687687687687687687688,
902 .26596354849713793599974565040611196309330, .7664670658682634730538922155688622754491,
903 .26895308734550393836570947314612567424780, .7641791044776119402985074626865671641791,
904 .27193371548364175804834985683555714786050, .7619047619047619047619047619047619047619,
905 .27490548587279922676529508862586226314300, .7596439169139465875370919881305637982196,
906 .27786845100345625159121709657483734190480, .7573964497041420118343195266272189349112,
907 .28082266290088775395616949026589281857030, .7551622418879056047197640117994100294985,
908 .28376817313064456316240580235898960381750, .7529411764705882352941176470588235294118,
909 .28670503280395426282112225635501090437180, .7507331378299120234604105571847507331378,
910 .28963329258304265634293983566749375313530, .7485380116959064327485380116959064327485,
911 .29255300268637740579436012922087684273730, .7463556851311953352769679300291545189504,
912 .29546421289383584252163927885703742504130, .7441860465116279069767441860465116279070,
913 .29836697255179722709783618483925238251680, .7420289855072463768115942028985507246377,
914 .30126133057816173455023545102449133992200, .7398843930635838150289017341040462427746,
915 .30414733546729666446850615102448500692850, .7377521613832853025936599423631123919308,
916 .30702503529491181888388950937951449304830, .7356321839080459770114942528735632183908,
917 .30989447772286465854207904158101882785550, .7335243553008595988538681948424068767908,
918 .31275571000389684739317885942000430077330, .7314285714285714285714285714285714285714,
919 .31560877898630329552176476681779604405180, .7293447293447293447293447293447293447293,
920 .31845373111853458869546784626436419785030, .7272727272727272727272727272727272727273,
921 .32129061245373424782201254856772720813750, .7252124645892351274787535410764872521246,
922 .32411946865421192853773391107097268104550, .7231638418079096045197740112994350282486,
923 .32694034499585328257253991068864706903700, .7211267605633802816901408450704225352113,
924 .32975328637246797969240219572384376078850, .7191011235955056179775280898876404494382,
925 .33255833730007655635318997155991382896900, .7170868347338935574229691876750700280112,
926 .33535554192113781191153520921943709254280, .7150837988826815642458100558659217877095,
927 .33814494400871636381467055798566434532400, .7130919220055710306406685236768802228412,
928 .34092658697059319283795275623560883104800, .7111111111111111111111111111111111111111,
929 .34370051385331840121395430287520866841080, .7091412742382271468144044321329639889197,
930 .34646676734620857063262633346312213689100, .7071823204419889502762430939226519337017,
931 .34922538978528827602332285096053965389730, .7052341597796143250688705234159779614325,
932 .35197642315717814209818925519357435405250, .7032967032967032967032967032967032967033,
933 .35471990910292899856770532096561510115850, .7013698630136986301369863013698630136986,
934 .35745588892180374385176833129662554711100, .6994535519125683060109289617486338797814,
935 .36018440357500774995358483465679455548530, .6975476839237057220708446866485013623978,
936 .36290549368936841911903457003063522279280, .6956521739130434782608695652173913043478,
937 .36561919956096466943762379742111079394830, .6937669376693766937669376693766937669377,
938 .36832556115870762614150635272380895912650, .6918918918918918918918918918918918918919,
939 .37102461812787262962487488948681857436900, .6900269541778975741239892183288409703504,
940 .37371640979358405898480555151763837784530, .6881720430107526881720430107526881720430,
941 .37640097516425302659470730759494472295050, .6863270777479892761394101876675603217158,
942 .37907835293496944251145919224654790014030, .6844919786096256684491978609625668449198,
943 .38174858149084833769393299007788300514230, .6826666666666666666666666666666666666667,
944 .38441169891033200034513583887019194662580, .6808510638297872340425531914893617021277,
945 .38706774296844825844488013899535872042180, .6790450928381962864721485411140583554377,
946 .38971675114002518602873692543653305619950, .6772486772486772486772486772486772486772,
947 .39235876060286384303665840889152605086580, .6754617414248021108179419525065963060686,
948 .39499380824086893770896722344332374632350, .6736842105263157894736842105263157894737,
949 .39762193064713846624158577469643205404280, .6719160104986876640419947506561679790026,
950 .40024316412701266276741307592601515352730, .6701570680628272251308900523560209424084,
951 .40285754470108348090917615991202183067800, .6684073107049608355091383812010443864230,
952 .40546510810816432934799991016916465014230, .6666666666666666666666666666666666666667,
953 .40806588980822172674223224930756259709600, .6649350649350649350649350649350649350649,
954 .41065992498526837639616360320360399782650, .6632124352331606217616580310880829015544,
955 .41324724855021932601317757871584035456180, .6614987080103359173126614987080103359173,
956 .41582789514371093497757669865677598863850, .6597938144329896907216494845360824742268,
957 .41840189913888381489925905043492093682300, .6580976863753213367609254498714652956298,
958 .42096929464412963239894338585145305842150, .6564102564102564102564102564102564102564,
959 .42353011550580327293502591601281892508280, .6547314578005115089514066496163682864450,
960 .42608439531090003260516141381231136620050, .6530612244897959183673469387755102040816,
961 .42863216738969872610098832410585600882780, .6513994910941475826972010178117048346056,
962 .43117346481837132143866142541810404509300, .6497461928934010152284263959390862944162,
963 .43370832042155937902094819946796633303180, .6481012658227848101265822784810126582278,
964 .43623676677491801667585491486534010618930, .6464646464646464646464646464646464646465,
965 .43875883620762790027214350629947148263450, .6448362720403022670025188916876574307305,
966 .44127456080487520440058801796112675219780, .6432160804020100502512562814070351758794,
967 .44378397241030093089975139264424797147500, .6416040100250626566416040100250626566416,
968 .44628710262841947420398014401143882423650, .6400000000000000000000000000000000000000,
969 .44878398282700665555822183705458883196130, .6384039900249376558603491271820448877805,
970 .45127464413945855836729492693848442286250, .6368159203980099502487562189054726368159,
971 .45375911746712049854579618113348260521900, .6352357320099255583126550868486352357320,
972 .45623743348158757315857769754074979573500, .6336633663366336633663366336633663366337,
973 .45870962262697662081833982483658473938700, .6320987654320987654320987654320987654321,
974 .46117571512217014895185229761409573256980, .6305418719211822660098522167487684729064,
975 .46363574096303250549055974261136725544930, .6289926289926289926289926289926289926290,
976 .46608972992459918316399125615134835243230, .6274509803921568627450980392156862745098,
977 .46853771156323925639597405279346276074650, .6259168704156479217603911980440097799511,
978 .47097971521879100631480241645476780831830, .6243902439024390243902439024390243902439,
979 .47341577001667212165614273544633761048330, .6228710462287104622871046228710462287105,
980 .47584590486996386493601107758877333253630, .6213592233009708737864077669902912621359,
981 .47827014848147025860569669930555392056700, .6198547215496368038740920096852300242131,
982 .48068852934575190261057286988943815231330, .6183574879227053140096618357487922705314,
983 .48310107575113581113157579238759353756900, .6168674698795180722891566265060240963855,
984 .48550781578170076890899053978500887751580, .6153846153846153846153846153846153846154,
985 .48790877731923892879351001283794175833480, .6139088729016786570743405275779376498801,
986 .49030398804519381705802061333088204264650, .6124401913875598086124401913875598086124,
987 .49269347544257524607047571407747454941280, .6109785202863961813842482100238663484487,
988 .49507726679785146739476431321236304938800, .6095238095238095238095238095238095238095,
989 .49745538920281889838648226032091770321130, .6080760095011876484560570071258907363420,
990 .49982786955644931126130359189119189977650, .6066350710900473933649289099526066350711,
991 .50219473456671548383667413872899487614650, .6052009456264775413711583924349881796690,
992 .50455601075239520092452494282042607665050, .6037735849056603773584905660377358490566,
993 .50691172444485432801997148999362252652650, .6023529411764705882352941176470588235294,
994 .50926190178980790257412536448100581765150, .6009389671361502347417840375586854460094,
995 .51160656874906207391973111953120678663250, .5995316159250585480093676814988290398126,
996 .51394575110223428282552049495279788970950, .5981308411214953271028037383177570093458,
997 .51627947444845445623684554448118433356300, .5967365967365967365967365967365967365967,
998 .51860776420804555186805373523384332656850, .5953488372093023255813953488372093023256,
999 .52093064562418522900344441950437612831600, .5939675174013921113689095127610208816705,
1000 .52324814376454775732838697877014055848100, .5925925925925925925925925925925925925926,
1001 .52556028352292727401362526507000438869000, .5912240184757505773672055427251732101617,
1002 .52786708962084227803046587723656557500350, .5898617511520737327188940092165898617512,
1003 .53016858660912158374145519701414741575700, .5885057471264367816091954022988505747126,
1004 .53246479886947173376654518506256863474850, .5871559633027522935779816513761467889908,
1005 .53475575061602764748158733709715306758900, .5858123569794050343249427917620137299771,
1006 .53704146589688361856929077475797384977350, .5844748858447488584474885844748858447489,
1007 .53932196859560876944783558428753167390800, .5831435079726651480637813211845102505695,
1008 .54159728243274429804188230264117009937750, .5818181818181818181818181818181818181818,
1009 .54386743096728351609669971367111429572100, .5804988662131519274376417233560090702948,
1010 .54613243759813556721383065450936555862450, .5791855203619909502262443438914027149321,
1011 .54839232556557315767520321969641372561450, .5778781038374717832957110609480812641084,
1012 .55064711795266219063194057525834068655950, .5765765765765765765765765765765765765766,
1013 .55289683768667763352766542084282264113450, .5752808988764044943820224719101123595506,
1014 .55514150754050151093110798683483153581600, .5739910313901345291479820627802690582960,
1015 .55738115013400635344709144192165695130850, .5727069351230425055928411633109619686801,
1016 .55961578793542265941596269840374588966350, .5714285714285714285714285714285714285714,
1017 .56184544326269181269140062795486301183700, .5701559020044543429844097995545657015590,
1018 .56407013828480290218436721261241473257550, .5688888888888888888888888888888888888889,
1019 .56628989502311577464155334382667206227800, .5676274944567627494456762749445676274945,
1020 .56850473535266865532378233183408156037350, .5663716814159292035398230088495575221239,
1021 .57071468100347144680739575051120482385150, .5651214128035320088300220750551876379691,
1022 .57291975356178548306473885531886480748650, .5638766519823788546255506607929515418502,
1023 .57511997447138785144460371157038025558000, .5626373626373626373626373626373626373626,
1024 .57731536503482350219940144597785547375700, .5614035087719298245614035087719298245614,
1025 .57950594641464214795689713355386629700650, .5601750547045951859956236323851203501094,
1026 .58169173963462239562716149521293118596100, .5589519650655021834061135371179039301310,
1027 .58387276558098266665552955601015128195300, .5577342047930283224400871459694989106754,
1028 .58604904500357812846544902640744112432000, .5565217391304347826086956521739130434783,
1029 .58822059851708596855957011939608491957200, .5553145336225596529284164859002169197397,
1030 .59038744660217634674381770309992134571100, .5541125541125541125541125541125541125541,
1031 .59254960960667157898740242671919986605650, .5529157667386609071274298056155507559395,
1032 .59470710774669277576265358220553025603300, .5517241379310344827586206896551724137931,
1033 .59685996110779382384237123915227130055450, .5505376344086021505376344086021505376344,
1034 .59900818964608337768851242799428291618800, .5493562231759656652360515021459227467811,
1035 .60115181318933474940990890900138765573500, .5481798715203426124197002141327623126338,
1036 .60329085143808425240052883964381180703650, .5470085470085470085470085470085470085470,
1037 .60542532396671688843525771517306566238400, .5458422174840085287846481876332622601279,
1038 .60755525022454170969155029524699784815300, .5446808510638297872340425531914893617021,
1039 .60968064953685519036241657886421307921400, .5435244161358811040339702760084925690021,
1040 .61180154110599282990534675263916142284850, .5423728813559322033898305084745762711864,
1041 .61391794401237043121710712512140162289150, .5412262156448202959830866807610993657505,
1042 .61602987721551394351138242200249806046500, .5400843881856540084388185654008438818565,
1043 .61813735955507864705538167982012964785100, .5389473684210526315789473684210526315789,
1044 .62024040975185745772080281312810257077200, .5378151260504201680672268907563025210084,
1045 .62233904640877868441606324267922900617100, .5366876310272536687631027253668763102725,
1046 .62443328801189346144440150965237990021700, .5355648535564853556485355648535564853556,
1047 .62652315293135274476554741340805776417250, .5344467640918580375782881002087682672234,
1048 .62860865942237409420556559780379757285100, .5333333333333333333333333333333333333333,
1049 .63068982562619868570408243613201193511500, .5322245322245322245322245322245322245322,
1050 .63276666957103777644277897707070223987100, .5311203319502074688796680497925311203320,
1051 .63483920917301017716738442686619237065300, .5300207039337474120082815734989648033126,
1052 .63690746223706917739093569252872839570050, .5289256198347107438016528925619834710744,
1053 .63897144645792069983514238629140891134750, .5278350515463917525773195876288659793814,
1054 .64103117942093124081992527862894348800200, .5267489711934156378600823045267489711934,
1055 .64308667860302726193566513757104985415950, .5256673511293634496919917864476386036961,
1056 .64513796137358470073053240412264131009600, .5245901639344262295081967213114754098361,
1057 .64718504499530948859131740391603671014300, .5235173824130879345603271983640081799591,
1058 .64922794662510974195157587018911726772800, .5224489795918367346938775510204081632653,
1059 .65126668331495807251485530287027359008800, .5213849287169042769857433808553971486762,
1060 .65330127201274557080523663898929953575150, .5203252032520325203252032520325203252033,
1061 .65533172956312757406749369692988693714150, .5192697768762677484787018255578093306288,
1062 .65735807270835999727154330685152672231200, .5182186234817813765182186234817813765182,
1063 .65938031808912778153342060249997302889800, .5171717171717171717171717171717171717172,
1064 .66139848224536490484126716182800009846700, .5161290322580645161290322580645161290323,
1065 .66341258161706617713093692145776003599150, .5150905432595573440643863179074446680080,
1066 .66542263254509037562201001492212526500250, .5140562248995983935742971887550200803213,
1067 .66742865127195616370414654738851822912700, .5130260521042084168336673346693386773547,
1068 .66943065394262923906154583164607174694550, .5120000000000000000000000000000000000000,
1069 .67142865660530226534774556057527661323550, .5109780439121756487025948103792415169661,
1070 .67342267521216669923234121597488410770900, .5099601593625498007968127490039840637450,
1071 .67541272562017662384192817626171745359900, .5089463220675944333996023856858846918489,
1072 .67739882359180603188519853574689477682100, .5079365079365079365079365079365079365079,
1073 .67938098479579733801614338517538271844400, .5069306930693069306930693069306930693069,
1074 .68135922480790300781450241629499942064300, .5059288537549407114624505928853754940711,
1075 .68333355911162063645036823800182901322850, .5049309664694280078895463510848126232742,
1076 .68530400309891936760919861626462079584600, .5039370078740157480314960629921259842520,
1077 .68727057207096020619019327568821609020250, .5029469548133595284872298624754420432220,
1078 .68923328123880889251040571252815425395950, .5019607843137254901960784313725490196078,
1079 .69314718055994530941723212145818, 5.0e-01,
1080 };
1081
1082
1083
1084 #define LOGTAB_TRANSLATE(x,h) (((x) - 1.)*icvLogTab[(h)+1])
1085 static const double ln_2 = 0.69314718055994530941723212145818;
1086
log(const float * _x,float * y,int n)1087 void log( const float *_x, float *y, int n )
1088 {
1089 static const float shift[] = { 0, -1.f/512 };
1090 static const float
1091 A0 = 0.3333333333333333333333333f,
1092 A1 = -0.5f,
1093 A2 = 1.f;
1094
1095 #undef LOGPOLY
1096 #define LOGPOLY(x) (((A0*(x) + A1)*(x) + A2)*(x))
1097
1098 int i = 0;
1099 Cv32suf buf[4];
1100 const int* x = (const int*)_x;
1101
1102 #if CV_SSE2
1103 static const __m128d ln2_2 = _mm_set1_pd(ln_2);
1104 static const __m128 _1_4 = _mm_set1_ps(1.f);
1105 static const __m128 shift4 = _mm_set1_ps(-1.f/512);
1106
1107 static const __m128 mA0 = _mm_set1_ps(A0);
1108 static const __m128 mA1 = _mm_set1_ps(A1);
1109 static const __m128 mA2 = _mm_set1_ps(A2);
1110
1111 int CV_DECL_ALIGNED(16) idx[4];
1112
1113 for( ; i <= n - 4; i += 4 )
1114 {
1115 __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i));
1116 __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 23), _mm_set1_epi32(255)), _mm_set1_epi32(127));
1117 __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2);
1118 __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0,yi0)), ln2_2);
1119
1120 __m128i xi0 = _mm_or_si128(_mm_and_si128(h0, _mm_set1_epi32(LOGTAB_MASK2_32F)), _mm_set1_epi32(127 << 23));
1121
1122 h0 = _mm_and_si128(_mm_srli_epi32(h0, 23 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK*2));
1123 _mm_store_si128((__m128i*)idx, h0);
1124 h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510));
1125
1126 __m128d t0, t1, t2, t3, t4;
1127 t0 = _mm_load_pd(icvLogTab + idx[0]);
1128 t2 = _mm_load_pd(icvLogTab + idx[1]);
1129 t1 = _mm_unpackhi_pd(t0, t2);
1130 t0 = _mm_unpacklo_pd(t0, t2);
1131 t2 = _mm_load_pd(icvLogTab + idx[2]);
1132 t4 = _mm_load_pd(icvLogTab + idx[3]);
1133 t3 = _mm_unpackhi_pd(t2, t4);
1134 t2 = _mm_unpacklo_pd(t2, t4);
1135
1136 yd0 = _mm_add_pd(yd0, t0);
1137 yd1 = _mm_add_pd(yd1, t2);
1138
1139 __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1));
1140
1141 __m128 xf0 = _mm_sub_ps(_mm_castsi128_ps(xi0), _1_4);
1142 xf0 = _mm_mul_ps(xf0, _mm_movelh_ps(_mm_cvtpd_ps(t1), _mm_cvtpd_ps(t3)));
1143 xf0 = _mm_add_ps(xf0, _mm_and_ps(_mm_castsi128_ps(h0), shift4));
1144
1145 __m128 zf0 = _mm_mul_ps(xf0, mA0);
1146 zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA1), xf0);
1147 zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA2), xf0);
1148 yf0 = _mm_add_ps(yf0, zf0);
1149
1150 _mm_storeu_ps(y + i, yf0);
1151 }
1152 #endif
1153 for( ; i <= n - 4; i += 4 )
1154 {
1155 double x0, x1, x2, x3;
1156 double y0, y1, y2, y3;
1157 int h0, h1, h2, h3;
1158
1159 h0 = x[i];
1160 h1 = x[i+1];
1161 buf[0].i = (h0 & LOGTAB_MASK2_32F) | (127 << 23);
1162 buf[1].i = (h1 & LOGTAB_MASK2_32F) | (127 << 23);
1163
1164 y0 = (((h0 >> 23) & 0xff) - 127) * ln_2;
1165 y1 = (((h1 >> 23) & 0xff) - 127) * ln_2;
1166
1167 h0 = (h0 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
1168 h1 = (h1 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
1169
1170 y0 += icvLogTab[h0];
1171 y1 += icvLogTab[h1];
1172
1173 h2 = x[i+2];
1174 h3 = x[i+3];
1175
1176 x0 = LOGTAB_TRANSLATE( buf[0].f, h0 );
1177 x1 = LOGTAB_TRANSLATE( buf[1].f, h1 );
1178
1179 buf[2].i = (h2 & LOGTAB_MASK2_32F) | (127 << 23);
1180 buf[3].i = (h3 & LOGTAB_MASK2_32F) | (127 << 23);
1181
1182 y2 = (((h2 >> 23) & 0xff) - 127) * ln_2;
1183 y3 = (((h3 >> 23) & 0xff) - 127) * ln_2;
1184
1185 h2 = (h2 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
1186 h3 = (h3 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
1187
1188 y2 += icvLogTab[h2];
1189 y3 += icvLogTab[h3];
1190
1191 x2 = LOGTAB_TRANSLATE( buf[2].f, h2 );
1192 x3 = LOGTAB_TRANSLATE( buf[3].f, h3 );
1193
1194 x0 += shift[h0 == 510];
1195 x1 += shift[h1 == 510];
1196 y0 += LOGPOLY( x0 );
1197 y1 += LOGPOLY( x1 );
1198
1199 y[i] = (float) y0;
1200 y[i + 1] = (float) y1;
1201
1202 x2 += shift[h2 == 510];
1203 x3 += shift[h3 == 510];
1204 y2 += LOGPOLY( x2 );
1205 y3 += LOGPOLY( x3 );
1206
1207 y[i + 2] = (float) y2;
1208 y[i + 3] = (float) y3;
1209 }
1210
1211 for( ; i < n; i++ )
1212 {
1213 int h0 = x[i];
1214 double y0;
1215 float x0;
1216
1217 y0 = (((h0 >> 23) & 0xff) - 127) * ln_2;
1218
1219 buf[0].i = (h0 & LOGTAB_MASK2_32F) | (127 << 23);
1220 h0 = (h0 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
1221
1222 y0 += icvLogTab[h0];
1223 x0 = (float)LOGTAB_TRANSLATE( buf[0].f, h0 );
1224 x0 += shift[h0 == 510];
1225 y0 += LOGPOLY( x0 );
1226
1227 y[i] = (float)y0;
1228 }
1229 }
1230
log(const double * x,double * y,int n)1231 void log( const double *x, double *y, int n )
1232 {
1233 static const double shift[] = { 0, -1./512 };
1234 static const double
1235 A7 = 1.0,
1236 A6 = -0.5,
1237 A5 = 0.333333333333333314829616256247390992939472198486328125,
1238 A4 = -0.25,
1239 A3 = 0.2,
1240 A2 = -0.1666666666666666574148081281236954964697360992431640625,
1241 A1 = 0.1428571428571428769682682968777953647077083587646484375,
1242 A0 = -0.125;
1243
1244 #undef LOGPOLY
1245 #define LOGPOLY(x,k) ((x)+=shift[k], xq = (x)*(x),\
1246 (((A0*xq + A2)*xq + A4)*xq + A6)*xq + \
1247 (((A1*xq + A3)*xq + A5)*xq + A7)*(x))
1248
1249 int i = 0;
1250 DBLINT buf[4];
1251 DBLINT *X = (DBLINT *) x;
1252
1253 #if CV_SSE2
1254 static const __m128d ln2_2 = _mm_set1_pd(ln_2);
1255 static const __m128d _1_2 = _mm_set1_pd(1.);
1256 static const __m128d shift2 = _mm_set1_pd(-1./512);
1257
1258 static const __m128i log_and_mask2 = _mm_set_epi32(LOGTAB_MASK2, 0xffffffff, LOGTAB_MASK2, 0xffffffff);
1259 static const __m128i log_or_mask2 = _mm_set_epi32(1023 << 20, 0, 1023 << 20, 0);
1260
1261 static const __m128d mA0 = _mm_set1_pd(A0);
1262 static const __m128d mA1 = _mm_set1_pd(A1);
1263 static const __m128d mA2 = _mm_set1_pd(A2);
1264 static const __m128d mA3 = _mm_set1_pd(A3);
1265 static const __m128d mA4 = _mm_set1_pd(A4);
1266 static const __m128d mA5 = _mm_set1_pd(A5);
1267 static const __m128d mA6 = _mm_set1_pd(A6);
1268 static const __m128d mA7 = _mm_set1_pd(A7);
1269
1270 int CV_DECL_ALIGNED(16) idx[4];
1271
1272 for( ; i <= n - 4; i += 4 )
1273 {
1274 __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i));
1275 __m128i h1 = _mm_loadu_si128((const __m128i*)(x + i + 2));
1276
1277 __m128d xd0 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h0, log_and_mask2), log_or_mask2));
1278 __m128d xd1 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h1, log_and_mask2), log_or_mask2));
1279
1280 h0 = _mm_unpackhi_epi32(_mm_unpacklo_epi32(h0, h1), _mm_unpackhi_epi32(h0, h1));
1281
1282 __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 20),
1283 _mm_set1_epi32(2047)), _mm_set1_epi32(1023));
1284 __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2);
1285 __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0, yi0)), ln2_2);
1286
1287 h0 = _mm_and_si128(_mm_srli_epi32(h0, 20 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK * 2));
1288 _mm_store_si128((__m128i*)idx, h0);
1289 h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510));
1290
1291 __m128d t0, t1, t2, t3, t4;
1292 t0 = _mm_load_pd(icvLogTab + idx[0]);
1293 t2 = _mm_load_pd(icvLogTab + idx[1]);
1294 t1 = _mm_unpackhi_pd(t0, t2);
1295 t0 = _mm_unpacklo_pd(t0, t2);
1296 t2 = _mm_load_pd(icvLogTab + idx[2]);
1297 t4 = _mm_load_pd(icvLogTab + idx[3]);
1298 t3 = _mm_unpackhi_pd(t2, t4);
1299 t2 = _mm_unpacklo_pd(t2, t4);
1300
1301 yd0 = _mm_add_pd(yd0, t0);
1302 yd1 = _mm_add_pd(yd1, t2);
1303
1304 xd0 = _mm_mul_pd(_mm_sub_pd(xd0, _1_2), t1);
1305 xd1 = _mm_mul_pd(_mm_sub_pd(xd1, _1_2), t3);
1306
1307 xd0 = _mm_add_pd(xd0, _mm_and_pd(_mm_castsi128_pd(_mm_unpacklo_epi32(h0, h0)), shift2));
1308 xd1 = _mm_add_pd(xd1, _mm_and_pd(_mm_castsi128_pd(_mm_unpackhi_epi32(h0, h0)), shift2));
1309
1310 __m128d zd0 = _mm_mul_pd(xd0, mA0);
1311 __m128d zd1 = _mm_mul_pd(xd1, mA0);
1312 zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA1), xd0);
1313 zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA1), xd1);
1314 zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA2), xd0);
1315 zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA2), xd1);
1316 zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA3), xd0);
1317 zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA3), xd1);
1318 zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA4), xd0);
1319 zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA4), xd1);
1320 zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA5), xd0);
1321 zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA5), xd1);
1322 zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA6), xd0);
1323 zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA6), xd1);
1324 zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA7), xd0);
1325 zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA7), xd1);
1326
1327 yd0 = _mm_add_pd(yd0, zd0);
1328 yd1 = _mm_add_pd(yd1, zd1);
1329
1330 _mm_storeu_pd(y + i, yd0);
1331 _mm_storeu_pd(y + i + 2, yd1);
1332 }
1333 #endif
1334 for( ; i <= n - 4; i += 4 )
1335 {
1336 double xq;
1337 double x0, x1, x2, x3;
1338 double y0, y1, y2, y3;
1339 int h0, h1, h2, h3;
1340
1341 h0 = X[i].i.lo;
1342 h1 = X[i + 1].i.lo;
1343 buf[0].i.lo = h0;
1344 buf[1].i.lo = h1;
1345
1346 h0 = X[i].i.hi;
1347 h1 = X[i + 1].i.hi;
1348 buf[0].i.hi = (h0 & LOGTAB_MASK2) | (1023 << 20);
1349 buf[1].i.hi = (h1 & LOGTAB_MASK2) | (1023 << 20);
1350
1351 y0 = (((h0 >> 20) & 0x7ff) - 1023) * ln_2;
1352 y1 = (((h1 >> 20) & 0x7ff) - 1023) * ln_2;
1353
1354 h2 = X[i + 2].i.lo;
1355 h3 = X[i + 3].i.lo;
1356 buf[2].i.lo = h2;
1357 buf[3].i.lo = h3;
1358
1359 h0 = (h0 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
1360 h1 = (h1 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
1361
1362 y0 += icvLogTab[h0];
1363 y1 += icvLogTab[h1];
1364
1365 h2 = X[i + 2].i.hi;
1366 h3 = X[i + 3].i.hi;
1367
1368 x0 = LOGTAB_TRANSLATE( buf[0].d, h0 );
1369 x1 = LOGTAB_TRANSLATE( buf[1].d, h1 );
1370
1371 buf[2].i.hi = (h2 & LOGTAB_MASK2) | (1023 << 20);
1372 buf[3].i.hi = (h3 & LOGTAB_MASK2) | (1023 << 20);
1373
1374 y2 = (((h2 >> 20) & 0x7ff) - 1023) * ln_2;
1375 y3 = (((h3 >> 20) & 0x7ff) - 1023) * ln_2;
1376
1377 h2 = (h2 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
1378 h3 = (h3 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
1379
1380 y2 += icvLogTab[h2];
1381 y3 += icvLogTab[h3];
1382
1383 x2 = LOGTAB_TRANSLATE( buf[2].d, h2 );
1384 x3 = LOGTAB_TRANSLATE( buf[3].d, h3 );
1385
1386 y0 += LOGPOLY( x0, h0 == 510 );
1387 y1 += LOGPOLY( x1, h1 == 510 );
1388
1389 y[i] = y0;
1390 y[i + 1] = y1;
1391
1392 y2 += LOGPOLY( x2, h2 == 510 );
1393 y3 += LOGPOLY( x3, h3 == 510 );
1394
1395 y[i + 2] = y2;
1396 y[i + 3] = y3;
1397 }
1398
1399 for( ; i < n; i++ )
1400 {
1401 int h0 = X[i].i.hi;
1402 double xq;
1403 double x0, y0 = (((h0 >> 20) & 0x7ff) - 1023) * ln_2;
1404
1405 buf[0].i.hi = (h0 & LOGTAB_MASK2) | (1023 << 20);
1406 buf[0].i.lo = X[i].i.lo;
1407 h0 = (h0 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
1408
1409 y0 += icvLogTab[h0];
1410 x0 = LOGTAB_TRANSLATE( buf[0].d, h0 );
1411 y0 += LOGPOLY( x0, h0 == 510 );
1412 y[i] = y0;
1413 }
1414 }
1415
1416 }}
1417