1/*M///////////////////////////////////////////////////////////////////////////////////////
2//
3//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4//
5//  By downloading, copying, installing or using the software you agree to this license.
6//  If you do not agree to this license, do not download, install,
7//  copy or use the software.
8//
9//
10//                           License Agreement
11//                For Open Source Computer Vision Library
12//
13// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
15// Third party copyrights are property of their respective owners.
16//
17// Redistribution and use in source and binary forms, with or without modification,
18// are permitted provided that the following conditions are met:
19//
20//   * Redistribution's of source code must retain the above copyright notice,
21//     this list of conditions and the following disclaimer.
22//
23//   * Redistribution's in binary form must reproduce the above copyright notice,
24//     this list of conditions and the following disclaimer in the documentation
25//     and/or other materials provided with the distribution.
26//
27//   * The name of the copyright holders may not be used to endorse or promote products
28//     derived from this software without specific prior written permission.
29//
30// This software is provided by the copyright holders and contributors "as is" and
31// any express or implied warranties, including, but not limited to, the implied
32// warranties of merchantability and fitness for a particular purpose are disclaimed.
33// In no event shall the Intel Corporation or contributors be liable for any direct,
34// indirect, incidental, special, exemplary, or consequential damages
35// (including, but not limited to, procurement of substitute goods or services;
36// loss of use, data, or profits; or business interruption) however caused
37// and on any theory of liability, whether in contract, strict liability,
38// or tort (including negligence or otherwise) arising in any way out of
39// the use of this software, even if advised of the possibility of such damage.
40//
41//M*/
42
43#include "precomp.hpp"
44
45#undef HAVE_IPP
46
47namespace cv { namespace hal {
48
49///////////////////////////////////// ATAN2 ////////////////////////////////////
50static const float atan2_p1 = 0.9997878412794807f*(float)(180/CV_PI);
51static const float atan2_p3 = -0.3258083974640975f*(float)(180/CV_PI);
52static const float atan2_p5 = 0.1555786518463281f*(float)(180/CV_PI);
53static const float atan2_p7 = -0.04432655554792128f*(float)(180/CV_PI);
54
55#if CV_NEON
56static inline float32x4_t cv_vrecpq_f32(float32x4_t val)
57{
58    float32x4_t reciprocal = vrecpeq_f32(val);
59    reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
60    reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
61    return reciprocal;
62}
63#endif
64
65void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )
66{
67    int i = 0;
68    float scale = angleInDegrees ? 1 : (float)(CV_PI/180);
69
70#ifdef HAVE_TEGRA_OPTIMIZATION
71    if (tegra::useTegra() && tegra::FastAtan2_32f(Y, X, angle, len, scale))
72        return;
73#endif
74
75#if CV_SSE2
76    Cv32suf iabsmask; iabsmask.i = 0x7fffffff;
77    __m128 eps = _mm_set1_ps((float)DBL_EPSILON), absmask = _mm_set1_ps(iabsmask.f);
78    __m128 _90 = _mm_set1_ps(90.f), _180 = _mm_set1_ps(180.f), _360 = _mm_set1_ps(360.f);
79    __m128 z = _mm_setzero_ps(), scale4 = _mm_set1_ps(scale);
80    __m128 p1 = _mm_set1_ps(atan2_p1), p3 = _mm_set1_ps(atan2_p3);
81    __m128 p5 = _mm_set1_ps(atan2_p5), p7 = _mm_set1_ps(atan2_p7);
82
83    for( ; i <= len - 4; i += 4 )
84    {
85        __m128 x = _mm_loadu_ps(X + i), y = _mm_loadu_ps(Y + i);
86        __m128 ax = _mm_and_ps(x, absmask), ay = _mm_and_ps(y, absmask);
87        __m128 mask = _mm_cmplt_ps(ax, ay);
88        __m128 tmin = _mm_min_ps(ax, ay), tmax = _mm_max_ps(ax, ay);
89        __m128 c = _mm_div_ps(tmin, _mm_add_ps(tmax, eps));
90        __m128 c2 = _mm_mul_ps(c, c);
91        __m128 a = _mm_mul_ps(c2, p7);
92        a = _mm_mul_ps(_mm_add_ps(a, p5), c2);
93        a = _mm_mul_ps(_mm_add_ps(a, p3), c2);
94        a = _mm_mul_ps(_mm_add_ps(a, p1), c);
95
96        __m128 b = _mm_sub_ps(_90, a);
97        a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask));
98
99        b = _mm_sub_ps(_180, a);
100        mask = _mm_cmplt_ps(x, z);
101        a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask));
102
103        b = _mm_sub_ps(_360, a);
104        mask = _mm_cmplt_ps(y, z);
105        a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask));
106
107        a = _mm_mul_ps(a, scale4);
108        _mm_storeu_ps(angle + i, a);
109    }
110#elif CV_NEON
111    float32x4_t eps = vdupq_n_f32((float)DBL_EPSILON);
112    float32x4_t _90 = vdupq_n_f32(90.f), _180 = vdupq_n_f32(180.f), _360 = vdupq_n_f32(360.f);
113    float32x4_t z = vdupq_n_f32(0.0f), scale4 = vdupq_n_f32(scale);
114    float32x4_t p1 = vdupq_n_f32(atan2_p1), p3 = vdupq_n_f32(atan2_p3);
115    float32x4_t p5 = vdupq_n_f32(atan2_p5), p7 = vdupq_n_f32(atan2_p7);
116
117    for( ; i <= len - 4; i += 4 )
118    {
119        float32x4_t x = vld1q_f32(X + i), y = vld1q_f32(Y + i);
120        float32x4_t ax = vabsq_f32(x), ay = vabsq_f32(y);
121        float32x4_t tmin = vminq_f32(ax, ay), tmax = vmaxq_f32(ax, ay);
122        float32x4_t c = vmulq_f32(tmin, cv_vrecpq_f32(vaddq_f32(tmax, eps)));
123        float32x4_t c2 = vmulq_f32(c, c);
124        float32x4_t a = vmulq_f32(c2, p7);
125        a = vmulq_f32(vaddq_f32(a, p5), c2);
126        a = vmulq_f32(vaddq_f32(a, p3), c2);
127        a = vmulq_f32(vaddq_f32(a, p1), c);
128
129        a = vbslq_f32(vcgeq_f32(ax, ay), a, vsubq_f32(_90, a));
130        a = vbslq_f32(vcltq_f32(x, z), vsubq_f32(_180, a), a);
131        a = vbslq_f32(vcltq_f32(y, z), vsubq_f32(_360, a), a);
132
133        vst1q_f32(angle + i, vmulq_f32(a, scale4));
134    }
135#endif
136
137    for( ; i < len; i++ )
138    {
139        float x = X[i], y = Y[i];
140        float ax = std::abs(x), ay = std::abs(y);
141        float a, c, c2;
142        if( ax >= ay )
143        {
144            c = ay/(ax + (float)DBL_EPSILON);
145            c2 = c*c;
146            a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
147        }
148        else
149        {
150            c = ax/(ay + (float)DBL_EPSILON);
151            c2 = c*c;
152            a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
153        }
154        if( x < 0 )
155            a = 180.f - a;
156        if( y < 0 )
157            a = 360.f - a;
158        angle[i] = (float)(a*scale);
159    }
160}
161
162
163void magnitude(const float* x, const float* y, float* mag, int len)
164{
165#if defined HAVE_IPP
166    CV_IPP_CHECK()
167    {
168        IppStatus status = ippsMagnitude_32f(x, y, mag, len);
169        if (status >= 0)
170        {
171            CV_IMPL_ADD(CV_IMPL_IPP);
172            return;
173        }
174        setIppErrorStatus();
175    }
176#endif
177
178    int i = 0;
179
180#if CV_SIMD128
181    for( ; i <= len - 8; i += 8 )
182    {
183        v_float32x4 x0 = v_load(x + i), x1 = v_load(x + i + 4);
184        v_float32x4 y0 = v_load(y + i), y1 = v_load(y + i + 4);
185        x0 = v_sqrt(v_muladd(x0, x0, y0*y0));
186        x1 = v_sqrt(v_muladd(x1, x1, y1*y1));
187        v_store(mag + i, x0);
188        v_store(mag + i + 4, x1);
189    }
190#endif
191
192    for( ; i < len; i++ )
193    {
194        float x0 = x[i], y0 = y[i];
195        mag[i] = std::sqrt(x0*x0 + y0*y0);
196    }
197}
198
199void magnitude(const double* x, const double* y, double* mag, int len)
200{
201#if defined(HAVE_IPP)
202    CV_IPP_CHECK()
203    {
204        IppStatus status = ippsMagnitude_64f(x, y, mag, len);
205        if (status >= 0)
206        {
207            CV_IMPL_ADD(CV_IMPL_IPP);
208            return;
209        }
210        setIppErrorStatus();
211    }
212#endif
213
214    int i = 0;
215
216#if CV_SIMD128_64F
217    for( ; i <= len - 4; i += 4 )
218    {
219        v_float64x2 x0 = v_load(x + i), x1 = v_load(x + i + 2);
220        v_float64x2 y0 = v_load(y + i), y1 = v_load(y + i + 2);
221        x0 = v_sqrt(v_muladd(x0, x0, y0*y0));
222        x1 = v_sqrt(v_muladd(x1, x1, y1*y1));
223        v_store(mag + i, x0);
224        v_store(mag + i + 2, x1);
225    }
226#endif
227
228    for( ; i < len; i++ )
229    {
230        double x0 = x[i], y0 = y[i];
231        mag[i] = std::sqrt(x0*x0 + y0*y0);
232    }
233}
234
235
236void invSqrt(const float* src, float* dst, int len)
237{
238#if defined(HAVE_IPP)
239    CV_IPP_CHECK()
240    {
241        if (ippsInvSqrt_32f_A21(src, dst, len) >= 0)
242        {
243            CV_IMPL_ADD(CV_IMPL_IPP);
244            return;
245        }
246        setIppErrorStatus();
247    }
248#endif
249
250    int i = 0;
251
252#if CV_SIMD128
253    for( ; i <= len - 8; i += 8 )
254    {
255        v_float32x4 t0 = v_load(src + i), t1 = v_load(src + i + 4);
256        t0 = v_invsqrt(t0);
257        t1 = v_invsqrt(t1);
258        v_store(dst + i, t0); v_store(dst + i + 4, t1);
259    }
260#endif
261
262    for( ; i < len; i++ )
263        dst[i] = 1/std::sqrt(src[i]);
264}
265
266
267void invSqrt(const double* src, double* dst, int len)
268{
269    int i = 0;
270
271#if CV_SSE2
272    __m128d v_1 = _mm_set1_pd(1.0);
273    for ( ; i <= len - 2; i += 2)
274        _mm_storeu_pd(dst + i, _mm_div_pd(v_1, _mm_sqrt_pd(_mm_loadu_pd(src + i))));
275#endif
276
277    for( ; i < len; i++ )
278        dst[i] = 1/std::sqrt(src[i]);
279}
280
281
282void sqrt(const float* src, float* dst, int len)
283{
284#if defined(HAVE_IPP)
285    CV_IPP_CHECK()
286    {
287        if (ippsSqrt_32f_A21(src, dst, len) >= 0)
288        {
289            CV_IMPL_ADD(CV_IMPL_IPP);
290            return;
291        }
292        setIppErrorStatus();
293    }
294#endif
295
296    int i = 0;
297
298#if CV_SIMD128
299    for( ; i <= len - 8; i += 8 )
300    {
301        v_float32x4 t0 = v_load(src + i), t1 = v_load(src + i + 4);
302        t0 = v_sqrt(t0);
303        t1 = v_sqrt(t1);
304        v_store(dst + i, t0); v_store(dst + i + 4, t1);
305    }
306#endif
307
308    for( ; i < len; i++ )
309        dst[i] = std::sqrt(src[i]);
310}
311
312
313void sqrt(const double* src, double* dst, int len)
314{
315#if defined(HAVE_IPP)
316    CV_IPP_CHECK()
317    {
318        if (ippsSqrt_64f_A50(src, dst, len) >= 0)
319        {
320            CV_IMPL_ADD(CV_IMPL_IPP);
321            return;
322        }
323        setIppErrorStatus();
324    }
325#endif
326
327    int i = 0;
328
329#if CV_SIMD128_64F
330    for( ; i <= len - 4; i += 4 )
331    {
332        v_float64x2 t0 = v_load(src + i), t1 = v_load(src + i + 2);
333        t0 = v_sqrt(t0);
334        t1 = v_sqrt(t1);
335        v_store(dst + i, t0); v_store(dst + i + 2, t1);
336    }
337#endif
338
339    for( ; i < len; i++ )
340        dst[i] = std::sqrt(src[i]);
341}
342
343////////////////////////////////////// EXP /////////////////////////////////////
344
345typedef union
346{
347    struct {
348#if ( defined( WORDS_BIGENDIAN ) && !defined( OPENCV_UNIVERSAL_BUILD ) ) || defined( __BIG_ENDIAN__ )
349        int hi;
350        int lo;
351#else
352        int lo;
353        int hi;
354#endif
355    } i;
356    double d;
357}
358DBLINT;
359
360#define EXPTAB_SCALE 6
361#define EXPTAB_MASK  ((1 << EXPTAB_SCALE) - 1)
362
363#define EXPPOLY_32F_A0 .9670371139572337719125840413672004409288e-2
364
365static const double expTab[] = {
366    1.0 * EXPPOLY_32F_A0,
367    1.0108892860517004600204097905619 * EXPPOLY_32F_A0,
368    1.0218971486541166782344801347833 * EXPPOLY_32F_A0,
369    1.0330248790212284225001082839705 * EXPPOLY_32F_A0,
370    1.0442737824274138403219664787399 * EXPPOLY_32F_A0,
371    1.0556451783605571588083413251529 * EXPPOLY_32F_A0,
372    1.0671404006768236181695211209928 * EXPPOLY_32F_A0,
373    1.0787607977571197937406800374385 * EXPPOLY_32F_A0,
374    1.0905077326652576592070106557607 * EXPPOLY_32F_A0,
375    1.1023825833078409435564142094256 * EXPPOLY_32F_A0,
376    1.1143867425958925363088129569196 * EXPPOLY_32F_A0,
377    1.126521618608241899794798643787 * EXPPOLY_32F_A0,
378    1.1387886347566916537038302838415 * EXPPOLY_32F_A0,
379    1.151189229952982705817759635202 * EXPPOLY_32F_A0,
380    1.1637248587775775138135735990922 * EXPPOLY_32F_A0,
381    1.1763969916502812762846457284838 * EXPPOLY_32F_A0,
382    1.1892071150027210667174999705605 * EXPPOLY_32F_A0,
383    1.2021567314527031420963969574978 * EXPPOLY_32F_A0,
384    1.2152473599804688781165202513388 * EXPPOLY_32F_A0,
385    1.2284805361068700056940089577928 * EXPPOLY_32F_A0,
386    1.2418578120734840485936774687266 * EXPPOLY_32F_A0,
387    1.2553807570246910895793906574423 * EXPPOLY_32F_A0,
388    1.2690509571917332225544190810323 * EXPPOLY_32F_A0,
389    1.2828700160787782807266697810215 * EXPPOLY_32F_A0,
390    1.2968395546510096659337541177925 * EXPPOLY_32F_A0,
391    1.3109612115247643419229917863308 * EXPPOLY_32F_A0,
392    1.3252366431597412946295370954987 * EXPPOLY_32F_A0,
393    1.3396675240533030053600306697244 * EXPPOLY_32F_A0,
394    1.3542555469368927282980147401407 * EXPPOLY_32F_A0,
395    1.3690024229745906119296011329822 * EXPPOLY_32F_A0,
396    1.3839098819638319548726595272652 * EXPPOLY_32F_A0,
397    1.3989796725383111402095281367152 * EXPPOLY_32F_A0,
398    1.4142135623730950488016887242097 * EXPPOLY_32F_A0,
399    1.4296133383919700112350657782751 * EXPPOLY_32F_A0,
400    1.4451808069770466200370062414717 * EXPPOLY_32F_A0,
401    1.4609177941806469886513028903106 * EXPPOLY_32F_A0,
402    1.476826145939499311386907480374 * EXPPOLY_32F_A0,
403    1.4929077282912648492006435314867 * EXPPOLY_32F_A0,
404    1.5091644275934227397660195510332 * EXPPOLY_32F_A0,
405    1.5255981507445383068512536895169 * EXPPOLY_32F_A0,
406    1.5422108254079408236122918620907 * EXPPOLY_32F_A0,
407    1.5590044002378369670337280894749 * EXPPOLY_32F_A0,
408    1.5759808451078864864552701601819 * EXPPOLY_32F_A0,
409    1.5931421513422668979372486431191 * EXPPOLY_32F_A0,
410    1.6104903319492543081795206673574 * EXPPOLY_32F_A0,
411    1.628027421857347766848218522014 * EXPPOLY_32F_A0,
412    1.6457554781539648445187567247258 * EXPPOLY_32F_A0,
413    1.6636765803267364350463364569764 * EXPPOLY_32F_A0,
414    1.6817928305074290860622509524664 * EXPPOLY_32F_A0,
415    1.7001063537185234695013625734975 * EXPPOLY_32F_A0,
416    1.7186192981224779156293443764563 * EXPPOLY_32F_A0,
417    1.7373338352737062489942020818722 * EXPPOLY_32F_A0,
418    1.7562521603732994831121606193753 * EXPPOLY_32F_A0,
419    1.7753764925265212525505592001993 * EXPPOLY_32F_A0,
420    1.7947090750031071864277032421278 * EXPPOLY_32F_A0,
421    1.8142521755003987562498346003623 * EXPPOLY_32F_A0,
422    1.8340080864093424634870831895883 * EXPPOLY_32F_A0,
423    1.8539791250833855683924530703377 * EXPPOLY_32F_A0,
424    1.8741676341102999013299989499544 * EXPPOLY_32F_A0,
425    1.8945759815869656413402186534269 * EXPPOLY_32F_A0,
426    1.9152065613971472938726112702958 * EXPPOLY_32F_A0,
427    1.9360617934922944505980559045667 * EXPPOLY_32F_A0,
428    1.9571441241754002690183222516269 * EXPPOLY_32F_A0,
429    1.9784560263879509682582499181312 * EXPPOLY_32F_A0,
430};
431
432
433// the code below uses _mm_cast* intrinsics, which are not avialable on VS2005
434#if (defined _MSC_VER && _MSC_VER < 1500) || \
435(!defined __APPLE__ && defined __GNUC__ && __GNUC__*100 + __GNUC_MINOR__ < 402)
436#undef CV_SSE2
437#define CV_SSE2 0
438#endif
439
440static const double exp_prescale = 1.4426950408889634073599246810019 * (1 << EXPTAB_SCALE);
441static const double exp_postscale = 1./(1 << EXPTAB_SCALE);
442static const double exp_max_val = 3000.*(1 << EXPTAB_SCALE); // log10(DBL_MAX) < 3000
443
444void exp( const float *_x, float *y, int n )
445{
446    static const float
447    A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0),
448    A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0),
449    A2 = (float)(.2402265109513301490103372422686535526573 / EXPPOLY_32F_A0),
450    A1 = (float)(.5550339366753125211915322047004666939128e-1 / EXPPOLY_32F_A0);
451
452#undef EXPPOLY
453#define EXPPOLY(x)  \
454(((((x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)
455
456    int i = 0;
457    const Cv32suf* x = (const Cv32suf*)_x;
458    Cv32suf buf[4];
459
460#if CV_SSE2
461    if( n >= 8 )
462    {
463        static const __m128d prescale2 = _mm_set1_pd(exp_prescale);
464        static const __m128 postscale4 = _mm_set1_ps((float)exp_postscale);
465        static const __m128 maxval4 = _mm_set1_ps((float)(exp_max_val/exp_prescale));
466        static const __m128 minval4 = _mm_set1_ps((float)(-exp_max_val/exp_prescale));
467
468        static const __m128 mA1 = _mm_set1_ps(A1);
469        static const __m128 mA2 = _mm_set1_ps(A2);
470        static const __m128 mA3 = _mm_set1_ps(A3);
471        static const __m128 mA4 = _mm_set1_ps(A4);
472        bool y_aligned = (size_t)(void*)y % 16 == 0;
473
474        ushort CV_DECL_ALIGNED(16) tab_idx[8];
475
476        for( ; i <= n - 8; i += 8 )
477        {
478            __m128 xf0, xf1;
479            xf0 = _mm_loadu_ps(&x[i].f);
480            xf1 = _mm_loadu_ps(&x[i+4].f);
481            __m128i xi0, xi1, xi2, xi3;
482
483            xf0 = _mm_min_ps(_mm_max_ps(xf0, minval4), maxval4);
484            xf1 = _mm_min_ps(_mm_max_ps(xf1, minval4), maxval4);
485
486            __m128d xd0 = _mm_cvtps_pd(xf0);
487            __m128d xd2 = _mm_cvtps_pd(_mm_movehl_ps(xf0, xf0));
488            __m128d xd1 = _mm_cvtps_pd(xf1);
489            __m128d xd3 = _mm_cvtps_pd(_mm_movehl_ps(xf1, xf1));
490
491            xd0 = _mm_mul_pd(xd0, prescale2);
492            xd2 = _mm_mul_pd(xd2, prescale2);
493            xd1 = _mm_mul_pd(xd1, prescale2);
494            xd3 = _mm_mul_pd(xd3, prescale2);
495
496            xi0 = _mm_cvtpd_epi32(xd0);
497            xi2 = _mm_cvtpd_epi32(xd2);
498
499            xi1 = _mm_cvtpd_epi32(xd1);
500            xi3 = _mm_cvtpd_epi32(xd3);
501
502            xd0 = _mm_sub_pd(xd0, _mm_cvtepi32_pd(xi0));
503            xd2 = _mm_sub_pd(xd2, _mm_cvtepi32_pd(xi2));
504            xd1 = _mm_sub_pd(xd1, _mm_cvtepi32_pd(xi1));
505            xd3 = _mm_sub_pd(xd3, _mm_cvtepi32_pd(xi3));
506
507            xf0 = _mm_movelh_ps(_mm_cvtpd_ps(xd0), _mm_cvtpd_ps(xd2));
508            xf1 = _mm_movelh_ps(_mm_cvtpd_ps(xd1), _mm_cvtpd_ps(xd3));
509
510            xf0 = _mm_mul_ps(xf0, postscale4);
511            xf1 = _mm_mul_ps(xf1, postscale4);
512
513            xi0 = _mm_unpacklo_epi64(xi0, xi2);
514            xi1 = _mm_unpacklo_epi64(xi1, xi3);
515            xi0 = _mm_packs_epi32(xi0, xi1);
516
517            _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi16(EXPTAB_MASK)));
518
519            xi0 = _mm_add_epi16(_mm_srai_epi16(xi0, EXPTAB_SCALE), _mm_set1_epi16(127));
520            xi0 = _mm_max_epi16(xi0, _mm_setzero_si128());
521            xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(255));
522            xi1 = _mm_unpackhi_epi16(xi0, _mm_setzero_si128());
523            xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128());
524
525            __m128d yd0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1]));
526            __m128d yd1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3]));
527            __m128d yd2 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[4]), _mm_load_sd(expTab + tab_idx[5]));
528            __m128d yd3 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[6]), _mm_load_sd(expTab + tab_idx[7]));
529
530            __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1));
531            __m128 yf1 = _mm_movelh_ps(_mm_cvtpd_ps(yd2), _mm_cvtpd_ps(yd3));
532
533            yf0 = _mm_mul_ps(yf0, _mm_castsi128_ps(_mm_slli_epi32(xi0, 23)));
534            yf1 = _mm_mul_ps(yf1, _mm_castsi128_ps(_mm_slli_epi32(xi1, 23)));
535
536            __m128 zf0 = _mm_add_ps(xf0, mA1);
537            __m128 zf1 = _mm_add_ps(xf1, mA1);
538
539            zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA2);
540            zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA2);
541
542            zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA3);
543            zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA3);
544
545            zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA4);
546            zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA4);
547
548            zf0 = _mm_mul_ps(zf0, yf0);
549            zf1 = _mm_mul_ps(zf1, yf1);
550
551            if( y_aligned )
552            {
553                _mm_store_ps(y + i, zf0);
554                _mm_store_ps(y + i + 4, zf1);
555            }
556            else
557            {
558                _mm_storeu_ps(y + i, zf0);
559                _mm_storeu_ps(y + i + 4, zf1);
560            }
561        }
562    }
563    else
564#endif
565        for( ; i <= n - 4; i += 4 )
566        {
567            double x0 = x[i].f * exp_prescale;
568            double x1 = x[i + 1].f * exp_prescale;
569            double x2 = x[i + 2].f * exp_prescale;
570            double x3 = x[i + 3].f * exp_prescale;
571            int val0, val1, val2, val3, t;
572
573            if( ((x[i].i >> 23) & 255) > 127 + 10 )
574                x0 = x[i].i < 0 ? -exp_max_val : exp_max_val;
575
576            if( ((x[i+1].i >> 23) & 255) > 127 + 10 )
577                x1 = x[i+1].i < 0 ? -exp_max_val : exp_max_val;
578
579            if( ((x[i+2].i >> 23) & 255) > 127 + 10 )
580                x2 = x[i+2].i < 0 ? -exp_max_val : exp_max_val;
581
582            if( ((x[i+3].i >> 23) & 255) > 127 + 10 )
583                x3 = x[i+3].i < 0 ? -exp_max_val : exp_max_val;
584
585            val0 = cvRound(x0);
586            val1 = cvRound(x1);
587            val2 = cvRound(x2);
588            val3 = cvRound(x3);
589
590            x0 = (x0 - val0)*exp_postscale;
591            x1 = (x1 - val1)*exp_postscale;
592            x2 = (x2 - val2)*exp_postscale;
593            x3 = (x3 - val3)*exp_postscale;
594
595            t = (val0 >> EXPTAB_SCALE) + 127;
596            t = !(t & ~255) ? t : t < 0 ? 0 : 255;
597            buf[0].i = t << 23;
598
599            t = (val1 >> EXPTAB_SCALE) + 127;
600            t = !(t & ~255) ? t : t < 0 ? 0 : 255;
601            buf[1].i = t << 23;
602
603            t = (val2 >> EXPTAB_SCALE) + 127;
604            t = !(t & ~255) ? t : t < 0 ? 0 : 255;
605            buf[2].i = t << 23;
606
607            t = (val3 >> EXPTAB_SCALE) + 127;
608            t = !(t & ~255) ? t : t < 0 ? 0 : 255;
609            buf[3].i = t << 23;
610
611            x0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
612            x1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 );
613
614            y[i] = (float)x0;
615            y[i + 1] = (float)x1;
616
617            x2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 );
618            x3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 );
619
620            y[i + 2] = (float)x2;
621            y[i + 3] = (float)x3;
622        }
623
624    for( ; i < n; i++ )
625    {
626        double x0 = x[i].f * exp_prescale;
627        int val0, t;
628
629        if( ((x[i].i >> 23) & 255) > 127 + 10 )
630            x0 = x[i].i < 0 ? -exp_max_val : exp_max_val;
631
632        val0 = cvRound(x0);
633        t = (val0 >> EXPTAB_SCALE) + 127;
634        t = !(t & ~255) ? t : t < 0 ? 0 : 255;
635
636        buf[0].i = t << 23;
637        x0 = (x0 - val0)*exp_postscale;
638
639        y[i] = (float)(buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY(x0));
640    }
641}
642
643void exp( const double *_x, double *y, int n )
644{
645    static const double
646    A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0,
647    A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0,
648    A3 = .24022650695886477918181338054308 / EXPPOLY_32F_A0,
649    A2 = .55504108793649567998466049042729e-1 / EXPPOLY_32F_A0,
650    A1 = .96180973140732918010002372686186e-2 / EXPPOLY_32F_A0,
651    A0 = .13369713757180123244806654839424e-2 / EXPPOLY_32F_A0;
652
653#undef EXPPOLY
654#define EXPPOLY(x)  (((((A0*(x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)*(x) + A5)
655
656    int i = 0;
657    Cv64suf buf[4];
658    const Cv64suf* x = (const Cv64suf*)_x;
659
660#if CV_SSE2
661    static const __m128d prescale2 = _mm_set1_pd(exp_prescale);
662    static const __m128d postscale2 = _mm_set1_pd(exp_postscale);
663    static const __m128d maxval2 = _mm_set1_pd(exp_max_val);
664    static const __m128d minval2 = _mm_set1_pd(-exp_max_val);
665
666    static const __m128d mA0 = _mm_set1_pd(A0);
667    static const __m128d mA1 = _mm_set1_pd(A1);
668    static const __m128d mA2 = _mm_set1_pd(A2);
669    static const __m128d mA3 = _mm_set1_pd(A3);
670    static const __m128d mA4 = _mm_set1_pd(A4);
671    static const __m128d mA5 = _mm_set1_pd(A5);
672
673    int CV_DECL_ALIGNED(16) tab_idx[4];
674
675    for( ; i <= n - 4; i += 4 )
676    {
677        __m128d xf0 = _mm_loadu_pd(&x[i].f), xf1 = _mm_loadu_pd(&x[i+2].f);
678        __m128i xi0, xi1;
679        xf0 = _mm_min_pd(_mm_max_pd(xf0, minval2), maxval2);
680        xf1 = _mm_min_pd(_mm_max_pd(xf1, minval2), maxval2);
681        xf0 = _mm_mul_pd(xf0, prescale2);
682        xf1 = _mm_mul_pd(xf1, prescale2);
683
684        xi0 = _mm_cvtpd_epi32(xf0);
685        xi1 = _mm_cvtpd_epi32(xf1);
686        xf0 = _mm_mul_pd(_mm_sub_pd(xf0, _mm_cvtepi32_pd(xi0)), postscale2);
687        xf1 = _mm_mul_pd(_mm_sub_pd(xf1, _mm_cvtepi32_pd(xi1)), postscale2);
688
689        xi0 = _mm_unpacklo_epi64(xi0, xi1);
690        _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi32(EXPTAB_MASK)));
691
692        xi0 = _mm_add_epi32(_mm_srai_epi32(xi0, EXPTAB_SCALE), _mm_set1_epi32(1023));
693        xi0 = _mm_packs_epi32(xi0, xi0);
694        xi0 = _mm_max_epi16(xi0, _mm_setzero_si128());
695        xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(2047));
696        xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128());
697        xi1 = _mm_unpackhi_epi32(xi0, _mm_setzero_si128());
698        xi0 = _mm_unpacklo_epi32(xi0, _mm_setzero_si128());
699
700        __m128d yf0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1]));
701        __m128d yf1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3]));
702        yf0 = _mm_mul_pd(yf0, _mm_castsi128_pd(_mm_slli_epi64(xi0, 52)));
703        yf1 = _mm_mul_pd(yf1, _mm_castsi128_pd(_mm_slli_epi64(xi1, 52)));
704
705        __m128d zf0 = _mm_add_pd(_mm_mul_pd(mA0, xf0), mA1);
706        __m128d zf1 = _mm_add_pd(_mm_mul_pd(mA0, xf1), mA1);
707
708        zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA2);
709        zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA2);
710
711        zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA3);
712        zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA3);
713
714        zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA4);
715        zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA4);
716
717        zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA5);
718        zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA5);
719
720        zf0 = _mm_mul_pd(zf0, yf0);
721        zf1 = _mm_mul_pd(zf1, yf1);
722
723        _mm_storeu_pd(y + i, zf0);
724        _mm_storeu_pd(y + i + 2, zf1);
725    }
726#endif
727    for( ; i <= n - 4; i += 4 )
728    {
729        double x0 = x[i].f * exp_prescale;
730        double x1 = x[i + 1].f * exp_prescale;
731        double x2 = x[i + 2].f * exp_prescale;
732        double x3 = x[i + 3].f * exp_prescale;
733
734        double y0, y1, y2, y3;
735        int val0, val1, val2, val3, t;
736
737        t = (int)(x[i].i >> 52);
738        if( (t & 2047) > 1023 + 10 )
739            x0 = t < 0 ? -exp_max_val : exp_max_val;
740
741        t = (int)(x[i+1].i >> 52);
742        if( (t & 2047) > 1023 + 10 )
743            x1 = t < 0 ? -exp_max_val : exp_max_val;
744
745        t = (int)(x[i+2].i >> 52);
746        if( (t & 2047) > 1023 + 10 )
747            x2 = t < 0 ? -exp_max_val : exp_max_val;
748
749        t = (int)(x[i+3].i >> 52);
750        if( (t & 2047) > 1023 + 10 )
751            x3 = t < 0 ? -exp_max_val : exp_max_val;
752
753        val0 = cvRound(x0);
754        val1 = cvRound(x1);
755        val2 = cvRound(x2);
756        val3 = cvRound(x3);
757
758        x0 = (x0 - val0)*exp_postscale;
759        x1 = (x1 - val1)*exp_postscale;
760        x2 = (x2 - val2)*exp_postscale;
761        x3 = (x3 - val3)*exp_postscale;
762
763        t = (val0 >> EXPTAB_SCALE) + 1023;
764        t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
765        buf[0].i = (int64)t << 52;
766
767        t = (val1 >> EXPTAB_SCALE) + 1023;
768        t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
769        buf[1].i = (int64)t << 52;
770
771        t = (val2 >> EXPTAB_SCALE) + 1023;
772        t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
773        buf[2].i = (int64)t << 52;
774
775        t = (val3 >> EXPTAB_SCALE) + 1023;
776        t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
777        buf[3].i = (int64)t << 52;
778
779        y0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
780        y1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 );
781
782        y[i] = y0;
783        y[i + 1] = y1;
784
785        y2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 );
786        y3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 );
787
788        y[i + 2] = y2;
789        y[i + 3] = y3;
790    }
791
792    for( ; i < n; i++ )
793    {
794        double x0 = x[i].f * exp_prescale;
795        int val0, t;
796
797        t = (int)(x[i].i >> 52);
798        if( (t & 2047) > 1023 + 10 )
799            x0 = t < 0 ? -exp_max_val : exp_max_val;
800
801        val0 = cvRound(x0);
802        t = (val0 >> EXPTAB_SCALE) + 1023;
803        t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
804
805        buf[0].i = (int64)t << 52;
806        x0 = (x0 - val0)*exp_postscale;
807
808        y[i] = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
809    }
810}
811
812#undef EXPTAB_SCALE
813#undef EXPTAB_MASK
814#undef EXPPOLY_32F_A0
815
816/////////////////////////////////////////// LOG ///////////////////////////////////////
817
818#define LOGTAB_SCALE    8
819#define LOGTAB_MASK         ((1 << LOGTAB_SCALE) - 1)
820#define LOGTAB_MASK2        ((1 << (20 - LOGTAB_SCALE)) - 1)
821#define LOGTAB_MASK2_32F    ((1 << (23 - LOGTAB_SCALE)) - 1)
822
823static const double CV_DECL_ALIGNED(16) icvLogTab[] = {
824    0.0000000000000000000000000000000000000000,    1.000000000000000000000000000000000000000,
825    .00389864041565732288852075271279318258166,    .9961089494163424124513618677042801556420,
826    .00778214044205494809292034119607706088573,    .9922480620155038759689922480620155038760,
827    .01165061721997527263705585198749759001657,    .9884169884169884169884169884169884169884,
828    .01550418653596525274396267235488267033361,    .9846153846153846153846153846153846153846,
829    .01934296284313093139406447562578250654042,    .9808429118773946360153256704980842911877,
830    .02316705928153437593630670221500622574241,    .9770992366412213740458015267175572519084,
831    .02697658769820207233514075539915211265906,    .9733840304182509505703422053231939163498,
832    .03077165866675368732785500469617545604706,    .9696969696969696969696969696969696969697,
833    .03455238150665972812758397481047722976656,    .9660377358490566037735849056603773584906,
834    .03831886430213659461285757856785494368522,    .9624060150375939849624060150375939849624,
835    .04207121392068705056921373852674150839447,    .9588014981273408239700374531835205992509,
836    .04580953603129420126371940114040626212953,    .9552238805970149253731343283582089552239,
837    .04953393512227662748292900118940451648088,    .9516728624535315985130111524163568773234,
838    .05324451451881227759255210685296333394944,    .9481481481481481481481481481481481481481,
839    .05694137640013842427411105973078520037234,    .9446494464944649446494464944649446494465,
840    .06062462181643483993820353816772694699466,    .9411764705882352941176470588235294117647,
841    .06429435070539725460836422143984236754475,    .9377289377289377289377289377289377289377,
842    .06795066190850773679699159401934593915938,    .9343065693430656934306569343065693430657,
843    .07159365318700880442825962290953611955044,    .9309090909090909090909090909090909090909,
844    .07522342123758751775142172846244648098944,    .9275362318840579710144927536231884057971,
845    .07884006170777602129362549021607264876369,    .9241877256317689530685920577617328519856,
846    .08244366921107458556772229485432035289706,    .9208633093525179856115107913669064748201,
847    .08603433734180314373940490213499288074675,    .9175627240143369175627240143369175627240,
848    .08961215868968712416897659522874164395031,    .9142857142857142857142857142857142857143,
849    .09317722485418328259854092721070628613231,    .9110320284697508896797153024911032028470,
850    .09672962645855109897752299730200320482256,    .9078014184397163120567375886524822695035,
851    .10026945316367513738597949668474029749630,    .9045936395759717314487632508833922261484,
852    .10379679368164355934833764649738441221420,    .9014084507042253521126760563380281690141,
853    .10731173578908805021914218968959175981580,    .8982456140350877192982456140350877192982,
854    .11081436634029011301105782649756292812530,    .8951048951048951048951048951048951048951,
855    .11430477128005862852422325204315711744130,    .8919860627177700348432055749128919860627,
856    .11778303565638344185817487641543266363440,    .8888888888888888888888888888888888888889,
857    .12124924363286967987640707633545389398930,    .8858131487889273356401384083044982698962,
858    .12470347850095722663787967121606925502420,    .8827586206896551724137931034482758620690,
859    .12814582269193003360996385708858724683530,    .8797250859106529209621993127147766323024,
860    .13157635778871926146571524895989568904040,    .8767123287671232876712328767123287671233,
861    .13499516453750481925766280255629681050780,    .8737201365187713310580204778156996587031,
862    .13840232285911913123754857224412262439730,    .8707482993197278911564625850340136054422,
863    .14179791186025733629172407290752744302150,    .8677966101694915254237288135593220338983,
864    .14518200984449788903951628071808954700830,    .8648648648648648648648648648648648648649,
865    .14855469432313711530824207329715136438610,    .8619528619528619528619528619528619528620,
866    .15191604202584196858794030049466527998450,    .8590604026845637583892617449664429530201,
867    .15526612891112392955683674244937719777230,    .8561872909698996655518394648829431438127,
868    .15860503017663857283636730244325008243330,    .8533333333333333333333333333333333333333,
869    .16193282026931324346641360989451641216880,    .8504983388704318936877076411960132890365,
870    .16524957289530714521497145597095368430010,    .8476821192052980132450331125827814569536,
871    .16855536102980664403538924034364754334090,    .8448844884488448844884488448844884488449,
872    .17185025692665920060697715143760433420540,    .8421052631578947368421052631578947368421,
873    .17513433212784912385018287750426679849630,    .8393442622950819672131147540983606557377,
874    .17840765747281828179637841458315961062910,    .8366013071895424836601307189542483660131,
875    .18167030310763465639212199675966985523700,    .8338762214983713355048859934853420195440,
876    .18492233849401198964024217730184318497780,    .8311688311688311688311688311688311688312,
877    .18816383241818296356839823602058459073300,    .8284789644012944983818770226537216828479,
878    .19139485299962943898322009772527962923050,    .8258064516129032258064516129032258064516,
879    .19461546769967164038916962454095482826240,    .8231511254019292604501607717041800643087,
880    .19782574332991986754137769821682013571260,    .8205128205128205128205128205128205128205,
881    .20102574606059073203390141770796617493040,    .8178913738019169329073482428115015974441,
882    .20421554142869088876999228432396193966280,    .8152866242038216560509554140127388535032,
883    .20739519434607056602715147164417430758480,    .8126984126984126984126984126984126984127,
884    .21056476910734961416338251183333341032260,    .8101265822784810126582278481012658227848,
885    .21372432939771812687723695489694364368910,    .8075709779179810725552050473186119873817,
886    .21687393830061435506806333251006435602900,    .8050314465408805031446540880503144654088,
887    .22001365830528207823135744547471404075630,    .8025078369905956112852664576802507836991,
888    .22314355131420973710199007200571941211830,    .8000000000000000000000000000000000000000,
889    .22626367865045338145790765338460914790630,    .7975077881619937694704049844236760124611,
890    .22937410106484582006380890106811420992010,    .7950310559006211180124223602484472049689,
891    .23247487874309405442296849741978803649550,    .7925696594427244582043343653250773993808,
892    .23556607131276688371634975283086532726890,    .7901234567901234567901234567901234567901,
893    .23864773785017498464178231643018079921600,    .7876923076923076923076923076923076923077,
894    .24171993688714515924331749374687206000090,    .7852760736196319018404907975460122699387,
895    .24478272641769091566565919038112042471760,    .7828746177370030581039755351681957186544,
896    .24783616390458124145723672882013488560910,    .7804878048780487804878048780487804878049,
897    .25088030628580937353433455427875742316250,    .7781155015197568389057750759878419452888,
898    .25391520998096339667426946107298135757450,    .7757575757575757575757575757575757575758,
899    .25694093089750041913887912414793390780680,    .7734138972809667673716012084592145015106,
900    .25995752443692604627401010475296061486000,    .7710843373493975903614457831325301204819,
901    .26296504550088134477547896494797896593800,    .7687687687687687687687687687687687687688,
902    .26596354849713793599974565040611196309330,    .7664670658682634730538922155688622754491,
903    .26895308734550393836570947314612567424780,    .7641791044776119402985074626865671641791,
904    .27193371548364175804834985683555714786050,    .7619047619047619047619047619047619047619,
905    .27490548587279922676529508862586226314300,    .7596439169139465875370919881305637982196,
906    .27786845100345625159121709657483734190480,    .7573964497041420118343195266272189349112,
907    .28082266290088775395616949026589281857030,    .7551622418879056047197640117994100294985,
908    .28376817313064456316240580235898960381750,    .7529411764705882352941176470588235294118,
909    .28670503280395426282112225635501090437180,    .7507331378299120234604105571847507331378,
910    .28963329258304265634293983566749375313530,    .7485380116959064327485380116959064327485,
911    .29255300268637740579436012922087684273730,    .7463556851311953352769679300291545189504,
912    .29546421289383584252163927885703742504130,    .7441860465116279069767441860465116279070,
913    .29836697255179722709783618483925238251680,    .7420289855072463768115942028985507246377,
914    .30126133057816173455023545102449133992200,    .7398843930635838150289017341040462427746,
915    .30414733546729666446850615102448500692850,    .7377521613832853025936599423631123919308,
916    .30702503529491181888388950937951449304830,    .7356321839080459770114942528735632183908,
917    .30989447772286465854207904158101882785550,    .7335243553008595988538681948424068767908,
918    .31275571000389684739317885942000430077330,    .7314285714285714285714285714285714285714,
919    .31560877898630329552176476681779604405180,    .7293447293447293447293447293447293447293,
920    .31845373111853458869546784626436419785030,    .7272727272727272727272727272727272727273,
921    .32129061245373424782201254856772720813750,    .7252124645892351274787535410764872521246,
922    .32411946865421192853773391107097268104550,    .7231638418079096045197740112994350282486,
923    .32694034499585328257253991068864706903700,    .7211267605633802816901408450704225352113,
924    .32975328637246797969240219572384376078850,    .7191011235955056179775280898876404494382,
925    .33255833730007655635318997155991382896900,    .7170868347338935574229691876750700280112,
926    .33535554192113781191153520921943709254280,    .7150837988826815642458100558659217877095,
927    .33814494400871636381467055798566434532400,    .7130919220055710306406685236768802228412,
928    .34092658697059319283795275623560883104800,    .7111111111111111111111111111111111111111,
929    .34370051385331840121395430287520866841080,    .7091412742382271468144044321329639889197,
930    .34646676734620857063262633346312213689100,    .7071823204419889502762430939226519337017,
931    .34922538978528827602332285096053965389730,    .7052341597796143250688705234159779614325,
932    .35197642315717814209818925519357435405250,    .7032967032967032967032967032967032967033,
933    .35471990910292899856770532096561510115850,    .7013698630136986301369863013698630136986,
934    .35745588892180374385176833129662554711100,    .6994535519125683060109289617486338797814,
935    .36018440357500774995358483465679455548530,    .6975476839237057220708446866485013623978,
936    .36290549368936841911903457003063522279280,    .6956521739130434782608695652173913043478,
937    .36561919956096466943762379742111079394830,    .6937669376693766937669376693766937669377,
938    .36832556115870762614150635272380895912650,    .6918918918918918918918918918918918918919,
939    .37102461812787262962487488948681857436900,    .6900269541778975741239892183288409703504,
940    .37371640979358405898480555151763837784530,    .6881720430107526881720430107526881720430,
941    .37640097516425302659470730759494472295050,    .6863270777479892761394101876675603217158,
942    .37907835293496944251145919224654790014030,    .6844919786096256684491978609625668449198,
943    .38174858149084833769393299007788300514230,    .6826666666666666666666666666666666666667,
944    .38441169891033200034513583887019194662580,    .6808510638297872340425531914893617021277,
945    .38706774296844825844488013899535872042180,    .6790450928381962864721485411140583554377,
946    .38971675114002518602873692543653305619950,    .6772486772486772486772486772486772486772,
947    .39235876060286384303665840889152605086580,    .6754617414248021108179419525065963060686,
948    .39499380824086893770896722344332374632350,    .6736842105263157894736842105263157894737,
949    .39762193064713846624158577469643205404280,    .6719160104986876640419947506561679790026,
950    .40024316412701266276741307592601515352730,    .6701570680628272251308900523560209424084,
951    .40285754470108348090917615991202183067800,    .6684073107049608355091383812010443864230,
952    .40546510810816432934799991016916465014230,    .6666666666666666666666666666666666666667,
953    .40806588980822172674223224930756259709600,    .6649350649350649350649350649350649350649,
954    .41065992498526837639616360320360399782650,    .6632124352331606217616580310880829015544,
955    .41324724855021932601317757871584035456180,    .6614987080103359173126614987080103359173,
956    .41582789514371093497757669865677598863850,    .6597938144329896907216494845360824742268,
957    .41840189913888381489925905043492093682300,    .6580976863753213367609254498714652956298,
958    .42096929464412963239894338585145305842150,    .6564102564102564102564102564102564102564,
959    .42353011550580327293502591601281892508280,    .6547314578005115089514066496163682864450,
960    .42608439531090003260516141381231136620050,    .6530612244897959183673469387755102040816,
961    .42863216738969872610098832410585600882780,    .6513994910941475826972010178117048346056,
962    .43117346481837132143866142541810404509300,    .6497461928934010152284263959390862944162,
963    .43370832042155937902094819946796633303180,    .6481012658227848101265822784810126582278,
964    .43623676677491801667585491486534010618930,    .6464646464646464646464646464646464646465,
965    .43875883620762790027214350629947148263450,    .6448362720403022670025188916876574307305,
966    .44127456080487520440058801796112675219780,    .6432160804020100502512562814070351758794,
967    .44378397241030093089975139264424797147500,    .6416040100250626566416040100250626566416,
968    .44628710262841947420398014401143882423650,    .6400000000000000000000000000000000000000,
969    .44878398282700665555822183705458883196130,    .6384039900249376558603491271820448877805,
970    .45127464413945855836729492693848442286250,    .6368159203980099502487562189054726368159,
971    .45375911746712049854579618113348260521900,    .6352357320099255583126550868486352357320,
972    .45623743348158757315857769754074979573500,    .6336633663366336633663366336633663366337,
973    .45870962262697662081833982483658473938700,    .6320987654320987654320987654320987654321,
974    .46117571512217014895185229761409573256980,    .6305418719211822660098522167487684729064,
975    .46363574096303250549055974261136725544930,    .6289926289926289926289926289926289926290,
976    .46608972992459918316399125615134835243230,    .6274509803921568627450980392156862745098,
977    .46853771156323925639597405279346276074650,    .6259168704156479217603911980440097799511,
978    .47097971521879100631480241645476780831830,    .6243902439024390243902439024390243902439,
979    .47341577001667212165614273544633761048330,    .6228710462287104622871046228710462287105,
980    .47584590486996386493601107758877333253630,    .6213592233009708737864077669902912621359,
981    .47827014848147025860569669930555392056700,    .6198547215496368038740920096852300242131,
982    .48068852934575190261057286988943815231330,    .6183574879227053140096618357487922705314,
983    .48310107575113581113157579238759353756900,    .6168674698795180722891566265060240963855,
984    .48550781578170076890899053978500887751580,    .6153846153846153846153846153846153846154,
985    .48790877731923892879351001283794175833480,    .6139088729016786570743405275779376498801,
986    .49030398804519381705802061333088204264650,    .6124401913875598086124401913875598086124,
987    .49269347544257524607047571407747454941280,    .6109785202863961813842482100238663484487,
988    .49507726679785146739476431321236304938800,    .6095238095238095238095238095238095238095,
989    .49745538920281889838648226032091770321130,    .6080760095011876484560570071258907363420,
990    .49982786955644931126130359189119189977650,    .6066350710900473933649289099526066350711,
991    .50219473456671548383667413872899487614650,    .6052009456264775413711583924349881796690,
992    .50455601075239520092452494282042607665050,    .6037735849056603773584905660377358490566,
993    .50691172444485432801997148999362252652650,    .6023529411764705882352941176470588235294,
994    .50926190178980790257412536448100581765150,    .6009389671361502347417840375586854460094,
995    .51160656874906207391973111953120678663250,    .5995316159250585480093676814988290398126,
996    .51394575110223428282552049495279788970950,    .5981308411214953271028037383177570093458,
997    .51627947444845445623684554448118433356300,    .5967365967365967365967365967365967365967,
998    .51860776420804555186805373523384332656850,    .5953488372093023255813953488372093023256,
999    .52093064562418522900344441950437612831600,    .5939675174013921113689095127610208816705,
1000    .52324814376454775732838697877014055848100,    .5925925925925925925925925925925925925926,
1001    .52556028352292727401362526507000438869000,    .5912240184757505773672055427251732101617,
1002    .52786708962084227803046587723656557500350,    .5898617511520737327188940092165898617512,
1003    .53016858660912158374145519701414741575700,    .5885057471264367816091954022988505747126,
1004    .53246479886947173376654518506256863474850,    .5871559633027522935779816513761467889908,
1005    .53475575061602764748158733709715306758900,    .5858123569794050343249427917620137299771,
1006    .53704146589688361856929077475797384977350,    .5844748858447488584474885844748858447489,
1007    .53932196859560876944783558428753167390800,    .5831435079726651480637813211845102505695,
1008    .54159728243274429804188230264117009937750,    .5818181818181818181818181818181818181818,
1009    .54386743096728351609669971367111429572100,    .5804988662131519274376417233560090702948,
1010    .54613243759813556721383065450936555862450,    .5791855203619909502262443438914027149321,
1011    .54839232556557315767520321969641372561450,    .5778781038374717832957110609480812641084,
1012    .55064711795266219063194057525834068655950,    .5765765765765765765765765765765765765766,
1013    .55289683768667763352766542084282264113450,    .5752808988764044943820224719101123595506,
1014    .55514150754050151093110798683483153581600,    .5739910313901345291479820627802690582960,
1015    .55738115013400635344709144192165695130850,    .5727069351230425055928411633109619686801,
1016    .55961578793542265941596269840374588966350,    .5714285714285714285714285714285714285714,
1017    .56184544326269181269140062795486301183700,    .5701559020044543429844097995545657015590,
1018    .56407013828480290218436721261241473257550,    .5688888888888888888888888888888888888889,
1019    .56628989502311577464155334382667206227800,    .5676274944567627494456762749445676274945,
1020    .56850473535266865532378233183408156037350,    .5663716814159292035398230088495575221239,
1021    .57071468100347144680739575051120482385150,    .5651214128035320088300220750551876379691,
1022    .57291975356178548306473885531886480748650,    .5638766519823788546255506607929515418502,
1023    .57511997447138785144460371157038025558000,    .5626373626373626373626373626373626373626,
1024    .57731536503482350219940144597785547375700,    .5614035087719298245614035087719298245614,
1025    .57950594641464214795689713355386629700650,    .5601750547045951859956236323851203501094,
1026    .58169173963462239562716149521293118596100,    .5589519650655021834061135371179039301310,
1027    .58387276558098266665552955601015128195300,    .5577342047930283224400871459694989106754,
1028    .58604904500357812846544902640744112432000,    .5565217391304347826086956521739130434783,
1029    .58822059851708596855957011939608491957200,    .5553145336225596529284164859002169197397,
1030    .59038744660217634674381770309992134571100,    .5541125541125541125541125541125541125541,
1031    .59254960960667157898740242671919986605650,    .5529157667386609071274298056155507559395,
1032    .59470710774669277576265358220553025603300,    .5517241379310344827586206896551724137931,
1033    .59685996110779382384237123915227130055450,    .5505376344086021505376344086021505376344,
1034    .59900818964608337768851242799428291618800,    .5493562231759656652360515021459227467811,
1035    .60115181318933474940990890900138765573500,    .5481798715203426124197002141327623126338,
1036    .60329085143808425240052883964381180703650,    .5470085470085470085470085470085470085470,
1037    .60542532396671688843525771517306566238400,    .5458422174840085287846481876332622601279,
1038    .60755525022454170969155029524699784815300,    .5446808510638297872340425531914893617021,
1039    .60968064953685519036241657886421307921400,    .5435244161358811040339702760084925690021,
1040    .61180154110599282990534675263916142284850,    .5423728813559322033898305084745762711864,
1041    .61391794401237043121710712512140162289150,    .5412262156448202959830866807610993657505,
1042    .61602987721551394351138242200249806046500,    .5400843881856540084388185654008438818565,
1043    .61813735955507864705538167982012964785100,    .5389473684210526315789473684210526315789,
1044    .62024040975185745772080281312810257077200,    .5378151260504201680672268907563025210084,
1045    .62233904640877868441606324267922900617100,    .5366876310272536687631027253668763102725,
1046    .62443328801189346144440150965237990021700,    .5355648535564853556485355648535564853556,
1047    .62652315293135274476554741340805776417250,    .5344467640918580375782881002087682672234,
1048    .62860865942237409420556559780379757285100,    .5333333333333333333333333333333333333333,
1049    .63068982562619868570408243613201193511500,    .5322245322245322245322245322245322245322,
1050    .63276666957103777644277897707070223987100,    .5311203319502074688796680497925311203320,
1051    .63483920917301017716738442686619237065300,    .5300207039337474120082815734989648033126,
1052    .63690746223706917739093569252872839570050,    .5289256198347107438016528925619834710744,
1053    .63897144645792069983514238629140891134750,    .5278350515463917525773195876288659793814,
1054    .64103117942093124081992527862894348800200,    .5267489711934156378600823045267489711934,
1055    .64308667860302726193566513757104985415950,    .5256673511293634496919917864476386036961,
1056    .64513796137358470073053240412264131009600,    .5245901639344262295081967213114754098361,
1057    .64718504499530948859131740391603671014300,    .5235173824130879345603271983640081799591,
1058    .64922794662510974195157587018911726772800,    .5224489795918367346938775510204081632653,
1059    .65126668331495807251485530287027359008800,    .5213849287169042769857433808553971486762,
1060    .65330127201274557080523663898929953575150,    .5203252032520325203252032520325203252033,
1061    .65533172956312757406749369692988693714150,    .5192697768762677484787018255578093306288,
1062    .65735807270835999727154330685152672231200,    .5182186234817813765182186234817813765182,
1063    .65938031808912778153342060249997302889800,    .5171717171717171717171717171717171717172,
1064    .66139848224536490484126716182800009846700,    .5161290322580645161290322580645161290323,
1065    .66341258161706617713093692145776003599150,    .5150905432595573440643863179074446680080,
1066    .66542263254509037562201001492212526500250,    .5140562248995983935742971887550200803213,
1067    .66742865127195616370414654738851822912700,    .5130260521042084168336673346693386773547,
1068    .66943065394262923906154583164607174694550,    .5120000000000000000000000000000000000000,
1069    .67142865660530226534774556057527661323550,    .5109780439121756487025948103792415169661,
1070    .67342267521216669923234121597488410770900,    .5099601593625498007968127490039840637450,
1071    .67541272562017662384192817626171745359900,    .5089463220675944333996023856858846918489,
1072    .67739882359180603188519853574689477682100,    .5079365079365079365079365079365079365079,
1073    .67938098479579733801614338517538271844400,    .5069306930693069306930693069306930693069,
1074    .68135922480790300781450241629499942064300,    .5059288537549407114624505928853754940711,
1075    .68333355911162063645036823800182901322850,    .5049309664694280078895463510848126232742,
1076    .68530400309891936760919861626462079584600,    .5039370078740157480314960629921259842520,
1077    .68727057207096020619019327568821609020250,    .5029469548133595284872298624754420432220,
1078    .68923328123880889251040571252815425395950,    .5019607843137254901960784313725490196078,
1079    .69314718055994530941723212145818, 5.0e-01,
1080};
1081
1082
1083
1084#define LOGTAB_TRANSLATE(x,h) (((x) - 1.)*icvLogTab[(h)+1])
1085static const double ln_2 = 0.69314718055994530941723212145818;
1086
1087void log( const float *_x, float *y, int n )
1088{
1089    static const float shift[] = { 0, -1.f/512 };
1090    static const float
1091    A0 = 0.3333333333333333333333333f,
1092    A1 = -0.5f,
1093    A2 = 1.f;
1094
1095#undef LOGPOLY
1096#define LOGPOLY(x) (((A0*(x) + A1)*(x) + A2)*(x))
1097
1098    int i = 0;
1099    Cv32suf buf[4];
1100    const int* x = (const int*)_x;
1101
1102#if CV_SSE2
1103    static const __m128d ln2_2 = _mm_set1_pd(ln_2);
1104    static const __m128 _1_4 = _mm_set1_ps(1.f);
1105    static const __m128 shift4 = _mm_set1_ps(-1.f/512);
1106
1107    static const __m128 mA0 = _mm_set1_ps(A0);
1108    static const __m128 mA1 = _mm_set1_ps(A1);
1109    static const __m128 mA2 = _mm_set1_ps(A2);
1110
1111    int CV_DECL_ALIGNED(16) idx[4];
1112
1113    for( ; i <= n - 4; i += 4 )
1114    {
1115        __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i));
1116        __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 23), _mm_set1_epi32(255)), _mm_set1_epi32(127));
1117        __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2);
1118        __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0,yi0)), ln2_2);
1119
1120        __m128i xi0 = _mm_or_si128(_mm_and_si128(h0, _mm_set1_epi32(LOGTAB_MASK2_32F)), _mm_set1_epi32(127 << 23));
1121
1122        h0 = _mm_and_si128(_mm_srli_epi32(h0, 23 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK*2));
1123        _mm_store_si128((__m128i*)idx, h0);
1124        h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510));
1125
1126        __m128d t0, t1, t2, t3, t4;
1127        t0 = _mm_load_pd(icvLogTab + idx[0]);
1128        t2 = _mm_load_pd(icvLogTab + idx[1]);
1129        t1 = _mm_unpackhi_pd(t0, t2);
1130        t0 = _mm_unpacklo_pd(t0, t2);
1131        t2 = _mm_load_pd(icvLogTab + idx[2]);
1132        t4 = _mm_load_pd(icvLogTab + idx[3]);
1133        t3 = _mm_unpackhi_pd(t2, t4);
1134        t2 = _mm_unpacklo_pd(t2, t4);
1135
1136        yd0 = _mm_add_pd(yd0, t0);
1137        yd1 = _mm_add_pd(yd1, t2);
1138
1139        __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1));
1140
1141        __m128 xf0 = _mm_sub_ps(_mm_castsi128_ps(xi0), _1_4);
1142        xf0 = _mm_mul_ps(xf0, _mm_movelh_ps(_mm_cvtpd_ps(t1), _mm_cvtpd_ps(t3)));
1143        xf0 = _mm_add_ps(xf0, _mm_and_ps(_mm_castsi128_ps(h0), shift4));
1144
1145        __m128 zf0 = _mm_mul_ps(xf0, mA0);
1146        zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA1), xf0);
1147        zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA2), xf0);
1148        yf0 = _mm_add_ps(yf0, zf0);
1149
1150        _mm_storeu_ps(y + i, yf0);
1151    }
1152#endif
1153    for( ; i <= n - 4; i += 4 )
1154    {
1155        double x0, x1, x2, x3;
1156        double y0, y1, y2, y3;
1157        int h0, h1, h2, h3;
1158
1159        h0 = x[i];
1160        h1 = x[i+1];
1161        buf[0].i = (h0 & LOGTAB_MASK2_32F) | (127 << 23);
1162        buf[1].i = (h1 & LOGTAB_MASK2_32F) | (127 << 23);
1163
1164        y0 = (((h0 >> 23) & 0xff) - 127) * ln_2;
1165        y1 = (((h1 >> 23) & 0xff) - 127) * ln_2;
1166
1167        h0 = (h0 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
1168        h1 = (h1 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
1169
1170        y0 += icvLogTab[h0];
1171        y1 += icvLogTab[h1];
1172
1173        h2 = x[i+2];
1174        h3 = x[i+3];
1175
1176        x0 = LOGTAB_TRANSLATE( buf[0].f, h0 );
1177        x1 = LOGTAB_TRANSLATE( buf[1].f, h1 );
1178
1179        buf[2].i = (h2 & LOGTAB_MASK2_32F) | (127 << 23);
1180        buf[3].i = (h3 & LOGTAB_MASK2_32F) | (127 << 23);
1181
1182        y2 = (((h2 >> 23) & 0xff) - 127) * ln_2;
1183        y3 = (((h3 >> 23) & 0xff) - 127) * ln_2;
1184
1185        h2 = (h2 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
1186        h3 = (h3 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
1187
1188        y2 += icvLogTab[h2];
1189        y3 += icvLogTab[h3];
1190
1191        x2 = LOGTAB_TRANSLATE( buf[2].f, h2 );
1192        x3 = LOGTAB_TRANSLATE( buf[3].f, h3 );
1193
1194        x0 += shift[h0 == 510];
1195        x1 += shift[h1 == 510];
1196        y0 += LOGPOLY( x0 );
1197        y1 += LOGPOLY( x1 );
1198
1199        y[i] = (float) y0;
1200        y[i + 1] = (float) y1;
1201
1202        x2 += shift[h2 == 510];
1203        x3 += shift[h3 == 510];
1204        y2 += LOGPOLY( x2 );
1205        y3 += LOGPOLY( x3 );
1206
1207        y[i + 2] = (float) y2;
1208        y[i + 3] = (float) y3;
1209    }
1210
1211    for( ; i < n; i++ )
1212    {
1213        int h0 = x[i];
1214        double y0;
1215        float x0;
1216
1217        y0 = (((h0 >> 23) & 0xff) - 127) * ln_2;
1218
1219        buf[0].i = (h0 & LOGTAB_MASK2_32F) | (127 << 23);
1220        h0 = (h0 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
1221
1222        y0 += icvLogTab[h0];
1223        x0 = (float)LOGTAB_TRANSLATE( buf[0].f, h0 );
1224        x0 += shift[h0 == 510];
1225        y0 += LOGPOLY( x0 );
1226
1227        y[i] = (float)y0;
1228    }
1229}
1230
1231void log( const double *x, double *y, int n )
1232{
1233    static const double shift[] = { 0, -1./512 };
1234    static const double
1235    A7 = 1.0,
1236    A6 = -0.5,
1237    A5 = 0.333333333333333314829616256247390992939472198486328125,
1238    A4 = -0.25,
1239    A3 = 0.2,
1240    A2 = -0.1666666666666666574148081281236954964697360992431640625,
1241    A1 = 0.1428571428571428769682682968777953647077083587646484375,
1242    A0 = -0.125;
1243
1244#undef LOGPOLY
1245#define LOGPOLY(x,k) ((x)+=shift[k], xq = (x)*(x),\
1246(((A0*xq + A2)*xq + A4)*xq + A6)*xq + \
1247(((A1*xq + A3)*xq + A5)*xq + A7)*(x))
1248
1249    int i = 0;
1250    DBLINT buf[4];
1251    DBLINT *X = (DBLINT *) x;
1252
1253#if CV_SSE2
1254    static const __m128d ln2_2 = _mm_set1_pd(ln_2);
1255    static const __m128d _1_2 = _mm_set1_pd(1.);
1256    static const __m128d shift2 = _mm_set1_pd(-1./512);
1257
1258    static const __m128i log_and_mask2 = _mm_set_epi32(LOGTAB_MASK2, 0xffffffff, LOGTAB_MASK2, 0xffffffff);
1259    static const __m128i log_or_mask2 = _mm_set_epi32(1023 << 20, 0, 1023 << 20, 0);
1260
1261    static const __m128d mA0 = _mm_set1_pd(A0);
1262    static const __m128d mA1 = _mm_set1_pd(A1);
1263    static const __m128d mA2 = _mm_set1_pd(A2);
1264    static const __m128d mA3 = _mm_set1_pd(A3);
1265    static const __m128d mA4 = _mm_set1_pd(A4);
1266    static const __m128d mA5 = _mm_set1_pd(A5);
1267    static const __m128d mA6 = _mm_set1_pd(A6);
1268    static const __m128d mA7 = _mm_set1_pd(A7);
1269
1270    int CV_DECL_ALIGNED(16) idx[4];
1271
1272    for( ; i <= n - 4; i += 4 )
1273    {
1274        __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i));
1275        __m128i h1 = _mm_loadu_si128((const __m128i*)(x + i + 2));
1276
1277        __m128d xd0 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h0, log_and_mask2), log_or_mask2));
1278        __m128d xd1 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h1, log_and_mask2), log_or_mask2));
1279
1280        h0 = _mm_unpackhi_epi32(_mm_unpacklo_epi32(h0, h1), _mm_unpackhi_epi32(h0, h1));
1281
1282        __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 20),
1283                                                  _mm_set1_epi32(2047)), _mm_set1_epi32(1023));
1284        __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2);
1285        __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0, yi0)), ln2_2);
1286
1287        h0 = _mm_and_si128(_mm_srli_epi32(h0, 20 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK * 2));
1288        _mm_store_si128((__m128i*)idx, h0);
1289        h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510));
1290
1291        __m128d t0, t1, t2, t3, t4;
1292        t0 = _mm_load_pd(icvLogTab + idx[0]);
1293        t2 = _mm_load_pd(icvLogTab + idx[1]);
1294        t1 = _mm_unpackhi_pd(t0, t2);
1295        t0 = _mm_unpacklo_pd(t0, t2);
1296        t2 = _mm_load_pd(icvLogTab + idx[2]);
1297        t4 = _mm_load_pd(icvLogTab + idx[3]);
1298        t3 = _mm_unpackhi_pd(t2, t4);
1299        t2 = _mm_unpacklo_pd(t2, t4);
1300
1301        yd0 = _mm_add_pd(yd0, t0);
1302        yd1 = _mm_add_pd(yd1, t2);
1303
1304        xd0 = _mm_mul_pd(_mm_sub_pd(xd0, _1_2), t1);
1305        xd1 = _mm_mul_pd(_mm_sub_pd(xd1, _1_2), t3);
1306
1307        xd0 = _mm_add_pd(xd0, _mm_and_pd(_mm_castsi128_pd(_mm_unpacklo_epi32(h0, h0)), shift2));
1308        xd1 = _mm_add_pd(xd1, _mm_and_pd(_mm_castsi128_pd(_mm_unpackhi_epi32(h0, h0)), shift2));
1309
1310        __m128d zd0 = _mm_mul_pd(xd0, mA0);
1311        __m128d zd1 = _mm_mul_pd(xd1, mA0);
1312        zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA1), xd0);
1313        zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA1), xd1);
1314        zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA2), xd0);
1315        zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA2), xd1);
1316        zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA3), xd0);
1317        zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA3), xd1);
1318        zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA4), xd0);
1319        zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA4), xd1);
1320        zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA5), xd0);
1321        zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA5), xd1);
1322        zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA6), xd0);
1323        zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA6), xd1);
1324        zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA7), xd0);
1325        zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA7), xd1);
1326
1327        yd0 = _mm_add_pd(yd0, zd0);
1328        yd1 = _mm_add_pd(yd1, zd1);
1329
1330        _mm_storeu_pd(y + i, yd0);
1331        _mm_storeu_pd(y + i + 2, yd1);
1332    }
1333#endif
1334    for( ; i <= n - 4; i += 4 )
1335    {
1336        double xq;
1337        double x0, x1, x2, x3;
1338        double y0, y1, y2, y3;
1339        int h0, h1, h2, h3;
1340
1341        h0 = X[i].i.lo;
1342        h1 = X[i + 1].i.lo;
1343        buf[0].i.lo = h0;
1344        buf[1].i.lo = h1;
1345
1346        h0 = X[i].i.hi;
1347        h1 = X[i + 1].i.hi;
1348        buf[0].i.hi = (h0 & LOGTAB_MASK2) | (1023 << 20);
1349        buf[1].i.hi = (h1 & LOGTAB_MASK2) | (1023 << 20);
1350
1351        y0 = (((h0 >> 20) & 0x7ff) - 1023) * ln_2;
1352        y1 = (((h1 >> 20) & 0x7ff) - 1023) * ln_2;
1353
1354        h2 = X[i + 2].i.lo;
1355        h3 = X[i + 3].i.lo;
1356        buf[2].i.lo = h2;
1357        buf[3].i.lo = h3;
1358
1359        h0 = (h0 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
1360        h1 = (h1 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
1361
1362        y0 += icvLogTab[h0];
1363        y1 += icvLogTab[h1];
1364
1365        h2 = X[i + 2].i.hi;
1366        h3 = X[i + 3].i.hi;
1367
1368        x0 = LOGTAB_TRANSLATE( buf[0].d, h0 );
1369        x1 = LOGTAB_TRANSLATE( buf[1].d, h1 );
1370
1371        buf[2].i.hi = (h2 & LOGTAB_MASK2) | (1023 << 20);
1372        buf[3].i.hi = (h3 & LOGTAB_MASK2) | (1023 << 20);
1373
1374        y2 = (((h2 >> 20) & 0x7ff) - 1023) * ln_2;
1375        y3 = (((h3 >> 20) & 0x7ff) - 1023) * ln_2;
1376
1377        h2 = (h2 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
1378        h3 = (h3 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
1379
1380        y2 += icvLogTab[h2];
1381        y3 += icvLogTab[h3];
1382
1383        x2 = LOGTAB_TRANSLATE( buf[2].d, h2 );
1384        x3 = LOGTAB_TRANSLATE( buf[3].d, h3 );
1385
1386        y0 += LOGPOLY( x0, h0 == 510 );
1387        y1 += LOGPOLY( x1, h1 == 510 );
1388
1389        y[i] = y0;
1390        y[i + 1] = y1;
1391
1392        y2 += LOGPOLY( x2, h2 == 510 );
1393        y3 += LOGPOLY( x3, h3 == 510 );
1394
1395        y[i + 2] = y2;
1396        y[i + 3] = y3;
1397    }
1398
1399    for( ; i < n; i++ )
1400    {
1401        int h0 = X[i].i.hi;
1402        double xq;
1403        double x0, y0 = (((h0 >> 20) & 0x7ff) - 1023) * ln_2;
1404
1405        buf[0].i.hi = (h0 & LOGTAB_MASK2) | (1023 << 20);
1406        buf[0].i.lo = X[i].i.lo;
1407        h0 = (h0 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
1408
1409        y0 += icvLogTab[h0];
1410        x0 = LOGTAB_TRANSLATE( buf[0].d, h0 );
1411        y0 += LOGPOLY( x0, h0 == 510 );
1412        y[i] = y0;
1413    }
1414}
1415
1416}}
1417