1c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#include <typeinfo>
2c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#include <iostream>
3c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#include <Eigen/Core>
4c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#include "BenchTimer.h"
5c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathusing namespace Eigen;
6c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathusing namespace std;
7c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
8c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<typename T>
92b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_DONT_INLINE typename T::Scalar sqsumNorm(T& v)
10c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
11c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return v.norm();
12c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
13c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
14c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<typename T>
152b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_DONT_INLINE typename T::Scalar stableNorm(T& v)
162b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang{
172b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return v.stableNorm();
182b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
192b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang
202b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangtemplate<typename T>
212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_DONT_INLINE typename T::Scalar hypotNorm(T& v)
22c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
23c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return v.hypotNorm();
24c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
25c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
26c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<typename T>
272b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangEIGEN_DONT_INLINE typename T::Scalar blueNorm(T& v)
28c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
29c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return v.blueNorm();
30c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
31c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
32c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<typename T>
33c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathEIGEN_DONT_INLINE typename T::Scalar lapackNorm(T& v)
34c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
35c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  typedef typename T::Scalar Scalar;
36c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int n = v.size();
37c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  Scalar scale = 0;
38c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  Scalar ssq = 1;
39c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  for (int i=0;i<n;++i)
40c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  {
412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    Scalar ax = std::abs(v.coeff(i));
42c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    if (scale >= ax)
43c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    {
442b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      ssq += numext::abs2(ax/scale);
45c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    }
46c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    else
47c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    {
482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      ssq = Scalar(1) + ssq * numext::abs2(scale/ax);
49c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath      scale = ax;
50c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    }
51c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  }
522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return scale * std::sqrt(ssq);
53c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
54c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
55c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<typename T>
56c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathEIGEN_DONT_INLINE typename T::Scalar twopassNorm(T& v)
57c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
58c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  typedef typename T::Scalar Scalar;
592b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Scalar s = v.array().abs().maxCoeff();
60c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return s*(v/s).norm();
61c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
62c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
63c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<typename T>
64c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathEIGEN_DONT_INLINE typename T::Scalar bl2passNorm(T& v)
65c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
66c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return v.stableNorm();
67c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
68c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
69c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<typename T>
70c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathEIGEN_DONT_INLINE typename T::Scalar divacNorm(T& v)
71c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
72c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int n =v.size() / 2;
73c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  for (int i=0;i<n;++i)
74c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    v(i) = v(2*i)*v(2*i) + v(2*i+1)*v(2*i+1);
75c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  n = n/2;
76c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  while (n>0)
77c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  {
78c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    for (int i=0;i<n;++i)
79c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath      v(i) = v(2*i) + v(2*i+1);
80c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    n = n/2;
81c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  }
822b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  return std::sqrt(v(0));
83c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
84c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
852b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangnamespace Eigen {
862b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wangnamespace internal {
87c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifdef EIGEN_VECTORIZE
882b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangPacket4f plt(const Packet4f& a, Packet4f& b) { return _mm_cmplt_ps(a,b); }
892b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangPacket2d plt(const Packet2d& a, Packet2d& b) { return _mm_cmplt_pd(a,b); }
90c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
912b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangPacket4f pandnot(const Packet4f& a, Packet4f& b) { return _mm_andnot_ps(a,b); }
922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao WangPacket2d pandnot(const Packet2d& a, Packet2d& b) { return _mm_andnot_pd(a,b); }
93c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif
942b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
952b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang}
96c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
97c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<typename T>
98c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathEIGEN_DONT_INLINE typename T::Scalar pblueNorm(const T& v)
99c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
100c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  #ifndef EIGEN_VECTORIZE
101c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return v.blueNorm();
102c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  #else
103c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  typedef typename T::Scalar Scalar;
104c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
105c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  static int nmax = 0;
106c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  static Scalar b1, b2, s1m, s2m, overfl, rbig, relerr;
107c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int n;
108c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
109c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  if(nmax <= 0)
110c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  {
111c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    int nbig, ibeta, it, iemin, iemax, iexp;
112c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    Scalar abig, eps;
113c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
114c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    nbig  = std::numeric_limits<int>::max();            // largest integer
115c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    ibeta = std::numeric_limits<Scalar>::radix; //NumTraits<Scalar>::Base;                    // base for floating-point numbers
116c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    it    = std::numeric_limits<Scalar>::digits; //NumTraits<Scalar>::Mantissa;                // number of base-beta digits in mantissa
117c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    iemin = std::numeric_limits<Scalar>::min_exponent;  // minimum exponent
118c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    iemax = std::numeric_limits<Scalar>::max_exponent;  // maximum exponent
119c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    rbig  = std::numeric_limits<Scalar>::max();         // largest floating-point number
120c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
121c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    // Check the basic machine-dependent constants.
122c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    if(iemin > 1 - 2*it || 1+it>iemax || (it==2 && ibeta<5)
123c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath      || (it<=4 && ibeta <= 3 ) || it<2)
124c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    {
125c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath      eigen_assert(false && "the algorithm cannot be guaranteed on this computer");
126c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    }
127c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    iexp  = -((1-iemin)/2);
128c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    b1    = std::pow(ibeta, iexp);  // lower boundary of midrange
129c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    iexp  = (iemax + 1 - it)/2;
130c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    b2    = std::pow(ibeta,iexp);   // upper boundary of midrange
131c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
132c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    iexp  = (2-iemin)/2;
133c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    s1m   = std::pow(ibeta,iexp);   // scaling factor for lower range
134c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    iexp  = - ((iemax+it)/2);
135c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    s2m   = std::pow(ibeta,iexp);   // scaling factor for upper range
136c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
137c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    overfl  = rbig*s2m;          // overfow boundary for abig
138c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    eps     = std::pow(ibeta, 1-it);
1392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    relerr  = std::sqrt(eps);      // tolerance for neglecting asml
140c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    abig    = 1.0/eps - 1.0;
141c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    if (Scalar(nbig)>abig)  nmax = abig;  // largest safe n
142c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    else                    nmax = nbig;
143c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  }
144c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
145c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  typedef typename internal::packet_traits<Scalar>::type Packet;
146c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  const int ps = internal::packet_traits<Scalar>::size;
1472b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet pasml = internal::pset1<Packet>(Scalar(0));
1482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet pamed = internal::pset1<Packet>(Scalar(0));
1492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet pabig = internal::pset1<Packet>(Scalar(0));
1502b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet ps2m = internal::pset1<Packet>(s2m);
1512b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet ps1m = internal::pset1<Packet>(s1m);
1522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet pb2  = internal::pset1<Packet>(b2);
1532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  Packet pb1  = internal::pset1<Packet>(b1);
154c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  for(int j=0; j<v.size(); j+=ps)
155c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  {
156c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    Packet ax = internal::pabs(v.template packet<Aligned>(j));
157c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    Packet ax_s2m = internal::pmul(ax,ps2m);
158c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    Packet ax_s1m = internal::pmul(ax,ps1m);
159c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    Packet maskBig = internal::plt(pb2,ax);
160c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    Packet maskSml = internal::plt(ax,pb1);
161c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
162c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//     Packet maskMed = internal::pand(maskSml,maskBig);
163c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//     Packet scale = internal::pset1(Scalar(0));
164c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//     scale = internal::por(scale, internal::pand(maskBig,ps2m));
165c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//     scale = internal::por(scale, internal::pand(maskSml,ps1m));
166c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//     scale = internal::por(scale, internal::pandnot(internal::pset1(Scalar(1)),maskMed));
167c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//     ax = internal::pmul(ax,scale);
168c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//     ax = internal::pmul(ax,ax);
169c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//     pabig = internal::padd(pabig, internal::pand(maskBig, ax));
170c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//     pasml = internal::padd(pasml, internal::pand(maskSml, ax));
171c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//     pamed = internal::padd(pamed, internal::pandnot(ax,maskMed));
172c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
173c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
174c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    pabig = internal::padd(pabig, internal::pand(maskBig, internal::pmul(ax_s2m,ax_s2m)));
175c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    pasml = internal::padd(pasml, internal::pand(maskSml, internal::pmul(ax_s1m,ax_s1m)));
176c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    pamed = internal::padd(pamed, internal::pandnot(internal::pmul(ax,ax),internal::pand(maskSml,maskBig)));
177c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  }
178c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  Scalar abig = internal::predux(pabig);
179c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  Scalar asml = internal::predux(pasml);
180c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  Scalar amed = internal::predux(pamed);
181c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  if(abig > Scalar(0))
182c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  {
1832b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    abig = std::sqrt(abig);
184c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    if(abig > overfl)
185c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    {
186c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath      eigen_assert(false && "overflow");
187c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath      return rbig;
188c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    }
189c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    if(amed > Scalar(0))
190c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    {
191c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath      abig = abig/s2m;
1922b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      amed = std::sqrt(amed);
193c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    }
194c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    else
195c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    {
196c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath      return abig/s2m;
197c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    }
198c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
199c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  }
200c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  else if(asml > Scalar(0))
201c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  {
202c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    if (amed > Scalar(0))
203c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    {
2042b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      abig = std::sqrt(amed);
2052b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      amed = std::sqrt(asml) / s1m;
206c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    }
207c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    else
208c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    {
2092b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang      return std::sqrt(asml)/s1m;
210c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    }
211c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  }
212c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  else
213c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  {
2142b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    return std::sqrt(amed);
215c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  }
216c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  asml = std::min(abig, amed);
217c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  abig = std::max(abig, amed);
218c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  if(asml <= abig*relerr)
219c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    return abig;
220c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  else
2212b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    return abig * std::sqrt(Scalar(1) + numext::abs2(asml/abig));
222c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  #endif
223c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
224c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
225c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define BENCH_PERF(NRM) { \
2262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  float af = 0; double ad = 0; std::complex<float> ac = 0; \
227c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  Eigen::BenchTimer tf, td, tcf; tf.reset(); td.reset(); tcf.reset();\
228c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  for (int k=0; k<tries; ++k) { \
229c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    tf.start(); \
2302b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    for (int i=0; i<iters; ++i) { af += NRM(vf); } \
231c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    tf.stop(); \
232c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  } \
233c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  for (int k=0; k<tries; ++k) { \
234c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    td.start(); \
2352b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    for (int i=0; i<iters; ++i) { ad += NRM(vd); } \
236c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    td.stop(); \
237c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  } \
2382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  /*for (int k=0; k<std::max(1,tries/3); ++k) { \
239c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    tcf.start(); \
2402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    for (int i=0; i<iters; ++i) { ac += NRM(vcf); } \
241c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    tcf.stop(); \
2422b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  } */\
243c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cout << #NRM << "\t" << tf.value() << "   " << td.value() <<  "    " << tcf.value() << "\n"; \
244c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
245c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
246c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathvoid check_accuracy(double basef, double based, int s)
247c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
2482b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  double yf = basef * std::abs(internal::random<double>());
2492b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  double yd = based * std::abs(internal::random<double>());
250c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  VectorXf vf = VectorXf::Ones(s) * yf;
251c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  VectorXd vd = VectorXd::Ones(s) * yd;
252c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
2532b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  std::cout << "reference\t" << std::sqrt(double(s))*yf << "\t" << std::sqrt(double(s))*yd << "\n";
254c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cout << "sqsumNorm\t" << sqsumNorm(vf) << "\t" << sqsumNorm(vd) << "\n";
255c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cout << "hypotNorm\t" << hypotNorm(vf) << "\t" << hypotNorm(vd) << "\n";
256c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cout << "blueNorm\t" << blueNorm(vf) << "\t" << blueNorm(vd) << "\n";
257c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cout << "pblueNorm\t" << pblueNorm(vf) << "\t" << pblueNorm(vd) << "\n";
258c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cout << "lapackNorm\t" << lapackNorm(vf) << "\t" << lapackNorm(vd) << "\n";
259c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cout << "twopassNorm\t" << twopassNorm(vf) << "\t" << twopassNorm(vd) << "\n";
260c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cout << "bl2passNorm\t" << bl2passNorm(vf) << "\t" << bl2passNorm(vd) << "\n";
261c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
262c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
263c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathvoid check_accuracy_var(int ef0, int ef1, int ed0, int ed1, int s)
264c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
265c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  VectorXf vf(s);
266c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  VectorXd vd(s);
267c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  for (int i=0; i<s; ++i)
268c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  {
2692b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    vf[i] = std::abs(internal::random<double>()) * std::pow(double(10), internal::random<int>(ef0,ef1));
2702b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    vd[i] = std::abs(internal::random<double>()) * std::pow(double(10), internal::random<int>(ed0,ed1));
271c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  }
272c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
273c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  //std::cout << "reference\t" << internal::sqrt(double(s))*yf << "\t" << internal::sqrt(double(s))*yd << "\n";
274c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cout << "sqsumNorm\t"  << sqsumNorm(vf)  << "\t" << sqsumNorm(vd)  << "\t" << sqsumNorm(vf.cast<long double>()) << "\t" << sqsumNorm(vd.cast<long double>()) << "\n";
275c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cout << "hypotNorm\t"  << hypotNorm(vf)  << "\t" << hypotNorm(vd)  << "\t" << hypotNorm(vf.cast<long double>()) << "\t" << hypotNorm(vd.cast<long double>()) << "\n";
276c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cout << "blueNorm\t"   << blueNorm(vf)   << "\t" << blueNorm(vd)   << "\t" << blueNorm(vf.cast<long double>()) << "\t" << blueNorm(vd.cast<long double>()) << "\n";
277c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cout << "pblueNorm\t"  << pblueNorm(vf)  << "\t" << pblueNorm(vd)  << "\t" << blueNorm(vf.cast<long double>()) << "\t" << blueNorm(vd.cast<long double>()) << "\n";
278c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cout << "lapackNorm\t" << lapackNorm(vf) << "\t" << lapackNorm(vd) << "\t" << lapackNorm(vf.cast<long double>()) << "\t" << lapackNorm(vd.cast<long double>()) << "\n";
279c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cout << "twopassNorm\t" << twopassNorm(vf) << "\t" << twopassNorm(vd) << "\t" << twopassNorm(vf.cast<long double>()) << "\t" << twopassNorm(vd.cast<long double>()) << "\n";
280c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//   std::cout << "bl2passNorm\t" << bl2passNorm(vf) << "\t" << bl2passNorm(vd) << "\t" << bl2passNorm(vf.cast<long double>()) << "\t" << bl2passNorm(vd.cast<long double>()) << "\n";
281c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
282c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
283c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathint main(int argc, char** argv)
284c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
285c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int tries = 10;
286c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int iters = 100000;
287c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  double y = 1.1345743233455785456788e12 * internal::random<double>();
288c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  VectorXf v = VectorXf::Ones(1024) * y;
289c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
290c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// return 0;
291c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int s = 10000;
292c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  double basef_ok = 1.1345743233455785456788e15;
293c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  double based_ok = 1.1345743233455785456788e95;
294c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
295c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  double basef_under = 1.1345743233455785456788e-27;
296c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  double based_under = 1.1345743233455785456788e-303;
297c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
298c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  double basef_over = 1.1345743233455785456788e+27;
299c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  double based_over = 1.1345743233455785456788e+302;
300c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
301c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cout.precision(20);
302c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
303c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cerr << "\nNo under/overflow:\n";
304c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  check_accuracy(basef_ok, based_ok, s);
305c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
306c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cerr << "\nUnderflow:\n";
307c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  check_accuracy(basef_under, based_under, s);
308c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
309c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cerr << "\nOverflow:\n";
310c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  check_accuracy(basef_over, based_over, s);
311c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
312c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cerr << "\nVarying (over):\n";
313c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  for (int k=0; k<1; ++k)
314c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  {
315c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    check_accuracy_var(20,27,190,302,s);
316c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    std::cout << "\n";
317c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  }
318c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
319c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cerr << "\nVarying (under):\n";
320c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  for (int k=0; k<1; ++k)
321c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  {
322c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    check_accuracy_var(-27,20,-302,-190,s);
323c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    std::cout << "\n";
324c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  }
325c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
3262b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  y = 1;
327c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cout.precision(4);
3282b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  int s1 = 1024*1024*32;
3292b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  std::cerr << "Performance (out of cache, " << s1 << "):\n";
330c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  {
331c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    int iters = 1;
3322b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    VectorXf vf = VectorXf::Random(s1) * y;
3332b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    VectorXd vd = VectorXd::Random(s1) * y;
3342b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    VectorXcf vcf = VectorXcf::Random(s1) * y;
335c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    BENCH_PERF(sqsumNorm);
3362b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    BENCH_PERF(stableNorm);
337c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    BENCH_PERF(blueNorm);
3382b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    BENCH_PERF(pblueNorm);
3392b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    BENCH_PERF(lapackNorm);
3402b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    BENCH_PERF(hypotNorm);
3412b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    BENCH_PERF(twopassNorm);
342c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    BENCH_PERF(bl2passNorm);
343c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  }
344c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
3452b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang  std::cerr << "\nPerformance (in cache, " << 512 << "):\n";
346c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  {
347c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    int iters = 100000;
348c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    VectorXf vf = VectorXf::Random(512) * y;
349c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    VectorXd vd = VectorXd::Random(512) * y;
350c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    VectorXcf vcf = VectorXcf::Random(512) * y;
351c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    BENCH_PERF(sqsumNorm);
3522b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    BENCH_PERF(stableNorm);
353c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    BENCH_PERF(blueNorm);
3542b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    BENCH_PERF(pblueNorm);
3552b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    BENCH_PERF(lapackNorm);
3562b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    BENCH_PERF(hypotNorm);
3572b8756b6f1de65d3f8bffab45be6c44ceb7411fcMiao Wang    BENCH_PERF(twopassNorm);
358c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    BENCH_PERF(bl2passNorm);
359c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  }
360c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
361