1c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
2c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// g++-4.4 bench_gemm.cpp -I .. -O2 -DNDEBUG -lrt -fopenmp && OMP_NUM_THREADS=2  ./a.out
3c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// icpc bench_gemm.cpp -I .. -O3 -DNDEBUG -lrt -openmp  && OMP_NUM_THREADS=2  ./a.out
4c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
5c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#include <iostream>
6c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#include <Eigen/Core>
7c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#include <bench/BenchTimer.h>
8c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
9c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathusing namespace std;
10c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathusing namespace Eigen;
11c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
12c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifndef SCALAR
13c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// #define SCALAR std::complex<float>
14c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define SCALAR float
15c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif
16c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
17c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef SCALAR Scalar;
18c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef NumTraits<Scalar>::Real RealScalar;
19c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef Matrix<RealScalar,Dynamic,Dynamic> A;
20c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef Matrix</*Real*/Scalar,Dynamic,Dynamic> B;
21c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef Matrix<Scalar,Dynamic,Dynamic> C;
22c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef Matrix<RealScalar,Dynamic,Dynamic> M;
23c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
24c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifdef HAVE_BLAS
25c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
26c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathextern "C" {
277faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez  #include <Eigen/src/misc/blas.h>
28c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
29c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
30c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic float fone = 1;
31c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic float fzero = 0;
32c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic double done = 1;
33c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic double szero = 0;
34c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic std::complex<float> cfone = 1;
35c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic std::complex<float> cfzero = 0;
36c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic std::complex<double> cdone = 1;
37c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic std::complex<double> cdzero = 0;
38c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic char notrans = 'N';
39c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic char trans = 'T';
40c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic char nonunit = 'N';
41c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic char lower = 'L';
42c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic char right = 'R';
43c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic int intone = 1;
44c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
45c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathvoid blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c)
46c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
47c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int M = c.rows(); int N = c.cols(); int K = a.cols();
48c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows();
49c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
50c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  sgemm_(&notrans,&notrans,&M,&N,&K,&fone,
51c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath         const_cast<float*>(a.data()),&lda,
52c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath         const_cast<float*>(b.data()),&ldb,&fone,
53c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath         c.data(),&ldc);
54c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
55c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
56c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathEIGEN_DONT_INLINE void blas_gemm(const MatrixXd& a, const MatrixXd& b, MatrixXd& c)
57c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
58c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int M = c.rows(); int N = c.cols(); int K = a.cols();
59c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows();
60c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
61c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  dgemm_(&notrans,&notrans,&M,&N,&K,&done,
62c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath         const_cast<double*>(a.data()),&lda,
63c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath         const_cast<double*>(b.data()),&ldb,&done,
64c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath         c.data(),&ldc);
65c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
66c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
67c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathvoid blas_gemm(const MatrixXcf& a, const MatrixXcf& b, MatrixXcf& c)
68c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
69c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int M = c.rows(); int N = c.cols(); int K = a.cols();
70c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows();
71c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
72c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  cgemm_(&notrans,&notrans,&M,&N,&K,(float*)&cfone,
73c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath         const_cast<float*>((const float*)a.data()),&lda,
74c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath         const_cast<float*>((const float*)b.data()),&ldb,(float*)&cfone,
75c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath         (float*)c.data(),&ldc);
76c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
77c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
78c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathvoid blas_gemm(const MatrixXcd& a, const MatrixXcd& b, MatrixXcd& c)
79c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
80c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int M = c.rows(); int N = c.cols(); int K = a.cols();
81c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows();
82c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
83c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  zgemm_(&notrans,&notrans,&M,&N,&K,(double*)&cdone,
84c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath         const_cast<double*>((const double*)a.data()),&lda,
85c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath         const_cast<double*>((const double*)b.data()),&ldb,(double*)&cdone,
86c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath         (double*)c.data(),&ldc);
87c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
88c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
89c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
90c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
91c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif
92c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
93c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathvoid matlab_cplx_cplx(const M& ar, const M& ai, const M& br, const M& bi, M& cr, M& ci)
94c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
95c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  cr.noalias() += ar * br;
96c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  cr.noalias() -= ai * bi;
97c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  ci.noalias() += ar * bi;
98c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  ci.noalias() += ai * br;
99c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
100c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
101c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathvoid matlab_real_cplx(const M& a, const M& br, const M& bi, M& cr, M& ci)
102c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
103c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  cr.noalias() += a * br;
104c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  ci.noalias() += a * bi;
105c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
106c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
107c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathvoid matlab_cplx_real(const M& ar, const M& ai, const M& b, M& cr, M& ci)
108c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
109c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  cr.noalias() += ar * b;
110c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  ci.noalias() += ai * b;
111c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
112c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
113c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<typename A, typename B, typename C>
114c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan KamathEIGEN_DONT_INLINE void gemm(const A& a, const B& b, C& c)
115c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
116c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath c.noalias() += a * b;
117c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
118c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
119c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathint main(int argc, char ** argv)
120c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
121c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::ptrdiff_t l1 = internal::queryL1CacheSize();
122c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::ptrdiff_t l2 = internal::queryTopLevelCacheSize();
123c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cout << "L1 cache size     = " << (l1>0 ? l1/1024 : -1) << " KB\n";
124c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cout << "L2/L3 cache size  = " << (l2>0 ? l2/1024 : -1) << " KB\n";
125c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  typedef internal::gebp_traits<Scalar,Scalar> Traits;
126c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cout << "Register blocking = " << Traits::mr << " x " << Traits::nr << "\n";
127c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
128c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int rep = 1;    // number of repetitions per try
129c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int tries = 2;  // number of tries, we keep the best
130c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
131c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int s = 2048;
132c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int cache_size = -1;
133c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
134c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  bool need_help = false;
135c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  for (int i=1; i<argc; ++i)
136c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  {
137c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    if(argv[i][0]=='s')
138c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath      s = atoi(argv[i]+1);
139c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    else if(argv[i][0]=='c')
140c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath      cache_size = atoi(argv[i]+1);
141c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    else if(argv[i][0]=='t')
142c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath      tries = atoi(argv[i]+1);
143c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    else if(argv[i][0]=='p')
144c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath      rep = atoi(argv[i]+1);
145c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    else
146c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath      need_help = true;
147c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  }
148c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
149c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  if(need_help)
150c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  {
151c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    std::cout << argv[0] << " s<matrix size> c<cache size> t<nb tries> p<nb repeats>\n";
152c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    return 1;
153c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  }
154c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
155c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  if(cache_size>0)
156c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    setCpuCacheSizes(cache_size,96*cache_size);
157c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
158c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int m = s;
159c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int n = s;
160c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int p = s;
161c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  A a(m,p); a.setRandom();
162c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  B b(p,n); b.setRandom();
163c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  C c(m,n); c.setOnes();
164c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  C rc = c;
165c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
166c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cout << "Matrix sizes = " << m << "x" << p << " * " << p << "x" << n << "\n";
167c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::ptrdiff_t mc(m), nc(n), kc(p);
168c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  internal::computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
169c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cout << "blocking size (mc x kc) = " << mc << " x " << kc << "\n";
170c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
171c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  C r = c;
172c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
173c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  // check the parallel product is correct
174c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  #if defined EIGEN_HAS_OPENMP
175c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  int procs = omp_get_max_threads();
176c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  if(procs>1)
177c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  {
178c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    #ifdef HAVE_BLAS
179c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    blas_gemm(a,b,r);
180c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    #else
181c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    omp_set_num_threads(1);
182c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    r.noalias() += a * b;
183c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    omp_set_num_threads(procs);
184c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    #endif
185c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    c.noalias() += a * b;
186c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    if(!r.isApprox(c)) std::cerr << "Warning, your parallel product is crap!\n\n";
187c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  }
188c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  #elif defined HAVE_BLAS
189c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    blas_gemm(a,b,r);
190c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    c.noalias() += a * b;
191c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    if(!r.isApprox(c)) std::cerr << "Warning, your product is crap!\n\n";
192c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  #else
193c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    gemm(a,b,c);
194c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    r.noalias() += a.cast<Scalar>() * b.cast<Scalar>();
195c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    if(!r.isApprox(c)) std::cerr << "Warning, your product is crap!\n\n";
196c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  #endif
197c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
198c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  #ifdef HAVE_BLAS
199c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  BenchTimer tblas;
200c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  c = rc;
201c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  BENCH(tblas, tries, rep, blas_gemm(a,b,c));
202c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cout << "blas  cpu         " << tblas.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/tblas.best(CPU_TIMER))*1e-9  <<  " GFLOPS \t(" << tblas.total(CPU_TIMER)  << "s)\n";
203c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cout << "blas  real        " << tblas.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/tblas.best(REAL_TIMER))*1e-9 <<  " GFLOPS \t(" << tblas.total(REAL_TIMER) << "s)\n";
204c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  #endif
205c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
206c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  BenchTimer tmt;
207c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  c = rc;
208c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  BENCH(tmt, tries, rep, gemm(a,b,c));
209c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cout << "eigen cpu         " << tmt.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/tmt.best(CPU_TIMER))*1e-9  <<  " GFLOPS \t(" << tmt.total(CPU_TIMER)  << "s)\n";
210c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  std::cout << "eigen real        " << tmt.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/tmt.best(REAL_TIMER))*1e-9 <<  " GFLOPS \t(" << tmt.total(REAL_TIMER) << "s)\n";
211c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
212c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  #ifdef EIGEN_HAS_OPENMP
213c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  if(procs>1)
214c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  {
215c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    BenchTimer tmono;
216c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    omp_set_num_threads(1);
217c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    Eigen::internal::setNbThreads(1);
218c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    c = rc;
219c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    BENCH(tmono, tries, rep, gemm(a,b,c));
220c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    std::cout << "eigen mono cpu    " << tmono.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/tmono.best(CPU_TIMER))*1e-9  <<  " GFLOPS \t(" << tmono.total(CPU_TIMER)  << "s)\n";
221c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    std::cout << "eigen mono real   " << tmono.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/tmono.best(REAL_TIMER))*1e-9 <<  " GFLOPS \t(" << tmono.total(REAL_TIMER) << "s)\n";
222c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    std::cout << "mt speed up x" << tmono.best(CPU_TIMER) / tmt.best(REAL_TIMER)  << " => " << (100.0*tmono.best(CPU_TIMER) / tmt.best(REAL_TIMER))/procs << "%\n";
223c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  }
224c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  #endif
225c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
226c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  #ifdef DECOUPLED
227c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  if((NumTraits<A::Scalar>::IsComplex) && (NumTraits<B::Scalar>::IsComplex))
228c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  {
229c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    M ar(m,p); ar.setRandom();
230c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    M ai(m,p); ai.setRandom();
231c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    M br(p,n); br.setRandom();
232c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    M bi(p,n); bi.setRandom();
233c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    M cr(m,n); cr.setRandom();
234c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    M ci(m,n); ci.setRandom();
235c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
236c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    BenchTimer t;
237c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    BENCH(t, tries, rep, matlab_cplx_cplx(ar,ai,br,bi,cr,ci));
238c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    std::cout << "\"matlab\" cpu    " << t.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9  <<  " GFLOPS \t(" << t.total(CPU_TIMER)  << "s)\n";
239c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    std::cout << "\"matlab\" real   " << t.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 <<  " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n";
240c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  }
241c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  if((!NumTraits<A::Scalar>::IsComplex) && (NumTraits<B::Scalar>::IsComplex))
242c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  {
243c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    M a(m,p);  a.setRandom();
244c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    M br(p,n); br.setRandom();
245c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    M bi(p,n); bi.setRandom();
246c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    M cr(m,n); cr.setRandom();
247c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    M ci(m,n); ci.setRandom();
248c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
249c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    BenchTimer t;
250c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    BENCH(t, tries, rep, matlab_real_cplx(a,br,bi,cr,ci));
251c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    std::cout << "\"matlab\" cpu    " << t.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9  <<  " GFLOPS \t(" << t.total(CPU_TIMER)  << "s)\n";
252c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    std::cout << "\"matlab\" real   " << t.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 <<  " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n";
253c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  }
254c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  if((NumTraits<A::Scalar>::IsComplex) && (!NumTraits<B::Scalar>::IsComplex))
255c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  {
256c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    M ar(m,p); ar.setRandom();
257c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    M ai(m,p); ai.setRandom();
258c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    M b(p,n);  b.setRandom();
259c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    M cr(m,n); cr.setRandom();
260c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    M ci(m,n); ci.setRandom();
261c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
262c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    BenchTimer t;
263c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    BENCH(t, tries, rep, matlab_cplx_real(ar,ai,b,cr,ci));
264c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    std::cout << "\"matlab\" cpu    " << t.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9  <<  " GFLOPS \t(" << t.total(CPU_TIMER)  << "s)\n";
265c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    std::cout << "\"matlab\" real   " << t.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 <<  " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n";
266c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  }
267c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  #endif
268c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
269c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath  return 0;
270c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
271c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
272