1
2// g++-4.4 bench_gemm.cpp -I .. -O2 -DNDEBUG -lrt -fopenmp && OMP_NUM_THREADS=2  ./a.out
3// icpc bench_gemm.cpp -I .. -O3 -DNDEBUG -lrt -openmp  && OMP_NUM_THREADS=2  ./a.out
4
5#include <iostream>
6#include <Eigen/Core>
7#include <bench/BenchTimer.h>
8
9using namespace std;
10using namespace Eigen;
11
12#ifndef SCALAR
13// #define SCALAR std::complex<float>
14#define SCALAR float
15#endif
16
17typedef SCALAR Scalar;
18typedef NumTraits<Scalar>::Real RealScalar;
19typedef Matrix<RealScalar,Dynamic,Dynamic> A;
20typedef Matrix</*Real*/Scalar,Dynamic,Dynamic> B;
21typedef Matrix<Scalar,Dynamic,Dynamic> C;
22typedef Matrix<RealScalar,Dynamic,Dynamic> M;
23
24#ifdef HAVE_BLAS
25
26extern "C" {
27  #include <Eigen/src/misc/blas.h>
28}
29
30static float fone = 1;
31static float fzero = 0;
32static double done = 1;
33static double szero = 0;
34static std::complex<float> cfone = 1;
35static std::complex<float> cfzero = 0;
36static std::complex<double> cdone = 1;
37static std::complex<double> cdzero = 0;
38static char notrans = 'N';
39static char trans = 'T';
40static char nonunit = 'N';
41static char lower = 'L';
42static char right = 'R';
43static int intone = 1;
44
45void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c)
46{
47  int M = c.rows(); int N = c.cols(); int K = a.cols();
48  int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows();
49
50  sgemm_(&notrans,&notrans,&M,&N,&K,&fone,
51         const_cast<float*>(a.data()),&lda,
52         const_cast<float*>(b.data()),&ldb,&fone,
53         c.data(),&ldc);
54}
55
56EIGEN_DONT_INLINE void blas_gemm(const MatrixXd& a, const MatrixXd& b, MatrixXd& c)
57{
58  int M = c.rows(); int N = c.cols(); int K = a.cols();
59  int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows();
60
61  dgemm_(&notrans,&notrans,&M,&N,&K,&done,
62         const_cast<double*>(a.data()),&lda,
63         const_cast<double*>(b.data()),&ldb,&done,
64         c.data(),&ldc);
65}
66
67void blas_gemm(const MatrixXcf& a, const MatrixXcf& b, MatrixXcf& c)
68{
69  int M = c.rows(); int N = c.cols(); int K = a.cols();
70  int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows();
71
72  cgemm_(&notrans,&notrans,&M,&N,&K,(float*)&cfone,
73         const_cast<float*>((const float*)a.data()),&lda,
74         const_cast<float*>((const float*)b.data()),&ldb,(float*)&cfone,
75         (float*)c.data(),&ldc);
76}
77
78void blas_gemm(const MatrixXcd& a, const MatrixXcd& b, MatrixXcd& c)
79{
80  int M = c.rows(); int N = c.cols(); int K = a.cols();
81  int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows();
82
83  zgemm_(&notrans,&notrans,&M,&N,&K,(double*)&cdone,
84         const_cast<double*>((const double*)a.data()),&lda,
85         const_cast<double*>((const double*)b.data()),&ldb,(double*)&cdone,
86         (double*)c.data(),&ldc);
87}
88
89
90
91#endif
92
93void matlab_cplx_cplx(const M& ar, const M& ai, const M& br, const M& bi, M& cr, M& ci)
94{
95  cr.noalias() += ar * br;
96  cr.noalias() -= ai * bi;
97  ci.noalias() += ar * bi;
98  ci.noalias() += ai * br;
99}
100
101void matlab_real_cplx(const M& a, const M& br, const M& bi, M& cr, M& ci)
102{
103  cr.noalias() += a * br;
104  ci.noalias() += a * bi;
105}
106
107void matlab_cplx_real(const M& ar, const M& ai, const M& b, M& cr, M& ci)
108{
109  cr.noalias() += ar * b;
110  ci.noalias() += ai * b;
111}
112
113template<typename A, typename B, typename C>
114EIGEN_DONT_INLINE void gemm(const A& a, const B& b, C& c)
115{
116 c.noalias() += a * b;
117}
118
119int main(int argc, char ** argv)
120{
121  std::ptrdiff_t l1 = internal::queryL1CacheSize();
122  std::ptrdiff_t l2 = internal::queryTopLevelCacheSize();
123  std::cout << "L1 cache size     = " << (l1>0 ? l1/1024 : -1) << " KB\n";
124  std::cout << "L2/L3 cache size  = " << (l2>0 ? l2/1024 : -1) << " KB\n";
125  typedef internal::gebp_traits<Scalar,Scalar> Traits;
126  std::cout << "Register blocking = " << Traits::mr << " x " << Traits::nr << "\n";
127
128  int rep = 1;    // number of repetitions per try
129  int tries = 2;  // number of tries, we keep the best
130
131  int s = 2048;
132  int cache_size = -1;
133
134  bool need_help = false;
135  for (int i=1; i<argc; ++i)
136  {
137    if(argv[i][0]=='s')
138      s = atoi(argv[i]+1);
139    else if(argv[i][0]=='c')
140      cache_size = atoi(argv[i]+1);
141    else if(argv[i][0]=='t')
142      tries = atoi(argv[i]+1);
143    else if(argv[i][0]=='p')
144      rep = atoi(argv[i]+1);
145    else
146      need_help = true;
147  }
148
149  if(need_help)
150  {
151    std::cout << argv[0] << " s<matrix size> c<cache size> t<nb tries> p<nb repeats>\n";
152    return 1;
153  }
154
155  if(cache_size>0)
156    setCpuCacheSizes(cache_size,96*cache_size);
157
158  int m = s;
159  int n = s;
160  int p = s;
161  A a(m,p); a.setRandom();
162  B b(p,n); b.setRandom();
163  C c(m,n); c.setOnes();
164  C rc = c;
165
166  std::cout << "Matrix sizes = " << m << "x" << p << " * " << p << "x" << n << "\n";
167  std::ptrdiff_t mc(m), nc(n), kc(p);
168  internal::computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
169  std::cout << "blocking size (mc x kc) = " << mc << " x " << kc << "\n";
170
171  C r = c;
172
173  // check the parallel product is correct
174  #if defined EIGEN_HAS_OPENMP
175  int procs = omp_get_max_threads();
176  if(procs>1)
177  {
178    #ifdef HAVE_BLAS
179    blas_gemm(a,b,r);
180    #else
181    omp_set_num_threads(1);
182    r.noalias() += a * b;
183    omp_set_num_threads(procs);
184    #endif
185    c.noalias() += a * b;
186    if(!r.isApprox(c)) std::cerr << "Warning, your parallel product is crap!\n\n";
187  }
188  #elif defined HAVE_BLAS
189    blas_gemm(a,b,r);
190    c.noalias() += a * b;
191    if(!r.isApprox(c)) std::cerr << "Warning, your product is crap!\n\n";
192  #else
193    gemm(a,b,c);
194    r.noalias() += a.cast<Scalar>() * b.cast<Scalar>();
195    if(!r.isApprox(c)) std::cerr << "Warning, your product is crap!\n\n";
196  #endif
197
198  #ifdef HAVE_BLAS
199  BenchTimer tblas;
200  c = rc;
201  BENCH(tblas, tries, rep, blas_gemm(a,b,c));
202  std::cout << "blas  cpu         " << tblas.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/tblas.best(CPU_TIMER))*1e-9  <<  " GFLOPS \t(" << tblas.total(CPU_TIMER)  << "s)\n";
203  std::cout << "blas  real        " << tblas.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/tblas.best(REAL_TIMER))*1e-9 <<  " GFLOPS \t(" << tblas.total(REAL_TIMER) << "s)\n";
204  #endif
205
206  BenchTimer tmt;
207  c = rc;
208  BENCH(tmt, tries, rep, gemm(a,b,c));
209  std::cout << "eigen cpu         " << tmt.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/tmt.best(CPU_TIMER))*1e-9  <<  " GFLOPS \t(" << tmt.total(CPU_TIMER)  << "s)\n";
210  std::cout << "eigen real        " << tmt.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/tmt.best(REAL_TIMER))*1e-9 <<  " GFLOPS \t(" << tmt.total(REAL_TIMER) << "s)\n";
211
212  #ifdef EIGEN_HAS_OPENMP
213  if(procs>1)
214  {
215    BenchTimer tmono;
216    omp_set_num_threads(1);
217    Eigen::internal::setNbThreads(1);
218    c = rc;
219    BENCH(tmono, tries, rep, gemm(a,b,c));
220    std::cout << "eigen mono cpu    " << tmono.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/tmono.best(CPU_TIMER))*1e-9  <<  " GFLOPS \t(" << tmono.total(CPU_TIMER)  << "s)\n";
221    std::cout << "eigen mono real   " << tmono.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/tmono.best(REAL_TIMER))*1e-9 <<  " GFLOPS \t(" << tmono.total(REAL_TIMER) << "s)\n";
222    std::cout << "mt speed up x" << tmono.best(CPU_TIMER) / tmt.best(REAL_TIMER)  << " => " << (100.0*tmono.best(CPU_TIMER) / tmt.best(REAL_TIMER))/procs << "%\n";
223  }
224  #endif
225
226  #ifdef DECOUPLED
227  if((NumTraits<A::Scalar>::IsComplex) && (NumTraits<B::Scalar>::IsComplex))
228  {
229    M ar(m,p); ar.setRandom();
230    M ai(m,p); ai.setRandom();
231    M br(p,n); br.setRandom();
232    M bi(p,n); bi.setRandom();
233    M cr(m,n); cr.setRandom();
234    M ci(m,n); ci.setRandom();
235
236    BenchTimer t;
237    BENCH(t, tries, rep, matlab_cplx_cplx(ar,ai,br,bi,cr,ci));
238    std::cout << "\"matlab\" cpu    " << t.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9  <<  " GFLOPS \t(" << t.total(CPU_TIMER)  << "s)\n";
239    std::cout << "\"matlab\" real   " << t.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 <<  " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n";
240  }
241  if((!NumTraits<A::Scalar>::IsComplex) && (NumTraits<B::Scalar>::IsComplex))
242  {
243    M a(m,p);  a.setRandom();
244    M br(p,n); br.setRandom();
245    M bi(p,n); bi.setRandom();
246    M cr(m,n); cr.setRandom();
247    M ci(m,n); ci.setRandom();
248
249    BenchTimer t;
250    BENCH(t, tries, rep, matlab_real_cplx(a,br,bi,cr,ci));
251    std::cout << "\"matlab\" cpu    " << t.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9  <<  " GFLOPS \t(" << t.total(CPU_TIMER)  << "s)\n";
252    std::cout << "\"matlab\" real   " << t.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 <<  " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n";
253  }
254  if((NumTraits<A::Scalar>::IsComplex) && (!NumTraits<B::Scalar>::IsComplex))
255  {
256    M ar(m,p); ar.setRandom();
257    M ai(m,p); ai.setRandom();
258    M b(p,n);  b.setRandom();
259    M cr(m,n); cr.setRandom();
260    M ci(m,n); ci.setRandom();
261
262    BenchTimer t;
263    BENCH(t, tries, rep, matlab_cplx_real(ar,ai,b,cr,ci));
264    std::cout << "\"matlab\" cpu    " << t.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9  <<  " GFLOPS \t(" << t.total(CPU_TIMER)  << "s)\n";
265    std::cout << "\"matlab\" real   " << t.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 <<  " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n";
266  }
267  #endif
268
269  return 0;
270}
271
272