1 2#include <iostream> 3#include <Eigen/Core> 4#include <bench/BenchTimer.h> 5using namespace Eigen; 6 7#ifndef SIZE 8#define SIZE 50 9#endif 10 11#ifndef REPEAT 12#define REPEAT 10000 13#endif 14 15typedef float Scalar; 16 17__attribute__ ((noinline)) void benchVec(Scalar* a, Scalar* b, Scalar* c, int size); 18__attribute__ ((noinline)) void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c); 19__attribute__ ((noinline)) void benchVec(VectorXf& a, VectorXf& b, VectorXf& c); 20 21int main(int argc, char* argv[]) 22{ 23 int size = SIZE * 8; 24 int size2 = size * size; 25 Scalar* a = internal::aligned_new<Scalar>(size2); 26 Scalar* b = internal::aligned_new<Scalar>(size2+4)+1; 27 Scalar* c = internal::aligned_new<Scalar>(size2); 28 29 for (int i=0; i<size; ++i) 30 { 31 a[i] = b[i] = c[i] = 0; 32 } 33 34 BenchTimer timer; 35 36 timer.reset(); 37 for (int k=0; k<10; ++k) 38 { 39 timer.start(); 40 benchVec(a, b, c, size2); 41 timer.stop(); 42 } 43 std::cout << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n"; 44 return 0; 45 for (int innersize = size; innersize>2 ; --innersize) 46 { 47 if (size2%innersize==0) 48 { 49 int outersize = size2/innersize; 50 MatrixXf ma = Map<MatrixXf>(a, innersize, outersize ); 51 MatrixXf mb = Map<MatrixXf>(b, innersize, outersize ); 52 MatrixXf mc = Map<MatrixXf>(c, innersize, outersize ); 53 timer.reset(); 54 for (int k=0; k<3; ++k) 55 { 56 timer.start(); 57 benchVec(ma, mb, mc); 58 timer.stop(); 59 } 60 std::cout << innersize << " x " << outersize << " " << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n"; 61 } 62 } 63 64 VectorXf va = Map<VectorXf>(a, size2); 65 VectorXf vb = Map<VectorXf>(b, size2); 66 VectorXf vc = Map<VectorXf>(c, size2); 67 timer.reset(); 68 for (int k=0; k<3; ++k) 69 { 70 timer.start(); 71 benchVec(va, vb, vc); 72 timer.stop(); 73 } 74 std::cout << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n"; 75 76 return 0; 77} 78 79void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c) 80{ 81 for (int k=0; k<REPEAT; ++k) 82 a = a + b; 83} 84 85void benchVec(VectorXf& a, VectorXf& b, VectorXf& c) 86{ 87 for (int k=0; k<REPEAT; ++k) 88 a = a + b; 89} 90 91void benchVec(Scalar* a, Scalar* b, Scalar* c, int size) 92{ 93 typedef internal::packet_traits<Scalar>::type PacketScalar; 94 const int PacketSize = internal::packet_traits<Scalar>::size; 95 PacketScalar a0, a1, a2, a3, b0, b1, b2, b3; 96 for (int k=0; k<REPEAT; ++k) 97 for (int i=0; i<size; i+=PacketSize*8) 98 { 99// a0 = internal::pload(&a[i]); 100// b0 = internal::pload(&b[i]); 101// a1 = internal::pload(&a[i+1*PacketSize]); 102// b1 = internal::pload(&b[i+1*PacketSize]); 103// a2 = internal::pload(&a[i+2*PacketSize]); 104// b2 = internal::pload(&b[i+2*PacketSize]); 105// a3 = internal::pload(&a[i+3*PacketSize]); 106// b3 = internal::pload(&b[i+3*PacketSize]); 107// internal::pstore(&a[i], internal::padd(a0, b0)); 108// a0 = internal::pload(&a[i+4*PacketSize]); 109// b0 = internal::pload(&b[i+4*PacketSize]); 110// 111// internal::pstore(&a[i+1*PacketSize], internal::padd(a1, b1)); 112// a1 = internal::pload(&a[i+5*PacketSize]); 113// b1 = internal::pload(&b[i+5*PacketSize]); 114// 115// internal::pstore(&a[i+2*PacketSize], internal::padd(a2, b2)); 116// a2 = internal::pload(&a[i+6*PacketSize]); 117// b2 = internal::pload(&b[i+6*PacketSize]); 118// 119// internal::pstore(&a[i+3*PacketSize], internal::padd(a3, b3)); 120// a3 = internal::pload(&a[i+7*PacketSize]); 121// b3 = internal::pload(&b[i+7*PacketSize]); 122// 123// internal::pstore(&a[i+4*PacketSize], internal::padd(a0, b0)); 124// internal::pstore(&a[i+5*PacketSize], internal::padd(a1, b1)); 125// internal::pstore(&a[i+6*PacketSize], internal::padd(a2, b2)); 126// internal::pstore(&a[i+7*PacketSize], internal::padd(a3, b3)); 127 128 internal::pstore(&a[i+2*PacketSize], internal::padd(internal::ploadu(&a[i+2*PacketSize]), internal::ploadu(&b[i+2*PacketSize]))); 129 internal::pstore(&a[i+3*PacketSize], internal::padd(internal::ploadu(&a[i+3*PacketSize]), internal::ploadu(&b[i+3*PacketSize]))); 130 internal::pstore(&a[i+4*PacketSize], internal::padd(internal::ploadu(&a[i+4*PacketSize]), internal::ploadu(&b[i+4*PacketSize]))); 131 internal::pstore(&a[i+5*PacketSize], internal::padd(internal::ploadu(&a[i+5*PacketSize]), internal::ploadu(&b[i+5*PacketSize]))); 132 internal::pstore(&a[i+6*PacketSize], internal::padd(internal::ploadu(&a[i+6*PacketSize]), internal::ploadu(&b[i+6*PacketSize]))); 133 internal::pstore(&a[i+7*PacketSize], internal::padd(internal::ploadu(&a[i+7*PacketSize]), internal::ploadu(&b[i+7*PacketSize]))); 134 } 135} 136