1c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 2c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#include <iostream> 3c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#include <Eigen/Core> 4c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#include <bench/BenchTimer.h> 5c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathusing namespace Eigen; 6c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 7c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifndef SIZE 8c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define SIZE 50 9c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif 10c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 11c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifndef REPEAT 12c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define REPEAT 10000 13c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif 14c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 15c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef float Scalar; 16c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 17c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath__attribute__ ((noinline)) void benchVec(Scalar* a, Scalar* b, Scalar* c, int size); 18c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath__attribute__ ((noinline)) void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c); 19c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath__attribute__ ((noinline)) void benchVec(VectorXf& a, VectorXf& b, VectorXf& c); 20c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 21c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathint main(int argc, char* argv[]) 22c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 23c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath int size = SIZE * 8; 24c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath int size2 = size * size; 25c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Scalar* a = internal::aligned_new<Scalar>(size2); 26c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Scalar* b = internal::aligned_new<Scalar>(size2+4)+1; 27c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Scalar* c = internal::aligned_new<Scalar>(size2); 28c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 29c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for (int i=0; i<size; ++i) 30c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 31c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a[i] = b[i] = c[i] = 0; 32c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 33c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 34c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath BenchTimer timer; 35c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 36c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath timer.reset(); 37c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for (int k=0; k<10; ++k) 38c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 39c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath timer.start(); 40c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath benchVec(a, b, c, size2); 41c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath timer.stop(); 42c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 43c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath std::cout << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n"; 44c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return 0; 45c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for (int innersize = size; innersize>2 ; --innersize) 46c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 47c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath if (size2%innersize==0) 48c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 49c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath int outersize = size2/innersize; 50c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath MatrixXf ma = Map<MatrixXf>(a, innersize, outersize ); 51c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath MatrixXf mb = Map<MatrixXf>(b, innersize, outersize ); 52c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath MatrixXf mc = Map<MatrixXf>(c, innersize, outersize ); 53c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath timer.reset(); 54c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for (int k=0; k<3; ++k) 55c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 56c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath timer.start(); 57c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath benchVec(ma, mb, mc); 58c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath timer.stop(); 59c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 60c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath std::cout << innersize << " x " << outersize << " " << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n"; 61c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 62c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 63c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 64c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath VectorXf va = Map<VectorXf>(a, size2); 65c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath VectorXf vb = Map<VectorXf>(b, size2); 66c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath VectorXf vc = Map<VectorXf>(c, size2); 67c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath timer.reset(); 68c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for (int k=0; k<3; ++k) 69c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 70c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath timer.start(); 71c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath benchVec(va, vb, vc); 72c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath timer.stop(); 73c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 74c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath std::cout << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n"; 75c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 76c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath return 0; 77c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 78c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 79c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathvoid benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c) 80c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 81c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for (int k=0; k<REPEAT; ++k) 82c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a = a + b; 83c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 84c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 85c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathvoid benchVec(VectorXf& a, VectorXf& b, VectorXf& c) 86c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 87c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for (int k=0; k<REPEAT; ++k) 88c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath a = a + b; 89c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 90c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 91c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathvoid benchVec(Scalar* a, Scalar* b, Scalar* c, int size) 92c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 93c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef internal::packet_traits<Scalar>::type PacketScalar; 94c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const int PacketSize = internal::packet_traits<Scalar>::size; 95c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath PacketScalar a0, a1, a2, a3, b0, b1, b2, b3; 96c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for (int k=0; k<REPEAT; ++k) 97c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for (int i=0; i<size; i+=PacketSize*8) 98c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 99c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// a0 = internal::pload(&a[i]); 100c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// b0 = internal::pload(&b[i]); 101c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// a1 = internal::pload(&a[i+1*PacketSize]); 102c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// b1 = internal::pload(&b[i+1*PacketSize]); 103c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// a2 = internal::pload(&a[i+2*PacketSize]); 104c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// b2 = internal::pload(&b[i+2*PacketSize]); 105c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// a3 = internal::pload(&a[i+3*PacketSize]); 106c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// b3 = internal::pload(&b[i+3*PacketSize]); 107c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// internal::pstore(&a[i], internal::padd(a0, b0)); 108c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// a0 = internal::pload(&a[i+4*PacketSize]); 109c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// b0 = internal::pload(&b[i+4*PacketSize]); 110c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// 111c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// internal::pstore(&a[i+1*PacketSize], internal::padd(a1, b1)); 112c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// a1 = internal::pload(&a[i+5*PacketSize]); 113c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// b1 = internal::pload(&b[i+5*PacketSize]); 114c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// 115c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// internal::pstore(&a[i+2*PacketSize], internal::padd(a2, b2)); 116c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// a2 = internal::pload(&a[i+6*PacketSize]); 117c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// b2 = internal::pload(&b[i+6*PacketSize]); 118c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// 119c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// internal::pstore(&a[i+3*PacketSize], internal::padd(a3, b3)); 120c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// a3 = internal::pload(&a[i+7*PacketSize]); 121c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// b3 = internal::pload(&b[i+7*PacketSize]); 122c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// 123c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// internal::pstore(&a[i+4*PacketSize], internal::padd(a0, b0)); 124c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// internal::pstore(&a[i+5*PacketSize], internal::padd(a1, b1)); 125c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// internal::pstore(&a[i+6*PacketSize], internal::padd(a2, b2)); 126c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// internal::pstore(&a[i+7*PacketSize], internal::padd(a3, b3)); 127c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 128c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath internal::pstore(&a[i+2*PacketSize], internal::padd(internal::ploadu(&a[i+2*PacketSize]), internal::ploadu(&b[i+2*PacketSize]))); 129c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath internal::pstore(&a[i+3*PacketSize], internal::padd(internal::ploadu(&a[i+3*PacketSize]), internal::ploadu(&b[i+3*PacketSize]))); 130c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath internal::pstore(&a[i+4*PacketSize], internal::padd(internal::ploadu(&a[i+4*PacketSize]), internal::ploadu(&b[i+4*PacketSize]))); 131c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath internal::pstore(&a[i+5*PacketSize], internal::padd(internal::ploadu(&a[i+5*PacketSize]), internal::ploadu(&b[i+5*PacketSize]))); 132c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath internal::pstore(&a[i+6*PacketSize], internal::padd(internal::ploadu(&a[i+6*PacketSize]), internal::ploadu(&b[i+6*PacketSize]))); 133c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath internal::pstore(&a[i+7*PacketSize], internal::padd(internal::ploadu(&a[i+7*PacketSize]), internal::ploadu(&b[i+7*PacketSize]))); 134c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 135c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 136