1c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
2c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#include <iostream>
3c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#include <Eigen/Core>
4c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#include <bench/BenchTimer.h>
5c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathusing namespace Eigen;
6c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
7c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifndef SIZE
8c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define SIZE 50
9c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif
10c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
11c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifndef REPEAT
12c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define REPEAT 10000
13c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif
14c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
15c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef float Scalar;
16c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
17c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath__attribute__ ((noinline)) void benchVec(Scalar* a, Scalar* b, Scalar* c, int size);
18c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath__attribute__ ((noinline)) void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c);
19c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath__attribute__ ((noinline)) void benchVec(VectorXf& a, VectorXf& b, VectorXf& c);
20c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
21c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathint main(int argc, char* argv[])
22c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
23c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    int size = SIZE * 8;
24c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    int size2 = size * size;
25c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    Scalar* a = internal::aligned_new<Scalar>(size2);
26c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    Scalar* b = internal::aligned_new<Scalar>(size2+4)+1;
27c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    Scalar* c = internal::aligned_new<Scalar>(size2);
28c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
29c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    for (int i=0; i<size; ++i)
30c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    {
31c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath        a[i] = b[i] = c[i] = 0;
32c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    }
33c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
34c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    BenchTimer timer;
35c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
36c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    timer.reset();
37c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    for (int k=0; k<10; ++k)
38c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    {
39c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath        timer.start();
40c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath        benchVec(a, b, c, size2);
41c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath        timer.stop();
42c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    }
43c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    std::cout << timer.value() << "s  " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
44c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    return 0;
45c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    for (int innersize = size; innersize>2 ; --innersize)
46c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    {
47c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath        if (size2%innersize==0)
48c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath        {
49c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath            int outersize = size2/innersize;
50c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath            MatrixXf ma = Map<MatrixXf>(a, innersize, outersize );
51c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath            MatrixXf mb = Map<MatrixXf>(b, innersize, outersize );
52c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath            MatrixXf mc = Map<MatrixXf>(c, innersize, outersize );
53c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath            timer.reset();
54c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath            for (int k=0; k<3; ++k)
55c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath            {
56c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath                timer.start();
57c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath                benchVec(ma, mb, mc);
58c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath                timer.stop();
59c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath            }
60c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath            std::cout << innersize << " x " << outersize << "  " << timer.value() << "s   " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
61c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath        }
62c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    }
63c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
64c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    VectorXf va = Map<VectorXf>(a, size2);
65c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    VectorXf vb = Map<VectorXf>(b, size2);
66c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    VectorXf vc = Map<VectorXf>(c, size2);
67c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    timer.reset();
68c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    for (int k=0; k<3; ++k)
69c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    {
70c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath        timer.start();
71c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath        benchVec(va, vb, vc);
72c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath        timer.stop();
73c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    }
74c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    std::cout << timer.value() << "s   " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
75c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
76c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    return 0;
77c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
78c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
79c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathvoid benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c)
80c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
81c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    for (int k=0; k<REPEAT; ++k)
82c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath        a = a + b;
83c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
84c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
85c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathvoid benchVec(VectorXf& a, VectorXf& b, VectorXf& c)
86c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
87c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    for (int k=0; k<REPEAT; ++k)
88c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath        a = a + b;
89c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
90c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
91c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathvoid benchVec(Scalar* a, Scalar* b, Scalar* c, int size)
92c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{
93c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    typedef internal::packet_traits<Scalar>::type PacketScalar;
94c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    const int PacketSize = internal::packet_traits<Scalar>::size;
95c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    PacketScalar a0, a1, a2, a3, b0, b1, b2, b3;
96c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath    for (int k=0; k<REPEAT; ++k)
97c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath        for (int i=0; i<size; i+=PacketSize*8)
98c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath        {
99c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//             a0 = internal::pload(&a[i]);
100c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//             b0 = internal::pload(&b[i]);
101c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//             a1 = internal::pload(&a[i+1*PacketSize]);
102c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//             b1 = internal::pload(&b[i+1*PacketSize]);
103c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//             a2 = internal::pload(&a[i+2*PacketSize]);
104c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//             b2 = internal::pload(&b[i+2*PacketSize]);
105c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//             a3 = internal::pload(&a[i+3*PacketSize]);
106c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//             b3 = internal::pload(&b[i+3*PacketSize]);
107c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//             internal::pstore(&a[i], internal::padd(a0, b0));
108c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//             a0 = internal::pload(&a[i+4*PacketSize]);
109c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//             b0 = internal::pload(&b[i+4*PacketSize]);
110c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//
111c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//             internal::pstore(&a[i+1*PacketSize], internal::padd(a1, b1));
112c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//             a1 = internal::pload(&a[i+5*PacketSize]);
113c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//             b1 = internal::pload(&b[i+5*PacketSize]);
114c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//
115c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//             internal::pstore(&a[i+2*PacketSize], internal::padd(a2, b2));
116c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//             a2 = internal::pload(&a[i+6*PacketSize]);
117c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//             b2 = internal::pload(&b[i+6*PacketSize]);
118c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//
119c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//             internal::pstore(&a[i+3*PacketSize], internal::padd(a3, b3));
120c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//             a3 = internal::pload(&a[i+7*PacketSize]);
121c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//             b3 = internal::pload(&b[i+7*PacketSize]);
122c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//
123c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//             internal::pstore(&a[i+4*PacketSize], internal::padd(a0, b0));
124c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//             internal::pstore(&a[i+5*PacketSize], internal::padd(a1, b1));
125c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//             internal::pstore(&a[i+6*PacketSize], internal::padd(a2, b2));
126c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath//             internal::pstore(&a[i+7*PacketSize], internal::padd(a3, b3));
127c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath
128c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath            internal::pstore(&a[i+2*PacketSize], internal::padd(internal::ploadu(&a[i+2*PacketSize]), internal::ploadu(&b[i+2*PacketSize])));
129c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath            internal::pstore(&a[i+3*PacketSize], internal::padd(internal::ploadu(&a[i+3*PacketSize]), internal::ploadu(&b[i+3*PacketSize])));
130c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath            internal::pstore(&a[i+4*PacketSize], internal::padd(internal::ploadu(&a[i+4*PacketSize]), internal::ploadu(&b[i+4*PacketSize])));
131c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath            internal::pstore(&a[i+5*PacketSize], internal::padd(internal::ploadu(&a[i+5*PacketSize]), internal::ploadu(&b[i+5*PacketSize])));
132c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath            internal::pstore(&a[i+6*PacketSize], internal::padd(internal::ploadu(&a[i+6*PacketSize]), internal::ploadu(&b[i+6*PacketSize])));
133c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath            internal::pstore(&a[i+7*PacketSize], internal::padd(internal::ploadu(&a[i+7*PacketSize]), internal::ploadu(&b[i+7*PacketSize])));
134c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath        }
135c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}
136