1
2#include <iostream>
3#include <Eigen/Core>
4#include <bench/BenchTimer.h>
5using namespace Eigen;
6
7#ifndef SIZE
8#define SIZE 50
9#endif
10
11#ifndef REPEAT
12#define REPEAT 10000
13#endif
14
15typedef float Scalar;
16
17__attribute__ ((noinline)) void benchVec(Scalar* a, Scalar* b, Scalar* c, int size);
18__attribute__ ((noinline)) void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c);
19__attribute__ ((noinline)) void benchVec(VectorXf& a, VectorXf& b, VectorXf& c);
20
21int main(int argc, char* argv[])
22{
23    int size = SIZE * 8;
24    int size2 = size * size;
25    Scalar* a = internal::aligned_new<Scalar>(size2);
26    Scalar* b = internal::aligned_new<Scalar>(size2+4)+1;
27    Scalar* c = internal::aligned_new<Scalar>(size2);
28
29    for (int i=0; i<size; ++i)
30    {
31        a[i] = b[i] = c[i] = 0;
32    }
33
34    BenchTimer timer;
35
36    timer.reset();
37    for (int k=0; k<10; ++k)
38    {
39        timer.start();
40        benchVec(a, b, c, size2);
41        timer.stop();
42    }
43    std::cout << timer.value() << "s  " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
44    return 0;
45    for (int innersize = size; innersize>2 ; --innersize)
46    {
47        if (size2%innersize==0)
48        {
49            int outersize = size2/innersize;
50            MatrixXf ma = Map<MatrixXf>(a, innersize, outersize );
51            MatrixXf mb = Map<MatrixXf>(b, innersize, outersize );
52            MatrixXf mc = Map<MatrixXf>(c, innersize, outersize );
53            timer.reset();
54            for (int k=0; k<3; ++k)
55            {
56                timer.start();
57                benchVec(ma, mb, mc);
58                timer.stop();
59            }
60            std::cout << innersize << " x " << outersize << "  " << timer.value() << "s   " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
61        }
62    }
63
64    VectorXf va = Map<VectorXf>(a, size2);
65    VectorXf vb = Map<VectorXf>(b, size2);
66    VectorXf vc = Map<VectorXf>(c, size2);
67    timer.reset();
68    for (int k=0; k<3; ++k)
69    {
70        timer.start();
71        benchVec(va, vb, vc);
72        timer.stop();
73    }
74    std::cout << timer.value() << "s   " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
75
76    return 0;
77}
78
79void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c)
80{
81    for (int k=0; k<REPEAT; ++k)
82        a = a + b;
83}
84
85void benchVec(VectorXf& a, VectorXf& b, VectorXf& c)
86{
87    for (int k=0; k<REPEAT; ++k)
88        a = a + b;
89}
90
91void benchVec(Scalar* a, Scalar* b, Scalar* c, int size)
92{
93    typedef internal::packet_traits<Scalar>::type PacketScalar;
94    const int PacketSize = internal::packet_traits<Scalar>::size;
95    PacketScalar a0, a1, a2, a3, b0, b1, b2, b3;
96    for (int k=0; k<REPEAT; ++k)
97        for (int i=0; i<size; i+=PacketSize*8)
98        {
99//             a0 = internal::pload(&a[i]);
100//             b0 = internal::pload(&b[i]);
101//             a1 = internal::pload(&a[i+1*PacketSize]);
102//             b1 = internal::pload(&b[i+1*PacketSize]);
103//             a2 = internal::pload(&a[i+2*PacketSize]);
104//             b2 = internal::pload(&b[i+2*PacketSize]);
105//             a3 = internal::pload(&a[i+3*PacketSize]);
106//             b3 = internal::pload(&b[i+3*PacketSize]);
107//             internal::pstore(&a[i], internal::padd(a0, b0));
108//             a0 = internal::pload(&a[i+4*PacketSize]);
109//             b0 = internal::pload(&b[i+4*PacketSize]);
110//
111//             internal::pstore(&a[i+1*PacketSize], internal::padd(a1, b1));
112//             a1 = internal::pload(&a[i+5*PacketSize]);
113//             b1 = internal::pload(&b[i+5*PacketSize]);
114//
115//             internal::pstore(&a[i+2*PacketSize], internal::padd(a2, b2));
116//             a2 = internal::pload(&a[i+6*PacketSize]);
117//             b2 = internal::pload(&b[i+6*PacketSize]);
118//
119//             internal::pstore(&a[i+3*PacketSize], internal::padd(a3, b3));
120//             a3 = internal::pload(&a[i+7*PacketSize]);
121//             b3 = internal::pload(&b[i+7*PacketSize]);
122//
123//             internal::pstore(&a[i+4*PacketSize], internal::padd(a0, b0));
124//             internal::pstore(&a[i+5*PacketSize], internal::padd(a1, b1));
125//             internal::pstore(&a[i+6*PacketSize], internal::padd(a2, b2));
126//             internal::pstore(&a[i+7*PacketSize], internal::padd(a3, b3));
127
128            internal::pstore(&a[i+2*PacketSize], internal::padd(internal::ploadu(&a[i+2*PacketSize]), internal::ploadu(&b[i+2*PacketSize])));
129            internal::pstore(&a[i+3*PacketSize], internal::padd(internal::ploadu(&a[i+3*PacketSize]), internal::ploadu(&b[i+3*PacketSize])));
130            internal::pstore(&a[i+4*PacketSize], internal::padd(internal::ploadu(&a[i+4*PacketSize]), internal::ploadu(&b[i+4*PacketSize])));
131            internal::pstore(&a[i+5*PacketSize], internal::padd(internal::ploadu(&a[i+5*PacketSize]), internal::ploadu(&b[i+5*PacketSize])));
132            internal::pstore(&a[i+6*PacketSize], internal::padd(internal::ploadu(&a[i+6*PacketSize]), internal::ploadu(&b[i+6*PacketSize])));
133            internal::pstore(&a[i+7*PacketSize], internal::padd(internal::ploadu(&a[i+7*PacketSize]), internal::ploadu(&b[i+7*PacketSize])));
134        }
135}
136