1 
2 #include <iostream>
3 #include <Eigen/Core>
4 #include <bench/BenchTimer.h>
5 using namespace Eigen;
6 
7 #ifndef SIZE
8 #define SIZE 50
9 #endif
10 
11 #ifndef REPEAT
12 #define REPEAT 10000
13 #endif
14 
15 typedef float Scalar;
16 
17 __attribute__ ((noinline)) void benchVec(Scalar* a, Scalar* b, Scalar* c, int size);
18 __attribute__ ((noinline)) void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c);
19 __attribute__ ((noinline)) void benchVec(VectorXf& a, VectorXf& b, VectorXf& c);
20 
main(int argc,char * argv[])21 int main(int argc, char* argv[])
22 {
23     int size = SIZE * 8;
24     int size2 = size * size;
25     Scalar* a = internal::aligned_new<Scalar>(size2);
26     Scalar* b = internal::aligned_new<Scalar>(size2+4)+1;
27     Scalar* c = internal::aligned_new<Scalar>(size2);
28 
29     for (int i=0; i<size; ++i)
30     {
31         a[i] = b[i] = c[i] = 0;
32     }
33 
34     BenchTimer timer;
35 
36     timer.reset();
37     for (int k=0; k<10; ++k)
38     {
39         timer.start();
40         benchVec(a, b, c, size2);
41         timer.stop();
42     }
43     std::cout << timer.value() << "s  " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
44     return 0;
45     for (int innersize = size; innersize>2 ; --innersize)
46     {
47         if (size2%innersize==0)
48         {
49             int outersize = size2/innersize;
50             MatrixXf ma = Map<MatrixXf>(a, innersize, outersize );
51             MatrixXf mb = Map<MatrixXf>(b, innersize, outersize );
52             MatrixXf mc = Map<MatrixXf>(c, innersize, outersize );
53             timer.reset();
54             for (int k=0; k<3; ++k)
55             {
56                 timer.start();
57                 benchVec(ma, mb, mc);
58                 timer.stop();
59             }
60             std::cout << innersize << " x " << outersize << "  " << timer.value() << "s   " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
61         }
62     }
63 
64     VectorXf va = Map<VectorXf>(a, size2);
65     VectorXf vb = Map<VectorXf>(b, size2);
66     VectorXf vc = Map<VectorXf>(c, size2);
67     timer.reset();
68     for (int k=0; k<3; ++k)
69     {
70         timer.start();
71         benchVec(va, vb, vc);
72         timer.stop();
73     }
74     std::cout << timer.value() << "s   " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
75 
76     return 0;
77 }
78 
benchVec(MatrixXf & a,MatrixXf & b,MatrixXf & c)79 void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c)
80 {
81     for (int k=0; k<REPEAT; ++k)
82         a = a + b;
83 }
84 
benchVec(VectorXf & a,VectorXf & b,VectorXf & c)85 void benchVec(VectorXf& a, VectorXf& b, VectorXf& c)
86 {
87     for (int k=0; k<REPEAT; ++k)
88         a = a + b;
89 }
90 
benchVec(Scalar * a,Scalar * b,Scalar * c,int size)91 void benchVec(Scalar* a, Scalar* b, Scalar* c, int size)
92 {
93     typedef internal::packet_traits<Scalar>::type PacketScalar;
94     const int PacketSize = internal::packet_traits<Scalar>::size;
95     PacketScalar a0, a1, a2, a3, b0, b1, b2, b3;
96     for (int k=0; k<REPEAT; ++k)
97         for (int i=0; i<size; i+=PacketSize*8)
98         {
99 //             a0 = internal::pload(&a[i]);
100 //             b0 = internal::pload(&b[i]);
101 //             a1 = internal::pload(&a[i+1*PacketSize]);
102 //             b1 = internal::pload(&b[i+1*PacketSize]);
103 //             a2 = internal::pload(&a[i+2*PacketSize]);
104 //             b2 = internal::pload(&b[i+2*PacketSize]);
105 //             a3 = internal::pload(&a[i+3*PacketSize]);
106 //             b3 = internal::pload(&b[i+3*PacketSize]);
107 //             internal::pstore(&a[i], internal::padd(a0, b0));
108 //             a0 = internal::pload(&a[i+4*PacketSize]);
109 //             b0 = internal::pload(&b[i+4*PacketSize]);
110 //
111 //             internal::pstore(&a[i+1*PacketSize], internal::padd(a1, b1));
112 //             a1 = internal::pload(&a[i+5*PacketSize]);
113 //             b1 = internal::pload(&b[i+5*PacketSize]);
114 //
115 //             internal::pstore(&a[i+2*PacketSize], internal::padd(a2, b2));
116 //             a2 = internal::pload(&a[i+6*PacketSize]);
117 //             b2 = internal::pload(&b[i+6*PacketSize]);
118 //
119 //             internal::pstore(&a[i+3*PacketSize], internal::padd(a3, b3));
120 //             a3 = internal::pload(&a[i+7*PacketSize]);
121 //             b3 = internal::pload(&b[i+7*PacketSize]);
122 //
123 //             internal::pstore(&a[i+4*PacketSize], internal::padd(a0, b0));
124 //             internal::pstore(&a[i+5*PacketSize], internal::padd(a1, b1));
125 //             internal::pstore(&a[i+6*PacketSize], internal::padd(a2, b2));
126 //             internal::pstore(&a[i+7*PacketSize], internal::padd(a3, b3));
127 
128             internal::pstore(&a[i+2*PacketSize], internal::padd(internal::ploadu(&a[i+2*PacketSize]), internal::ploadu(&b[i+2*PacketSize])));
129             internal::pstore(&a[i+3*PacketSize], internal::padd(internal::ploadu(&a[i+3*PacketSize]), internal::ploadu(&b[i+3*PacketSize])));
130             internal::pstore(&a[i+4*PacketSize], internal::padd(internal::ploadu(&a[i+4*PacketSize]), internal::ploadu(&b[i+4*PacketSize])));
131             internal::pstore(&a[i+5*PacketSize], internal::padd(internal::ploadu(&a[i+5*PacketSize]), internal::ploadu(&b[i+5*PacketSize])));
132             internal::pstore(&a[i+6*PacketSize], internal::padd(internal::ploadu(&a[i+6*PacketSize]), internal::ploadu(&b[i+6*PacketSize])));
133             internal::pstore(&a[i+7*PacketSize], internal::padd(internal::ploadu(&a[i+7*PacketSize]), internal::ploadu(&b[i+7*PacketSize])));
134         }
135 }
136