RowMajor Matrix-Vector mul x10 slower than ColMajor

Submitted by Nikolai

Assigned to Nobody

Link to original bugzilla bug (#1294)
Version: 3.3 (current stable)
Operating system: Windows

Description

Visual Studio 2015 Upd3, Windows10 x64. Full Optimization. Test code:

#include<Eigen/StdVector>

using HRC = high_resolution_clock;

template <class Type, class Type2, int RM1, int RM2>

Matrix<Type2, 3, 1, RM2> mul(Matrix<Type, 4, 4, RM1> const& m, Matrix<Type2, 3, 1, RM2> const& p) {

Matrix<Type2, 4, 1, RM2> p4;  

p4.template head&lt;3&gt;() = p;  

p4[3] = 1;  

return (m * p4.template cast&lt;Type&gt;()).template head&lt;3&gt;().template cast&lt;Type2&gt;();

}

template <typename T1, typename T2>

void test(string cap, T1 init, T2 f) {

static constexpr size_t N = 2000000;  

static constexpr size_t R = 20;  



auto m = init();  

using VT = Matrix<float, 3, 1>;  

vector<VT, aligned_allocator&lt;VT&gt;> data(N);  

for (auto& i:data) i.setRandom();  



cout << cap << ": ";  



int64_t avg = 0, avgmin = numeric_limits&lt;int64_t&gt;::max();  

for (size_t i = 0; i < R; ++i) {  

    auto t1 = HRC::now();  

    f(data, m);  

    auto t2 = HRC::now();  

    auto dur = duration_cast&lt;microseconds&gt;(t2 - t1).count();  

    avg += dur;  

    avgmin = min(avgmin, dur);         

}  

avg /= R;  

cout << "Avg: " << avg / 1000.0 << ", min: " << avgmin / 1000.0 << endl;

}

int main() {

test("MatFloat ColMajor",  

     [=] {  

         Matrix<float, 4, 4, ColMajor> m{};  

         m.setRandom();  

         return m;  

     },  

     [=](auto& data, auto const& m) {  

         for (size_t j = 0; j < data.size(); ++j) {  

             data[j] = mul(m, data[j]);  

         }  

     }  

);  

test("MatFloat RowMajor",  

     [=] {  

         Matrix<float, 4, 4, RowMajor> m{};  

         m.setRandom();  

         return m;  

     },  

     [=](auto& data, auto const& m) {  

         for (size_t j = 0; j < data.size(); ++j) {  

             data[j] = mul(m, data[j]);  

         }  

     }  

);  

test("MatDouble ColMajor",  

     [=] {  

         Matrix<double, 4, 4, ColMajor> m{};  

         m.setRandom();  

         return m;  

     },  

     [=](auto& data, auto const& m) {  

         for (size_t j = 0; j < data.size(); ++j) {  

             data[j] = mul(m, data[j]);  

         }  

     }  

);  

test("MatDouble RowMajor",  

     [=] {  

         Matrix<double, 4, 4, RowMajor> m{};  

         m.setRandom();  

         return m;  

     },  

     [=](auto& data, auto const& m) {  

         for (size_t j = 0; j < data.size(); ++j) {  

             data[j] = mul(m, data[j]);  

         }  

     }  

);  

return 0;

}

Results:

MatFloat ColMajor: Avg: 15.595, min: 14.881

MatFloat RowMajor: Avg: 243.246, min: 186.862

MatDouble ColMajor: Avg: 111.187, min: 77.764

MatDouble RowMajor: Avg: 270.19, min: 222.239

If I try to mul with something like this:

template <class Type, class Type2, int RM1, int RM2>

Matrix<Type2, 3, 1, RM2> mul(Matrix<Type, 4, 4, RM1> const& m, Matrix<Type2, 3, 1, RM2> const& p) {

using RT = decltype(Type() * Type2());  

Matrix<Type2, 3, 1, RM2> res;  

for (size_t i = 0; i < 3; ++i) {  

    RT sum = 0;  

    for (size_t j = 0; j < 3; ++j) {  

        sum += m(i, j) * p[j];  

    }  

    sum += m(i, 3);  

    res[i] = static_cast&lt;Type2&gt;(sum);  

}  

return res;

}

than RowMajor multiplication works with the same speed as ColMajor.

Depends on

#312

Blocking

#814