RowMajor Matrix-Vector mul x10 slower than ColMajor

Submitted by Nikolai

Assigned to Nobody

Link to original bugzilla bug (#1294)
Version: 3.3 (current stable)
Operating system: Windows

Description

Visual Studio 2015 Upd3, Windows10 x64. Full Optimization. Test code:

#include<Eigen/StdVector>  
using HRC = high_resolution_clock;  
  
template <class Type, class Type2, int RM1, int RM2>  
Matrix<Type2, 3, 1, RM2> mul(Matrix<Type, 4, 4, RM1> const& m, Matrix<Type2, 3, 1, RM2> const& p) {  
    Matrix<Type2, 4, 1, RM2> p4;  
    p4.template head&lt;3&gt;() = p;  
    p4[3] = 1;  
    return (m * p4.template cast&lt;Type&gt;()).template head&lt;3&gt;().template cast&lt;Type2&gt;();  
}  
  
template <typename T1, typename T2>  
void test(string cap, T1 init, T2 f) {  
    static constexpr size_t N = 2000000;  
    static constexpr size_t R = 20;  
  
    auto m = init();  
    using VT = Matrix<float, 3, 1>;  
    vector<VT, aligned_allocator&lt;VT&gt;> data(N);  
    for (auto& i:data) i.setRandom();  
  
    cout << cap << ": ";  
  
    int64_t avg = 0, avgmin = numeric_limits&lt;int64_t&gt;::max();  
    for (size_t i = 0; i < R; ++i) {  
        auto t1 = HRC::now();  
        f(data, m);  
        auto t2 = HRC::now();  
        auto dur = duration_cast&lt;microseconds&gt;(t2 - t1).count();  
        avg += dur;  
        avgmin = min(avgmin, dur);         
    }  
    avg /= R;  
    cout << "Avg: " << avg / 1000.0 << ", min: " << avgmin / 1000.0 << endl;  
}  
  
int main() {  
    test("MatFloat ColMajor",  
         [=] {  
             Matrix<float, 4, 4, ColMajor> m{};  
             m.setRandom();  
             return m;  
         },  
         [=](auto& data, auto const& m) {  
             for (size_t j = 0; j < data.size(); ++j) {  
                 data[j] = mul(m, data[j]);  
             }  
         }  
    );  
    test("MatFloat RowMajor",  
         [=] {  
             Matrix<float, 4, 4, RowMajor> m{};  
             m.setRandom();  
             return m;  
         },  
         [=](auto& data, auto const& m) {  
             for (size_t j = 0; j < data.size(); ++j) {  
                 data[j] = mul(m, data[j]);  
             }  
         }  
    );  
    test("MatDouble ColMajor",  
         [=] {  
             Matrix<double, 4, 4, ColMajor> m{};  
             m.setRandom();  
             return m;  
         },  
         [=](auto& data, auto const& m) {  
             for (size_t j = 0; j < data.size(); ++j) {  
                 data[j] = mul(m, data[j]);  
             }  
         }  
    );  
    test("MatDouble RowMajor",  
         [=] {  
             Matrix<double, 4, 4, RowMajor> m{};  
             m.setRandom();  
             return m;  
         },  
         [=](auto& data, auto const& m) {  
             for (size_t j = 0; j < data.size(); ++j) {  
                 data[j] = mul(m, data[j]);  
             }  
         }  
    );  
    return 0;  
}  

Results:
MatFloat ColMajor: Avg: 15.595, min: 14.881
MatFloat RowMajor: Avg: 243.246, min: 186.862
MatDouble ColMajor: Avg: 111.187, min: 77.764
MatDouble RowMajor: Avg: 270.19, min: 222.239

If I try to mul with something like this:

template <class Type, class Type2, int RM1, int RM2>  
Matrix<Type2, 3, 1, RM2> mul(Matrix<Type, 4, 4, RM1> const& m, Matrix<Type2, 3, 1, RM2> const& p) {  
    using RT = decltype(Type() * Type2());  
    Matrix<Type2, 3, 1, RM2> res;  
    for (size_t i = 0; i < 3; ++i) {  
        RT sum = 0;  
        for (size_t j = 0; j < 3; ++j) {  
            sum += m(i, j) * p[j];  
        }  
        sum += m(i, 3);  
        res[i] = static_cast&lt;Type2&gt;(sum);  
    }  
    return res;  
}  

than RowMajor multiplication works with the same speed as ColMajor.

Depends on

#312

Blocking

#814 (closed)

Edited by David Tellenbach