RowMajor Matrix-Vector mul x10 slower than ColMajor
Submitted by Nikolai
Assigned to Nobody
Link to original bugzilla bug (#1294)
Version: 3.3 (current stable)
Operating system: Windows
Description
Environment: Visual Studio 2015 Update 3, Windows 10 x64, full optimization enabled. Test code:
#include<Eigen/StdVector>
using HRC = high_resolution_clock;
// Multiplies the 4x4 matrix `m` by the 3-component vector `p` extended with a
// fourth component of 1. The product is computed in the matrix scalar type
// `Type`; the first three components of the result are converted back to
// `Type2` and returned.
template <class Type, class Type2, int RM1, int RM2>
Matrix<Type2, 3, 1, RM2> mul(Matrix<Type, 4, 4, RM1> const& m, Matrix<Type2, 3, 1, RM2> const& p) {
    // Build the 4-component operand: (p[0], p[1], p[2], 1).
    Matrix<Type2, 4, 1, RM2> extended;
    extended[3] = 1;
    extended.template head<3>() = p;

    // Kept as one expression so the matrix-vector product under test is
    // exactly the Eigen expression being benchmarked.
    return (m * extended.template cast<Type>()).template head<3>().template cast<Type2>();
}
// Times `f(data, m)` over R repetitions and prints the average and minimum
// wall-clock duration in milliseconds, prefixed by `cap`.
//
//   cap  - label printed before the timings.
//   init - callable with no arguments; its result `m` is passed to `f`.
//   f    - callable invoked as f(data, m), where `data` is a vector of N
//          randomly initialised Matrix<float, 3, 1> values. `data` is passed
//          by non-const reference and is reused across repetitions.
template <typename T1, typename T2>
void test(string cap, T1 init, T2 f) {
    static constexpr size_t N = 2000000;  // number of vectors handed to f per run
    static constexpr size_t R = 20;       // number of timed runs

    auto m = init();

    using VT = Matrix<float, 3, 1>;
    vector<VT, aligned_allocator<VT>> data(N);
    for (auto& v : data) v.setRandom();

    cout << cap << ": ";

    int64_t avg = 0;
    int64_t avgmin = numeric_limits<int64_t>::max();
    for (size_t i = 0; i < R; ++i) {
        auto const t1 = HRC::now();
        f(data, m);
        auto const t2 = HRC::now();

        // microseconds::rep is only required to be a signed integer of at
        // least 55 bits, so it need not be int64_t (e.g. long long vs. long).
        // Pinning the type keeps min() below deducible on every toolchain.
        int64_t const dur = duration_cast<microseconds>(t2 - t1).count();
        avg += dur;
        avgmin = min(avgmin, dur);
    }

    // Signed division: avoids converting the accumulated time to unsigned.
    avg /= static_cast<int64_t>(R);

    // Durations were collected in microseconds; report milliseconds.
    cout << "Avg: " << avg / 1000.0 << ", min: " << avgmin / 1000.0 << endl;
}
int main() {
test("MatFloat ColMajor",
[=] {
Matrix<float, 4, 4, ColMajor> m{};
m.setRandom();
return m;
},
[=](auto& data, auto const& m) {
for (size_t j = 0; j < data.size(); ++j) {
data[j] = mul(m, data[j]);
}
}
);
test("MatFloat RowMajor",
[=] {
Matrix<float, 4, 4, RowMajor> m{};
m.setRandom();
return m;
},
[=](auto& data, auto const& m) {
for (size_t j = 0; j < data.size(); ++j) {
data[j] = mul(m, data[j]);
}
}
);
test("MatDouble ColMajor",
[=] {
Matrix<double, 4, 4, ColMajor> m{};
m.setRandom();
return m;
},
[=](auto& data, auto const& m) {
for (size_t j = 0; j < data.size(); ++j) {
data[j] = mul(m, data[j]);
}
}
);
test("MatDouble RowMajor",
[=] {
Matrix<double, 4, 4, RowMajor> m{};
m.setRandom();
return m;
},
[=](auto& data, auto const& m) {
for (size_t j = 0; j < data.size(); ++j) {
data[j] = mul(m, data[j]);
}
}
);
return 0;
}
Results (times in milliseconds per run over 2,000,000 vectors):
MatFloat ColMajor: Avg: 15.595, min: 14.881
MatFloat RowMajor: Avg: 243.246, min: 186.862
MatDouble ColMajor: Avg: 111.187, min: 77.764
MatDouble RowMajor: Avg: 270.19, min: 222.239
If I instead perform the multiplication with a hand-written loop like this:
// Hand-written counterpart of the Eigen-expression mul(): for each of the
// first three rows of `m`, accumulates the products with p[0..2] and then
// adds the row's column-3 entry unscaled (equivalent to a fourth vector
// component of 1). Accumulation uses the type of Type * Type2; each row sum
// is converted to Type2 on store.
template <class Type, class Type2, int RM1, int RM2>
Matrix<Type2, 3, 1, RM2> mul(Matrix<Type, 4, 4, RM1> const& m, Matrix<Type2, 3, 1, RM2> const& p) {
    using Accum = decltype(Type() * Type2());

    Matrix<Type2, 3, 1, RM2> out;
    for (size_t row = 0; row != 3; ++row) {
        Accum acc = 0;
        for (size_t col = 0; col != 3; ++col) {
            acc += m(row, col) * p[col];
        }
        // Column 3 contributes last, so the summation order is unchanged.
        acc += m(row, 3);
        out[row] = static_cast<Type2>(acc);
    }
    return out;
}
then RowMajor multiplication runs at the same speed as ColMajor.
Depends on
Blocking
Edited by David Tellenbach