dsyrk blocking errors when compiled for Arm
Summary
I discovered an example where dsyrk from Eigen's blas returns the wrong result. If you set EIGEN_DONT_VECTORIZE it will return the correct result.
Environment
- Operating System : Linux
- Architecture : Arm64
- Eigen Version : Internal, stays relatively close to head
- Compiler Version : Clang close to head
- Compile Flags : -std=gnu++20 -DEIGEN_USE_CUSTOM_PLAIN_ASSERT -DEIGEN_AVOID_STL_ARRAY -DEIGEN_ALLOW_UNALIGNED_SCALARS '-DEIGEN_USE_AVX512_GEMM_KERNELS=0' -D_COMPLEX_DEFINED -O1 -mcpu=neoverse-n1 -mtune=generic
- Vector Extension : NEON
Minimal Example
#include <iostream>
#include "eigen3/Eigen/Dense"
#include "lapack/blas.h" // This will point to Eigen's blas
using Mat = Eigen::MatrixXd;
// A is a 72 x 72, but we specifically set a 8x64 block of it
Mat CreateA();
int main() {
auto A = CreateA();
Mat C = Mat::Zero(72, 72);
// Blocking choice came from CreateA
Mat expected = -1.0 * A * A.transpose();
Mat C_eig = expected.triangularView<Eigen::Lower>();
char Lc = 'L';
char Nc = 'N';
int N = 8;
int K = 64;
double alpha = -1.0;
int LDA = 72;
double beta = 1.0;
int LDC = 72;
dsyrk_(&Lc, &Nc, &N, &K, &alpha, A.data(), &LDA, &beta, C.data(), &LDC);
Mat C_lapack = C.triangularView<Eigen::Lower>();
// Print one bigger to ensure zeros
std::cout << "C_eig:\n" << C_eig.block(0, 0, 9, 9) << "\n";
std::cout << "C_lapack:\n" << C_lapack.block(0, 0, 9, 9) << "\n";
std::cout << "C_eig norm: " << C_eig.norm() << "\n";
std::cout << "C_lapack norm: " << C_lapack.norm() << "\n";
std::cout << "C_eig(8,8) norm: " << C_eig.block(0, 0, 8, 8).norm() << "\n";
std::cout << "C_lapack(8,8) norm: " << C_lapack.block(0, 0, 8, 8).norm()
<< "\n";
}
// Entries taken from a syrk input from a subroutine call inside dpotrf. The
// initial matrix fed to dpotrf_ was randomly generated
Mat CreateA() {
Mat A(72, 72);
A.setZero();
// Pick luckily the simplest block fails
auto Ablock = A.block(0, 0, 8, 64);
Ablock(0, 0) = 0.6014972515825333;
Ablock(1, 0) = 0.3463291377848287;
Ablock(1, 1) = 0.1433834457923558;
Ablock(2, 0) = 0.08834935294690587;
Ablock(2, 1) = -0.24390927030593823;
Ablock(2, 2) = 0.3374517442475161;
Ablock(3, 0) = -0.09852634261866884;
Ablock(3, 1) = -0.18653364899151578;
Ablock(3, 2) = -0.2366083911488236;
Ablock(3, 3) = -0.22813981104706446;
Ablock(4, 0) = -0.6090523238388824;
Ablock(4, 1) = -0.13335976785466266;
Ablock(4, 2) = 0.3559039950058597;
Ablock(4, 3) = 0.30894238530243967;
Ablock(4, 4) = 0.08961005352392773;
Ablock(5, 0) = -0.5117131786556051;
Ablock(5, 1) = -0.0862605796645381;
Ablock(5, 2) = -0.3072855197079403;
Ablock(5, 3) = 0.20850034127874192;
Ablock(5, 4) = -0.2300442975436204;
Ablock(5, 5) = 0.21898534483535628;
Ablock(6, 0) = 0.1945492628240792;
Ablock(6, 1) = -0.018396002521279774;
Ablock(6, 2) = 0.1077637187869938;
Ablock(6, 3) = -0.4624790491866455;
Ablock(6, 4) = -0.24510522102852658;
Ablock(6, 5) = 0.10175703936029593;
Ablock(6, 6) = 0.08345159392284988;
Ablock(7, 0) = -0.11818767733937548;
Ablock(7, 1) = -0.11805774374855535;
Ablock(7, 2) = -0.22272657206698346;
Ablock(7, 3) = -0.15570098411904953;
Ablock(7, 4) = -0.08250572840305137;
Ablock(7, 5) = -0.25874563785041493;
Ablock(7, 6) = -0.060486232339923236;
Ablock(7, 7) = -0.10994995939632299;
return A;
}
Steps to reproduce
Run the code above on an Arm machine
What is the current bug behavior?
C_eig:
-0.361799 0 0 0 0 0 0 0 0
-0.208316 -0.140503 0 0 0 0 0 0 0
-0.0531419 0.0043746 -0.181171 0 0 0 0 0 0
0.0592633 0.0608684 0.0430514 -0.152534 0 0 0 0 0
0.366343 0.230054 -0.0988187 0.0698081 -0.618873 0 0 0 0
0.307794 0.18959 0.127864 -0.0916468 -0.2576 -0.508063 0 0 0
-0.117021 -0.0647404 -0.0580403 -0.0642753 0.242527 0.148839 -0.341083 0 0
0.0710896 0.0578594 0.0568059 -0.121887 0.0470386 -0.0689573 -0.0160308 -0.19126 0
0 0 0 0 0 0 0 0 0
C_lapack:
-0.361799 0 0 0 0 0 0 0 0
-0.120988 -0.0465727 0 0 0 0 0 0 0
-0.201695 -0.203795 0.121954 0 0 0 0 0 0
-0.0847701 -0.0470829 -0.0169131 -0.102823 0 0 0 0 0
0.326321 0.118309 0.142007 0.0482944 0 0 0 0 0
0.205125 0.262417 0.0449074 -0.0989727 0 0 0 0 0
-0.226117 -0.100868 0.0759129 -0.033795 0 0 0 0 0
0.104479 -0.0575596 -0.018904 -0.193595 0 0 0 0 0
0 0 0 0 0 0 0 0 0
C_eig norm: 1.27268
C_lapack norm: 0.806882
C_eig(8,8) norm: 1.27268
C_lapack(8,8) norm: 0.806882
What is the expected correct behavior?
When run on an X86 machine
C_eig:
-0.361799 0 0 0 0 0 0 0 0
-0.208316 -0.140503 0 0 0 0 0 0 0
-0.0531419 0.0043746 -0.181171 0 0 0 0 0 0
0.0592633 0.0608684 0.0430514 -0.152534 0 0 0 0 0
0.366343 0.230054 -0.0988187 0.0698081 -0.618873 0 0 0 0
0.307794 0.18959 0.127864 -0.0916468 -0.2576 -0.508063 0 0 0
-0.117021 -0.0647404 -0.0580403 -0.0642753 0.242527 0.148839 -0.341083 0 0
0.0710896 0.0578594 0.0568059 -0.121887 0.0470386 -0.0689573 -0.0160308 -0.19126 0
0 0 0 0 0 0 0 0 0
C_lapack:
-0.361799 0 0 0 0 0 0 0 0
-0.208316 -0.140503 0 0 0 0 0 0 0
-0.0531419 0.0043746 -0.181171 0 0 0 0 0 0
0.0592633 0.0608684 0.0430514 -0.152534 0 0 0 0 0
0.366343 0.230054 -0.0988187 0.0698081 -0.618873 0 0 0 0
0.307794 0.18959 0.127864 -0.0916468 -0.2576 -0.508063 0 0 0
-0.117021 -0.0647404 -0.0580403 -0.0642753 0.242527 0.148839 -0.341083 0 0
0.0710896 0.0578594 0.0568059 -0.121887 0.0470386 -0.0689573 -0.0160308 -0.19126 0
0 0 0 0 0 0 0 0 0
C_eig norm: 1.27268
C_lapack norm: 1.27268
C_eig(8,8) norm: 1.27268
C_lapack(8,8) norm: 1.27268
Finally, if you turn off vectorization for Eigen then you will get the correct result on Arm.