Skip to content

dsyrk blocking errors when compiled for Arm

Summary

I discovered an example where dsyrk from Eigen's blas returns the wrong result. If you set EIGEN_DONT_VECTORIZE it will return the correct result.

Environment

  • Operating System : Linux
  • Architecture : Arm64
  • Eigen Version : Internal, stays relatively close to head
  • Compiler Version : Clang close to head
  • Compile Flags : -std=gnu++20 -DEIGEN_USE_CUSTOM_PLAIN_ASSERT -DEIGEN_AVOID_STL_ARRAY -DEIGEN_ALLOW_UNALIGNED_SCALARS '-DEIGEN_USE_AVX512_GEMM_KERNELS=0' -D_COMPLEX_DEFINED -O1 -mcpu=neoverse-n1 -mtune=generic
  • Vector Extension : NEON

Minimal Example

#include <iostream>

#include "eigen3/Eigen/Dense"
#include "lapack/blas.h" // This will point to Eigen's blas

using Mat = Eigen::MatrixXd;

// A is a 72 x 72, but we specifically set a 8x64 block of it
Mat CreateA();

int main() {
  auto A = CreateA();
  Mat C = Mat::Zero(72, 72);

  // Blocking choice came from CreateA
  Mat expected = -1.0 * A * A.transpose();
  Mat C_eig = expected.triangularView<Eigen::Lower>();

  char Lc = 'L';
  char Nc = 'N';
  int N = 8;
  int K = 64;
  double alpha = -1.0;
  int LDA = 72;
  double beta = 1.0;
  int LDC = 72;

  dsyrk_(&Lc, &Nc, &N, &K, &alpha, A.data(), &LDA, &beta, C.data(), &LDC);
  Mat C_lapack = C.triangularView<Eigen::Lower>();

  // Print one bigger to ensure zeros
  std::cout << "C_eig:\n" << C_eig.block(0, 0, 9, 9) << "\n";
  std::cout << "C_lapack:\n" << C_lapack.block(0, 0, 9, 9) << "\n";

  std::cout << "C_eig norm: " << C_eig.norm() << "\n";
  std::cout << "C_lapack norm: " << C_lapack.norm() << "\n";

  std::cout << "C_eig(8,8) norm: " << C_eig.block(0, 0, 8, 8).norm() << "\n";
  std::cout << "C_lapack(8,8) norm: " << C_lapack.block(0, 0, 8, 8).norm()
            << "\n";
}


// Entries taken from a syrk input from a subroutine call inside dpotrf.  The
// initial matrix fed to dpotrf_ was randomly generated
Mat CreateA() {
  Mat A(72, 72);
  A.setZero();

  // Pick luckily the simplest block fails
  auto Ablock = A.block(0, 0, 8, 64);
  Ablock(0, 0) = 0.6014972515825333;
  Ablock(1, 0) = 0.3463291377848287;
  Ablock(1, 1) = 0.1433834457923558;
  Ablock(2, 0) = 0.08834935294690587;
  Ablock(2, 1) = -0.24390927030593823;
  Ablock(2, 2) = 0.3374517442475161;
  Ablock(3, 0) = -0.09852634261866884;
  Ablock(3, 1) = -0.18653364899151578;
  Ablock(3, 2) = -0.2366083911488236;
  Ablock(3, 3) = -0.22813981104706446;
  Ablock(4, 0) = -0.6090523238388824;
  Ablock(4, 1) = -0.13335976785466266;
  Ablock(4, 2) = 0.3559039950058597;
  Ablock(4, 3) = 0.30894238530243967;
  Ablock(4, 4) = 0.08961005352392773;
  Ablock(5, 0) = -0.5117131786556051;
  Ablock(5, 1) = -0.0862605796645381;
  Ablock(5, 2) = -0.3072855197079403;
  Ablock(5, 3) = 0.20850034127874192;
  Ablock(5, 4) = -0.2300442975436204;
  Ablock(5, 5) = 0.21898534483535628;
  Ablock(6, 0) = 0.1945492628240792;
  Ablock(6, 1) = -0.018396002521279774;
  Ablock(6, 2) = 0.1077637187869938;
  Ablock(6, 3) = -0.4624790491866455;
  Ablock(6, 4) = -0.24510522102852658;
  Ablock(6, 5) = 0.10175703936029593;
  Ablock(6, 6) = 0.08345159392284988;
  Ablock(7, 0) = -0.11818767733937548;
  Ablock(7, 1) = -0.11805774374855535;
  Ablock(7, 2) = -0.22272657206698346;
  Ablock(7, 3) = -0.15570098411904953;
  Ablock(7, 4) = -0.08250572840305137;
  Ablock(7, 5) = -0.25874563785041493;
  Ablock(7, 6) = -0.060486232339923236;
  Ablock(7, 7) = -0.10994995939632299;

  return A;
}

Steps to reproduce

Run the code above on an Arm machine

What is the current bug behavior?

C_eig:
 -0.361799          0          0          0          0          0          0          0          0
 -0.208316  -0.140503          0          0          0          0          0          0          0
-0.0531419  0.0043746  -0.181171          0          0          0          0          0          0
 0.0592633  0.0608684  0.0430514  -0.152534          0          0          0          0          0
  0.366343   0.230054 -0.0988187  0.0698081  -0.618873          0          0          0          0
  0.307794    0.18959   0.127864 -0.0916468    -0.2576  -0.508063          0          0          0
 -0.117021 -0.0647404 -0.0580403 -0.0642753   0.242527   0.148839  -0.341083          0          0
 0.0710896  0.0578594  0.0568059  -0.121887  0.0470386 -0.0689573 -0.0160308   -0.19126          0
         0          0          0          0          0          0          0          0          0
C_lapack:
 -0.361799          0          0          0          0          0          0          0          0
 -0.120988 -0.0465727          0          0          0          0          0          0          0
 -0.201695  -0.203795   0.121954          0          0          0          0          0          0
-0.0847701 -0.0470829 -0.0169131  -0.102823          0          0          0          0          0
  0.326321   0.118309   0.142007  0.0482944          0          0          0          0          0
  0.205125   0.262417  0.0449074 -0.0989727          0          0          0          0          0
 -0.226117  -0.100868  0.0759129  -0.033795          0          0          0          0          0
  0.104479 -0.0575596  -0.018904  -0.193595          0          0          0          0          0
         0          0          0          0          0          0          0          0          0
C_eig norm: 1.27268
C_lapack norm: 0.806882
C_eig(8,8) norm: 1.27268
C_lapack(8,8) norm: 0.806882

What is the expected correct behavior?

When run on an X86 machine

C_eig:
 -0.361799          0          0          0          0          0          0          0          0
 -0.208316  -0.140503          0          0          0          0          0          0          0
-0.0531419  0.0043746  -0.181171          0          0          0          0          0          0
 0.0592633  0.0608684  0.0430514  -0.152534          0          0          0          0          0
  0.366343   0.230054 -0.0988187  0.0698081  -0.618873          0          0          0          0
  0.307794    0.18959   0.127864 -0.0916468    -0.2576  -0.508063          0          0          0
 -0.117021 -0.0647404 -0.0580403 -0.0642753   0.242527   0.148839  -0.341083          0          0
 0.0710896  0.0578594  0.0568059  -0.121887  0.0470386 -0.0689573 -0.0160308   -0.19126          0
         0          0          0          0          0          0          0          0          0
C_lapack:
 -0.361799          0          0          0          0          0          0          0          0
 -0.208316  -0.140503          0          0          0          0          0          0          0
-0.0531419  0.0043746  -0.181171          0          0          0          0          0          0
 0.0592633  0.0608684  0.0430514  -0.152534          0          0          0          0          0
  0.366343   0.230054 -0.0988187  0.0698081  -0.618873          0          0          0          0
  0.307794    0.18959   0.127864 -0.0916468    -0.2576  -0.508063          0          0          0
 -0.117021 -0.0647404 -0.0580403 -0.0642753   0.242527   0.148839  -0.341083          0          0
 0.0710896  0.0578594  0.0568059  -0.121887  0.0470386 -0.0689573 -0.0160308   -0.19126          0
         0          0          0          0          0          0          0          0          0
C_eig norm: 1.27268
C_lapack norm: 1.27268
C_eig(8,8) norm: 1.27268
C_lapack(8,8) norm: 1.27268

Finally, if you turn off vectorization for Eigen then you will get the correct result on Arm.