Compare revisions

Sebastiaan van der Tol · André Offringa · Marcel Loose · Maik Nijhuis · Tammo Jan Dijkema · André Offringa
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -13,7 +13,7 @@ before_script:
    pkg-config
    python3-pip
    clang-format-14
- - pip3 install gcovr black cmake-format
+ - pip3 install gcovr black cmake-format isort

 format:
  script:
@@ -23,8 +23,8 @@ aocommon:
  script:
    - mkdir build
    - cd build
-    - cmake -DCMAKE_CXX_FLAGS="-coverage" -DCMAKE_EXE_LINKER_FLAGS="-coverage" ../
-    - make
+    - cmake -DCMAKE_CXX_FLAGS="-coverage -fprofile-update=atomic" -DCMAKE_EXE_LINKER_FLAGS="-coverage" ../
+    - make -j`nproc` runtests
    # gcovr to create badge
    # This also writes the xml file to enable line coverage highlighting in MRs
    - make coverage

--- a/CMake/FetchXTensor.cmake
+++ b/CMake/FetchXTensor.cmake
@@ -9,7 +9,7 @@ if (NOT xsimd_GIT_TAG)
  set(xsimd_GIT_TAG 2f5eddf8912c7e2527f0c50895c7560b964d29af)
 endif()
 if (NOT xtensor_GIT_TAG)
-  set(xtensor_GIT_TAG 0.24.2)
+  set(xtensor_GIT_TAG ae52796961d03e7a3d754d72713be5098ce467b9)
 endif()
 if (NOT xtensor-blas_GIT_TAG)
  set(xtensor-blas_GIT_TAG 0.20.0)

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,6 +7,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED YES)
 set(CMAKE_CXX_EXTENSIONS OFF)

 add_compile_options(
+  -O3
  -Wall
  -Wnon-virtual-dtor
  -Wzero-as-null-pointer-constant
@@ -110,10 +111,8 @@ add_custom_command(

 add_custom_target(
  coverage
-  COMMAND gcovr --gcov-ignore-parse-errors -r .. -e '.*/tests/.*' -e
-          '.*/CompilerIdCXX/.*' -e '_deps/.*'
-  COMMAND gcovr --gcov-ignore-parse-errors -r .. -e '.*/tests/.*' -e
-          '.*/CompilerIdCXX/.*' -e '_deps/.*' --xml > coverage.xml
+  COMMAND gcovr -r ${CMAKE_SOURCE_DIR} -e '.*/tests/.*' -e '.*/CompilerIdCXX/.*'
+          -e '_deps/.*' --txt --xml coverage.xml ${CMAKE_BINARY_DIR}
  DEPENDS execute_runtests)

 # CMAKE_CXX_FLAGS does not contain all flags.

--- a/include/aocommon/avx/DiagonalMatrixComplexDouble2x2.h
+++ b/include/aocommon/avx/DiagonalMatrixComplexDouble2x2.h
@@ -60,9 +60,14 @@ class DiagonalMatrixComplexDouble2x2 {

  AVX_TARGET VectorComplexDouble2 Data() const noexcept { return data_; }

-  AVX_TARGET std::complex<double> operator[](size_t index) const noexcept {
+  AVX_TARGET std::complex<double> Get(size_t index) const noexcept {
    assert(index < 2);
-    return data_[index];
+    return data_.Get(index);
+  }
+
+  AVX_TARGET void Set(size_t index, std::complex<double> value) noexcept {
+    assert(index < 2);
+    data_.Set(index, value);
  }

  AVX_TARGET DiagonalMatrixComplexDouble2x2 Conjugate() const noexcept {
@@ -86,7 +91,8 @@ class DiagonalMatrixComplexDouble2x2 {
  template <typename T>
  AVX_TARGET DiagonalMatrixComplexDouble2x2& operator*=(T value) noexcept {
    // TODO could use avx
-    *this = DiagonalMatrixComplexDouble2x2(data_[0] * value, data_[1] * value);
+    *this = DiagonalMatrixComplexDouble2x2(data_.Get(0) * value,
+                                           data_.Get(1) * value);
    return *this;
  }

@@ -99,7 +105,8 @@ class DiagonalMatrixComplexDouble2x2 {
  template <typename T>
  AVX_TARGET DiagonalMatrixComplexDouble2x2& operator/=(T value) noexcept {
    // TODO could use avx
-    *this = DiagonalMatrixComplexDouble2x2(data_[0] / value, data_[1] / value);
+    *this = DiagonalMatrixComplexDouble2x2(data_.Get(0) / value,
+                                           data_.Get(1) / value);
    return *this;
  }

@@ -111,8 +118,8 @@ class DiagonalMatrixComplexDouble2x2 {

  AVX_TARGET friend std::ostream& operator<<(
      std::ostream& output, DiagonalMatrixComplexDouble2x2 value) {
-    output << "[{" << value[0] << ", " << std::complex<double>{} << "}, {"
-           << std::complex<double>{} << ", " << value[1] << "}]";
+    output << "[{" << value.Get(0) << ", " << std::complex<double>{} << "}, {"
+           << std::complex<double>{} << ", " << value.Get(1) << "}]";
    return output;
  }

@@ -139,7 +146,7 @@ class DiagonalMatrixComplexDouble2x2 {
  template <typename T>
  AVX_TARGET friend DiagonalMatrixComplexDouble2x2 operator*(
      DiagonalMatrixComplexDouble2x2 lhs, T rhs) noexcept {
-    return VectorComplexDouble2(lhs.data_[0] * rhs, lhs.data_[1] * rhs);
+    return lhs.data_ * rhs;
  }

  template <typename T>
@@ -151,13 +158,36 @@ class DiagonalMatrixComplexDouble2x2 {
  template <typename T>
  AVX_TARGET friend DiagonalMatrixComplexDouble2x2 operator/(
      DiagonalMatrixComplexDouble2x2 lhs, T rhs) noexcept {
-    return VectorComplexDouble2(lhs.data_[0] / rhs, lhs.data_[1] / rhs);
+    return VectorComplexDouble2(lhs.data_.Get(0) / rhs, lhs.data_.Get(1) / rhs);
  }

 private:
  VectorComplexDouble2 data_;
 };

+AVX_TARGET inline DiagonalMatrixComplexDouble2x2 HermTranspose(
+    DiagonalMatrixComplexDouble2x2 matrix) noexcept {
+  return matrix.HermTranspose();
+}
+
+/// @returns the sum of the diagonal elements.
+AVX_TARGET inline std::complex<double> Trace(
+    DiagonalMatrixComplexDouble2x2 matrix) noexcept {
+  return matrix.Get(0) + matrix.Get(1);
+}
+
+/// @returns the Frobenius norm of the matrix.
+AVX_TARGET inline double Norm(DiagonalMatrixComplexDouble2x2 matrix) noexcept {
+  return std::norm(matrix.Get(0)) + std::norm(matrix.Get(1));
+}
+
+/// Returns the original diagonal matrix. Can be useful for templated code so
+/// that it can be used for both the diagonal and full matrix variants.
+AVX_TARGET inline DiagonalMatrixComplexDouble2x2 Diagonal(
+    DiagonalMatrixComplexDouble2x2 matrix) noexcept {
+  return matrix;
+}
+
 }  // namespace aocommon::avx

 #endif  // AOCOMMON_AVX256_DIAGONAL_MATRIX_COMPLEX_DOUBLE_2X2_H
--- a/include/aocommon/avx/DiagonalMatrixComplexFloat2x2.h
+++ b/include/aocommon/avx/DiagonalMatrixComplexFloat2x2.h
@@ -38,15 +38,32 @@ class DiagonalMatrixComplexFloat2x2 {
      const std::complex<float> matrix[2]) noexcept
      : data_{VectorComplexFloat2{std::addressof(matrix[0])}} {}

-  AVX_TARGET std::complex<float> operator[](size_t index) const noexcept {
+  AVX_TARGET const std::complex<float> Get(size_t index) const noexcept {
    assert(index < 2 && "Index out of bounds.");
-    return data_[index];
+    return data_.Get(index);
+  }
+
+  AVX_TARGET void Set(size_t index, std::complex<float> value) noexcept {
+    assert(index < 2 && "Index out of bounds.");
+    data_.Set(index, value);
  }

  AVX_TARGET explicit operator __m128() const noexcept {
    return static_cast<__m128>(data_);
  }

+  template <typename T>
+  AVX_TARGET friend DiagonalMatrixComplexFloat2x2 operator*(
+      DiagonalMatrixComplexFloat2x2 lhs, T rhs) noexcept {
+    return lhs.data_ * rhs;
+  }
+
+  AVX_TARGET friend DiagonalMatrixComplexFloat2x2 operator*(
+      DiagonalMatrixComplexFloat2x2 lhs,
+      DiagonalMatrixComplexFloat2x2 rhs) noexcept {
+    return lhs.data_ * rhs.data_;
+  }
+
  AVX_TARGET DiagonalMatrixComplexFloat2x2 Conjugate() const noexcept {
    return data_.Conjugate();
  }
@@ -65,6 +82,11 @@ class DiagonalMatrixComplexFloat2x2 {
    return data_ += value.data_;
  }

+  AVX_TARGET DiagonalMatrixComplexFloat2x2
+  operator-=(DiagonalMatrixComplexFloat2x2 value) noexcept {
+    return data_ -= value.data_;
+  }
+
  AVX_TARGET friend bool operator==(
      DiagonalMatrixComplexFloat2x2 lhs,
      DiagonalMatrixComplexFloat2x2 rhs) noexcept {
@@ -73,8 +95,8 @@ class DiagonalMatrixComplexFloat2x2 {

  AVX_TARGET friend std::ostream& operator<<(
      std::ostream& output, DiagonalMatrixComplexFloat2x2 value) {
-    output << "[{" << value[0] << ", " << std::complex<float>{} << "}, {"
-           << std::complex<float>{} << ", " << value[1] << "}]";
+    output << "[{" << value.Get(0) << ", " << std::complex<float>{} << "}, {"
+           << std::complex<float>{} << ", " << value.Get(1) << "}]";
    return output;
  }

@@ -82,6 +104,29 @@ class DiagonalMatrixComplexFloat2x2 {
  VectorComplexFloat2 data_;
 };

+AVX_TARGET inline DiagonalMatrixComplexFloat2x2 HermTranspose(
+    DiagonalMatrixComplexFloat2x2 matrix) noexcept {
+  return matrix.HermTranspose();
+}
+
+/// @returns the sum of the diagonal elements.
+AVX_TARGET inline std::complex<float> Trace(
+    DiagonalMatrixComplexFloat2x2 matrix) noexcept {
+  return matrix.Get(0) + matrix.Get(1);
+}
+
+/// @returns the Frobenius norm of the matrix.
+AVX_TARGET inline float Norm(DiagonalMatrixComplexFloat2x2 matrix) noexcept {
+  return std::norm(matrix.Get(0)) + std::norm(matrix.Get(1));
+}
+
+/// Returns the original diagonal matrix. Can be useful for templated code so
+/// that it can be used for both the diagonal and full matrix variants.
+AVX_TARGET inline DiagonalMatrixComplexFloat2x2 Diagonal(
+    DiagonalMatrixComplexFloat2x2 matrix) noexcept {
+  return matrix;
+}
+
 }  // namespace aocommon::avx

 #endif  // AOCOMMON_AVX256_DIAGONAL_MATRIX_COMPLEX_FLOAT_2X2_H
--- a/include/aocommon/avx/MatrixComplexDouble2x2.h
+++ b/include/aocommon/avx/MatrixComplexDouble2x2.h
@@ -44,7 +44,7 @@ class MatrixComplexDouble2x2 {

  AVX_TARGET explicit MatrixComplexDouble2x2(
      DiagonalMatrixComplexDouble2x2 m) noexcept
-      : MatrixComplexDouble2x2(m[0], 0.0, 0.0, m[1]) {}
+      : MatrixComplexDouble2x2(m.Get(0), 0.0, 0.0, m.Get(1)) {}

  AVX_TARGET explicit MatrixComplexDouble2x2(
      const std::complex<float> matrix[4]) noexcept
@@ -71,11 +71,18 @@ class MatrixComplexDouble2x2 {
      : data_{VectorComplexDouble2(matrix[0], matrix[1]),
              VectorComplexDouble2(matrix[2], matrix[3])} {}

-  AVX_TARGET std::complex<double> operator[](size_t index) const noexcept {
+  AVX_TARGET std::complex<double> Get(size_t index) const noexcept {
    assert(index < 4 && "Index out of bounds.");
    size_t array = index / 2;
    index %= 2;
-    return data_[array][index];
+    return data_[array].Get(index);
+  }
+
+  AVX_TARGET void Set(size_t index, std::complex<double> value) noexcept {
+    assert(index < 4 && "Index out of bounds.");
+    size_t array = index / 2;
+    index %= 2;
+    data_[array].Set(index, value);
  }

  AVX_TARGET MatrixComplexDouble2x2 Conjugate() const noexcept {
@@ -85,8 +92,7 @@ class MatrixComplexDouble2x2 {

  AVX_TARGET MatrixComplexDouble2x2 Transpose() const noexcept {
    // Note the compiler uses intrinsics without assistance.
-    return MatrixComplexDouble2x2{(*this)[0], (*this)[2], (*this)[1],
-                                  (*this)[3]};
+    return MatrixComplexDouble2x2{Get(0), Get(2), Get(1), Get(3)};
  }

  /** @returns A^H * A. */
@@ -97,10 +103,9 @@ class MatrixComplexDouble2x2 {
  }

  AVX_TARGET bool Invert() noexcept {
-    VectorComplexDouble2 v =
-        data_[0] * VectorComplexDouble2{(*this)[3], (*this)[2]};
+    VectorComplexDouble2 v = data_[0] * VectorComplexDouble2{Get(3), Get(2)};

-    std::complex<double> d = v[0] - v[1];
+    std::complex<double> d = v.Get(0) - v.Get(1);
    if (d == std::complex<double>{}) return false;

    double n = std::norm(d);
@@ -166,16 +171,15 @@ class MatrixComplexDouble2x2 {
  }

  AVX_TARGET MatrixComplexDouble2x2& operator+=(
-      MatrixComplexDouble2x2 value) noexcept {
-    data_[0] += value.data_[0];
-    data_[1] += value.data_[1];
+      MatrixComplexDouble2x2 rhs) noexcept {
+    data_[0] += rhs.data_[0];
+    data_[1] += rhs.data_[1];
    return *this;
  }

-  AVX_TARGET MatrixComplexDouble2x2& operator-=(
-      MatrixComplexDouble2x2 value) noexcept {
-    data_[0] -= value.data_[0];
-    data_[1] -= value.data_[1];
+  AVX_TARGET MatrixComplexDouble2x2& operator+=(
+      const DiagonalMatrixComplexDouble2x2& rhs) {
+    *this = *this + rhs;
    return *this;
  }

@@ -184,6 +188,44 @@ class MatrixComplexDouble2x2 {
    return lhs += rhs;
  }

+  AVX_TARGET MatrixComplexDouble2x2
+  operator+(const DiagonalMatrixComplexDouble2x2& rhs) const {
+    return MatrixComplexDouble2x2(Get(0) + rhs.Get(0), Get(1), Get(2),
+                                  Get(3) + rhs.Get(1));
+  }
+
+  AVX_TARGET friend MatrixComplexDouble2x2 operator+(
+      const DiagonalMatrixComplexDouble2x2& lhs,
+      const MatrixComplexDouble2x2& rhs) {
+    return rhs + lhs;
+  }
+
+  AVX_TARGET MatrixComplexDouble2x2& operator-=(
+      MatrixComplexDouble2x2 rhs) noexcept {
+    data_[0] -= rhs.data_[0];
+    data_[1] -= rhs.data_[1];
+    return *this;
+  }
+
+  AVX_TARGET MatrixComplexDouble2x2& operator-=(
+      const DiagonalMatrixComplexDouble2x2& rhs) {
+    *this = *this - rhs;
+    return *this;
+  }
+
+  AVX_TARGET MatrixComplexDouble2x2
+  operator-(const DiagonalMatrixComplexDouble2x2& rhs) const {
+    return MatrixComplexDouble2x2(Get(0) - rhs.Get(0), Get(1), Get(2),
+                                  Get(3) - rhs.Get(1));
+  }
+
+  AVX_TARGET friend MatrixComplexDouble2x2 operator-(
+      const DiagonalMatrixComplexDouble2x2& lhs,
+      const MatrixComplexDouble2x2& rhs) {
+    return MatrixComplexDouble2x2(lhs.Get(0) - rhs.Get(0), -rhs.Get(1),
+                                  -rhs.Get(2), lhs.Get(1) - rhs.Get(3));
+  }
+
  AVX_TARGET friend MatrixComplexDouble2x2 operator-(
      MatrixComplexDouble2x2 lhs, MatrixComplexDouble2x2 rhs) noexcept {
    return lhs -= rhs;
@@ -206,23 +248,23 @@ class MatrixComplexDouble2x2 {
    //       | ls1           | ls2           |

    // High:
-    VectorComplexDouble2 hc1{lhs[0], lhs[0]};
-    VectorComplexDouble2 hc2{rhs[0], rhs[1]};
+    VectorComplexDouble2 hc1{lhs.Get(0), lhs.Get(0)};
+    VectorComplexDouble2 hc2{rhs.Get(0), rhs.Get(1)};
    VectorComplexDouble2 hs1 = hc1 * hc2;

-    VectorComplexDouble2 hc3{lhs[1], lhs[1]};
-    VectorComplexDouble2 hc4{rhs[2], rhs[3]};
+    VectorComplexDouble2 hc3{lhs.Get(1), lhs.Get(1)};
+    VectorComplexDouble2 hc4{rhs.Get(2), rhs.Get(3)};
    VectorComplexDouble2 hs2 = hc3 * hc4;

    VectorComplexDouble2 hr = hs1 + hs2;

    // Low:
-    VectorComplexDouble2 lc1{lhs[2], lhs[2]};
-    VectorComplexDouble2 lc2{rhs[0], rhs[1]};
+    VectorComplexDouble2 lc1{lhs.Get(2), lhs.Get(2)};
+    VectorComplexDouble2 lc2{rhs.Get(0), rhs.Get(1)};
    VectorComplexDouble2 ls1 = lc1 * lc2;

-    VectorComplexDouble2 lc3{lhs[3], lhs[3]};
-    VectorComplexDouble2 lc4{rhs[2], rhs[3]};
+    VectorComplexDouble2 lc3{lhs.Get(3), lhs.Get(3)};
+    VectorComplexDouble2 lc4{rhs.Get(2), rhs.Get(3)};
    VectorComplexDouble2 ls2 = lc3 * lc4;

    VectorComplexDouble2 lr = ls1 + ls2;
@@ -273,7 +315,7 @@ class MatrixComplexDouble2x2 {
  }

  AVX_TARGET DiagonalMatrixComplexDouble2x2 Diagonal() const noexcept {
-    return DiagonalMatrixComplexDouble2x2(data_[0][0], data_[1][1]);
+    return DiagonalMatrixComplexDouble2x2(data_[0].Get(0), data_[1].Get(1));
  }

  AVX_TARGET friend MatrixComplexDouble2x2 operator*(
@@ -307,19 +349,19 @@ class MatrixComplexDouble2x2 {
    // r[0][1] = d[0][0] * m[0][1] -> lhs[0] * rhs[1]
    // r[1][0] = d[1][1] * m[1][0] -> lhs[1] * rhs[2]
    // r[1][1] = d[1][1] * m[1][1] -> lhs[1] * rhs[3]
-    return std::array<VectorComplexDouble2, 2>{lhs.Data()[0] * rhs.data_[0],
-                                               lhs.Data()[1] * rhs.data_[1]};
+    return std::array<VectorComplexDouble2, 2>{
+        lhs.Data().Get(0) * rhs.data_[0], lhs.Data().Get(1) * rhs.data_[1]};
  }

  AVX_TARGET MatrixComplexDouble2x2& operator*=(
-      MatrixComplexDouble2x2 value) noexcept {
-    *this = *this * value;
+      MatrixComplexDouble2x2 rhs) noexcept {
+    *this = *this * rhs;
    return *this;
  }

  template <typename T>
-  AVX_TARGET MatrixComplexDouble2x2& operator*=(const T* value) noexcept {
-    *this = *this * MatrixComplexDouble2x2(value);
+  AVX_TARGET MatrixComplexDouble2x2& operator*=(const T* rhs) noexcept {
+    *this = *this * MatrixComplexDouble2x2(rhs);
    return *this;
  }

@@ -336,8 +378,8 @@ class MatrixComplexDouble2x2 {

  AVX_TARGET friend std::ostream& operator<<(std::ostream& output,
                                             MatrixComplexDouble2x2 value) {
-    output << "[{" << value[0] << ", " << value[1] << "}, {" << value[2] << ", "
-           << value[3] << "}]";
+    output << "[{" << value.Get(0) << ", " << value.Get(1) << "}, {"
+           << value.Get(2) << ", " << value.Get(3) << "}]";
    return output;
  }

@@ -349,7 +391,8 @@ class MatrixComplexDouble2x2 {
   * Flatten 2x2 matrix to length 4 vector
   */
  AVX_TARGET Vector4 Vec() const {
-    return Vector4(data_[0][0], data_[1][0], data_[0][1], data_[1][1]);
+    return Vector4(data_[0].Get(0), data_[1].Get(0), data_[0].Get(1),
+                   data_[1].Get(1));
  }

  /**
@@ -377,8 +420,8 @@ class MatrixComplexDouble2x2 {
  }

  AVX_TARGET std::complex<double> DoubleDot(MatrixComplexDouble2x2 rhs) const {
-    return (*this)[0] * rhs[0] + (*this)[1] * rhs[1] + (*this)[2] * rhs[2] +
-           (*this)[3] * rhs[3];
+    return Get(0) * rhs.Get(0) + Get(1) * rhs.Get(1) + Get(2) * rhs.Get(2) +
+           Get(3) * rhs.Get(3);
  }

  /// Element-wise product
@@ -483,6 +526,11 @@ AVX_TARGET inline double Norm(MatrixComplexDouble2x2 matrix) noexcept {
  return ret[0];
 }

+AVX_TARGET inline double SumOfAbsolute(MatrixComplexDouble2x2 matrix) {
+  return std::abs(matrix.Get(0)) + std::abs(matrix.Get(1)) +
+         std::abs(matrix.Get(2)) + std::abs(matrix.Get(3));
+}
+
 }  // namespace aocommon::avx

 #endif  // AOCOMMON_AVX256_MATRIX_COMPLEX_DOUBLE_2X2_H
--- a/include/aocommon/avx/MatrixComplexFloat2x2.h
+++ b/include/aocommon/avx/MatrixComplexFloat2x2.h
@@ -53,9 +53,14 @@ class MatrixComplexFloat2x2 {
  AVX_TARGET explicit MatrixComplexFloat2x2(
      const MatrixComplexDouble2x2& matrix) noexcept;

-  AVX_TARGET std::complex<float> operator[](size_t index) const noexcept {
+  AVX_TARGET const std::complex<float> Get(size_t index) const noexcept {
    assert(index < 4 && "Index out of bounds.");
-    return data_[index];
+    return data_.Get(index);
+  }
+
+  AVX_TARGET void Set(size_t index, std::complex<float> value) noexcept {
+    assert(index < 4 && "Index out of bounds.");
+    data_.Set(index, value);
  }

  AVX_TARGET MatrixComplexFloat2x2 Conjugate() const noexcept {
@@ -64,7 +69,8 @@ class MatrixComplexFloat2x2 {

  AVX_TARGET MatrixComplexFloat2x2 Transpose() const noexcept {
    // Note the compiler uses intrinsics without assistance.
-    return MatrixComplexFloat2x2{data_[0], data_[2], data_[1], data_[3]};
+    return MatrixComplexFloat2x2{data_.Get(0), data_.Get(2), data_.Get(1),
+                                 data_.Get(3)};
  }

  AVX_TARGET explicit operator __m256() const noexcept {
@@ -113,31 +119,69 @@ class MatrixComplexFloat2x2 {
                            std::numeric_limits<float>::quiet_NaN()}};
  }

+  AVX_TARGET friend MatrixComplexFloat2x2 operator+(
+      MatrixComplexFloat2x2 lhs, MatrixComplexFloat2x2 rhs) noexcept {
+    return lhs += rhs;
+  }
+
+  AVX_TARGET MatrixComplexFloat2x2
+  operator+(const DiagonalMatrixComplexFloat2x2& rhs) const {
+    return MatrixComplexFloat2x2(Get(0) + rhs.Get(0), Get(1), Get(2),
+                                 Get(3) + rhs.Get(1));
+  }
+
+  AVX_TARGET friend MatrixComplexFloat2x2 operator+(
+      const DiagonalMatrixComplexFloat2x2& lhs,
+      const MatrixComplexFloat2x2& rhs) {
+    return MatrixComplexFloat2x2(lhs.Get(0) + rhs.Get(0), rhs.Get(1),
+                                 rhs.Get(2), lhs.Get(1) + rhs.Get(3));
+  }
+
  AVX_TARGET MatrixComplexFloat2x2& operator+=(
-      MatrixComplexFloat2x2 value) noexcept {
-    data_ += value.data_;
+      MatrixComplexFloat2x2 rhs) noexcept {
+    data_ += rhs.data_;
    return *this;
  }

-  AVX_TARGET MatrixComplexFloat2x2& operator-=(
-      MatrixComplexFloat2x2 value) noexcept {
-    data_ -= value.data_;
+  AVX_TARGET MatrixComplexFloat2x2& operator+=(
+      const DiagonalMatrixComplexFloat2x2& rhs) {
+    *this = *this + rhs;
    return *this;
  }

-  AVX_TARGET friend MatrixComplexFloat2x2 operator+(
+  AVX_TARGET friend MatrixComplexFloat2x2 operator-(
      MatrixComplexFloat2x2 lhs, MatrixComplexFloat2x2 rhs) noexcept {
-    return lhs += rhs;
+    return lhs -= rhs;
+  }
+
+  AVX_TARGET MatrixComplexFloat2x2
+  operator-(const DiagonalMatrixComplexFloat2x2& rhs) const {
+    return MatrixComplexFloat2x2(Get(0) - rhs.Get(0), Get(1), Get(2),
+                                 Get(3) - rhs.Get(1));
  }

  AVX_TARGET friend MatrixComplexFloat2x2 operator-(
-      MatrixComplexFloat2x2 lhs, MatrixComplexFloat2x2 rhs) noexcept {
-    return lhs -= rhs;
+      const DiagonalMatrixComplexFloat2x2& lhs,
+      const MatrixComplexFloat2x2& rhs) {
+    return MatrixComplexFloat2x2(lhs.Get(0) - rhs.Get(0), -rhs.Get(1),
+                                 -rhs.Get(2), lhs.Get(1) - rhs.Get(3));
+  }
+
+  AVX_TARGET MatrixComplexFloat2x2& operator-=(
+      MatrixComplexFloat2x2 rhs) noexcept {
+    data_ -= rhs.data_;
+    return *this;
+  }
+
+  AVX_TARGET MatrixComplexFloat2x2& operator-=(
+      const DiagonalMatrixComplexFloat2x2& rhs) {
+    *this = *this - rhs;
+    return *this;
  }

  AVX_TARGET MatrixComplexFloat2x2& operator*=(
-      MatrixComplexFloat2x2 value) noexcept {
-    data_ = data_ * value.data_;
+      MatrixComplexFloat2x2 rhs) noexcept {
+    data_ = data_ * rhs.data_;
    return *this;
  }

@@ -152,12 +196,12 @@ class MatrixComplexFloat2x2 {
    //       | s1            | s2            |
    //

-    VectorComplexFloat4 c1{lhs[0], lhs[0], lhs[2], lhs[2]};
-    VectorComplexFloat4 c2{rhs[0], rhs[1], rhs[0], rhs[1]};
+    VectorComplexFloat4 c1{lhs.Get(0), lhs.Get(0), lhs.Get(2), lhs.Get(2)};
+    VectorComplexFloat4 c2{rhs.Get(0), rhs.Get(1), rhs.Get(0), rhs.Get(1)};
    VectorComplexFloat4 s1 = c1 * c2;

-    VectorComplexFloat4 c3{lhs[1], lhs[1], lhs[3], lhs[3]};
-    VectorComplexFloat4 c4{rhs[2], rhs[3], rhs[2], rhs[3]};
+    VectorComplexFloat4 c3{lhs.Get(1), lhs.Get(1), lhs.Get(3), lhs.Get(3)};
+    VectorComplexFloat4 c4{rhs.Get(2), rhs.Get(3), rhs.Get(2), rhs.Get(3)};
    VectorComplexFloat4 s2 = c3 * c4;

    return s1 + s2;
@@ -174,19 +218,19 @@ class MatrixComplexFloat2x2 {
  }

  AVX_TARGET DiagonalMatrixComplexFloat2x2 Diagonal() const noexcept {
-    return VectorComplexFloat2{data_[0], data_[3]};
+    return VectorComplexFloat2{data_.Get(0), data_.Get(3)};
  }

  AVX_TARGET bool Invert() noexcept {
    // Note it would be possible to optimize further.
-    VectorComplexFloat2 a{data_[0], data_[1]};
-    VectorComplexFloat2 b{data_[3], data_[2]};
+    VectorComplexFloat2 a{data_.Get(0), data_.Get(1)};
+    VectorComplexFloat2 b{data_.Get(3), data_.Get(2)};
    VectorComplexFloat2 c = a * b;

-    std::complex<float> d = c[0] - c[1];
+    std::complex<float> d = c.Get(0) - c.Get(1);
    if (d == std::complex<float>{}) return false;

-    float n = std::norm(d);
+    const float n = std::norm(d);
    d.imag(-d.imag());
    __m256 reciprocal = _mm256_setr_ps(d.real(), d.imag(), d.real(), d.imag(),
                                       d.real(), d.imag(), d.real(), d.imag());
@@ -254,8 +298,8 @@ class MatrixComplexFloat2x2 {

  AVX_TARGET friend std::ostream& operator<<(std::ostream& output,
                                             MatrixComplexFloat2x2 value) {
-    output << "[{" << value[0] << ", " << value[1] << "}, {" << value[2] << ", "
-           << value[3] << "}]";
+    output << "[{" << value.Get(0) << ", " << value.Get(1) << "}, {"
+           << value.Get(2) << ", " << value.Get(3) << "}]";
    return output;
  }

@@ -266,8 +310,8 @@ class MatrixComplexFloat2x2 {
  template <typename Scalar>
  AVX_TARGET MatrixComplexFloat2x2& operator/=(Scalar scalar) {
    // TODO could be explicitly vectorized
-    data_ = VectorComplexFloat4(data_[0] / scalar, data_[1] / scalar,
-                                data_[2] / scalar, data_[3] / scalar);
+    data_ = VectorComplexFloat4(data_.Get(0) / scalar, data_.Get(1) / scalar,
+                                data_.Get(2) / scalar, data_.Get(3) / scalar);
    return *this;
  }

@@ -279,8 +323,8 @@ class MatrixComplexFloat2x2 {
  }

  AVX_TARGET std::complex<float> DoubleDot(MatrixComplexFloat2x2 rhs) const {
-    return (*this)[0] * rhs[0] + (*this)[1] * rhs[1] + (*this)[2] * rhs[2] +
-           (*this)[3] * rhs[3];
+    return (*this).Get(0) * rhs.Get(0) + (*this).Get(1) * rhs.Get(1) +
+           (*this).Get(2) * rhs.Get(2) + (*this).Get(3) * rhs.Get(3);
  }

  /**
@@ -393,6 +437,11 @@ AVX_TARGET inline float Norm(MatrixComplexFloat2x2 matrix) noexcept {
  return ret[0] + ret[1];
 }

+AVX_TARGET inline float SumOfAbsolute(MatrixComplexFloat2x2 matrix) {
+  return std::abs(matrix.Get(0)) + std::abs(matrix.Get(1)) +
+         std::abs(matrix.Get(2)) + std::abs(matrix.Get(3));
+}
+
 }  // namespace aocommon::avx

 #endif  // AOCOMMON_AVX256_MATRIX_COMPLEX_FLOAT_2X2_H
--- a/include/aocommon/avx/VectorComplexDouble2.h
+++ b/include/aocommon/avx/VectorComplexDouble2.h
@@ -52,11 +52,21 @@ class VectorComplexDouble2 {
      : data_{VectorDouble4{
            reinterpret_cast<const double*>(std::addressof(vector[0]))}} {}

-  AVX_TARGET std::complex<double> operator[](size_t index) const noexcept {
+  AVX_TARGET std::complex<double> Get(size_t index) const noexcept {
    assert(index < 2 && "Index out of bounds.");
    return {data_[2 * index], data_[2 * index + 1]};
  }

+  AVX_TARGET void Set(size_t index, std::complex<double> value) noexcept {
+    assert(index < 2 && "Index out of bounds.");
+    if (index == 0)
+      data_ = VectorDouble4(value.real(), value.imag(), Get(1).real(),
+                            Get(1).imag());
+    else
+      data_ = VectorDouble4(Get(0).real(), Get(0).imag(), value.real(),
+                            value.imag());
+  }
+
  AVX_TARGET explicit operator __m256d() const noexcept {
    return data_.Value();
  }
@@ -74,9 +84,9 @@ class VectorComplexDouble2 {
  }

  AVX_TARGET void AssignTo(std::complex<float>* destination) const noexcept {
-    std::complex<double> values[4];
+    std::complex<double> values[2];
    data_.AssignTo(reinterpret_cast<double*>(values));
-    std::copy_n(values, 4, destination);
+    std::copy_n(values, 2, destination);
  }

  AVX_TARGET static VectorComplexDouble2 Zero() noexcept {
@@ -84,14 +94,14 @@ class VectorComplexDouble2 {
  }

  AVX_TARGET VectorComplexDouble2& operator+=(
-      VectorComplexDouble2 value) noexcept {
-    data_ += value.data_;
+      VectorComplexDouble2 rhs) noexcept {
+    data_ += rhs.data_;
    return *this;
  }

  AVX_TARGET VectorComplexDouble2& operator-=(
-      VectorComplexDouble2 value) noexcept {
-    data_ -= value.data_;
+      VectorComplexDouble2 rhs) noexcept {
+    data_ -= rhs.data_;
    return *this;
  }

@@ -172,7 +182,7 @@ class VectorComplexDouble2 {

  AVX_TARGET friend std::ostream& operator<<(std::ostream& output,
                                             VectorComplexDouble2 value) {
-    output << '[' << value[0] << ", " << value[1] << ']';
+    output << '[' << value.Get(0) << ", " << value.Get(1) << ']';
    return output;
  }


--- a/include/aocommon/avx/VectorComplexFloat2.h
+++ b/include/aocommon/avx/VectorComplexFloat2.h
@@ -39,11 +39,21 @@ class VectorComplexFloat2 {
      : data_{VectorFloat4{
            reinterpret_cast<const float*>(std::addressof(vector[0]))}} {}

-  AVX_TARGET std::complex<float> operator[](size_t index) const noexcept {
+  AVX_TARGET std::complex<float> Get(size_t index) const noexcept {
    assert(index < 2 && "Index out of bounds.");
    return {data_[2 * index], data_[2 * index + 1]};
  }

+  AVX_TARGET void Set(size_t index, std::complex<float> value) noexcept {
+    assert(index < 2 && "Index out of bounds.");
+    if (index == 0)
+      data_ = VectorFloat4(value.real(), value.imag(), Get(1).real(),
+                           Get(1).imag());
+    else
+      data_ = VectorFloat4(Get(0).real(), Get(0).imag(), value.real(),
+                           value.imag());
+  }
+
  AVX_TARGET explicit operator __m128() const noexcept { return data_.Value(); }

  AVX_TARGET VectorComplexFloat2 Conjugate() const noexcept {
@@ -152,7 +162,7 @@ class VectorComplexFloat2 {

  AVX_TARGET friend std::ostream& operator<<(std::ostream& output,
                                             VectorComplexFloat2 value) {
-    output << '[' << value[0] << ", " << value[1] << ']';
+    output << '[' << value.Get(0) << ", " << value.Get(1) << ']';
    return output;
  }


--- a/include/aocommon/avx/VectorComplexFloat4.h
+++ b/include/aocommon/avx/VectorComplexFloat4.h
@@ -50,11 +50,29 @@ class VectorComplexFloat4 {
      : data_{VectorFloat8{
            reinterpret_cast<const double*>(std::addressof(vector[0]))}} {}

-  AVX_TARGET std::complex<float> operator[](size_t index) const noexcept {
+  AVX_TARGET std::complex<float> Get(size_t index) const noexcept {
    assert(index < 4 && "Index out of bounds.");
    return {data_[2 * index], data_[2 * index + 1]};
  }

+  AVX_TARGET void Set(size_t index, std::complex<float> value) noexcept {
+    assert(index < 4 && "Index out of bounds.");
+    switch (index) {
+      case 0:
+        *this = VectorComplexFloat4(value, Get(1), Get(2), Get(3));
+        break;
+      case 1:
+        *this = VectorComplexFloat4(Get(0), value, Get(2), Get(3));
+        break;
+      case 2:
+        *this = VectorComplexFloat4(Get(0), Get(1), value, Get(3));
+        break;
+      case 3:
+        *this = VectorComplexFloat4(Get(0), Get(1), Get(2), value);
+        break;
+    }
+  }
+
  AVX_TARGET explicit operator __m256() const noexcept { return data_.Value(); }

  AVX_TARGET VectorComplexFloat4 Conjugate() const noexcept {
@@ -169,8 +187,8 @@ class VectorComplexFloat4 {

  AVX_TARGET friend std::ostream& operator<<(std::ostream& output,
                                             VectorComplexFloat4 value) {
-    output << '[' << value[0] << ", " << value[1] << ", " << value[2] << ", "
-           << value[3] << ']';
+    output << '[' << value.Get(0) << ", " << value.Get(1) << ", "
+           << value.Get(2) << ", " << value.Get(3) << ']';
    return output;
  }


--- a/include/aocommon/fits/fitsreader.h
+++ b/include/aocommon/fits/fitsreader.h
@@ -101,11 +101,11 @@ class FitsReader : public FitsBase {
    for (int i = 0; i != naxis; ++i) firstPixel[i] = 1;
    if (naxis > 2) firstPixel[2] = index + 1;

-    if (sizeof(NumType) == 8)
+    if constexpr (sizeof(NumType) == 8)
      fits_read_pix(_fitsPtr, TDOUBLE, &firstPixel[0],
                    _meta.imgWidth * _meta.imgHeight, nullptr, image, nullptr,
                    &status);
-    else if (sizeof(NumType) == 4)
+    else if constexpr (sizeof(NumType) == 4)
      fits_read_pix(_fitsPtr, TFLOAT, &firstPixel[0],
                    _meta.imgWidth * _meta.imgHeight, nullptr, image, nullptr,
                    &status);

--- a/include/aocommon/hmatrix4x4.h
+++ b/include/aocommon/hmatrix4x4.h
@@ -245,20 +245,20 @@ class HMatrix4x4 {
    HMatrix4x4 result;

    // top left submatrix
-    result._data[0] = (hma[0] * hmb[0]).real();
-    result.SetComplex(1, hma[0] * hmb[2]);
-    result._data[3] = (hma[0] * hmb[3]).real();
+    result._data[0] = (hma.Get(0) * hmb.Get(0)).real();
+    result.SetComplex(1, hma.Get(0) * hmb.Get(2));
+    result._data[3] = (hma.Get(0) * hmb.Get(3)).real();

    // bottom left submatrix
-    result.SetComplex(4, hma[2] * hmb[0]);
-    result.SetComplex(6, hma[2] * hmb[1]);
-    result.SetComplex(9, hma[2] * hmb[2]);
-    result.SetComplex(11, hma[2] * hmb[3]);
+    result.SetComplex(4, hma.Get(2) * hmb.Get(0));
+    result.SetComplex(6, hma.Get(2) * hmb.Get(1));
+    result.SetComplex(9, hma.Get(2) * hmb.Get(2));
+    result.SetComplex(11, hma.Get(2) * hmb.Get(3));

    // bottom right submatrix
-    result._data[8] = (hma[3] * hmb[0]).real();
-    result.SetComplex(13, hma[3] * hmb[2]);
-    result._data[15] = (hma[3] * hmb[3]).real();
+    result._data[8] = (hma.Get(3) * hmb.Get(0)).real();
+    result.SetComplex(13, hma.Get(3) * hmb.Get(2));
+    result._data[15] = (hma.Get(3) * hmb.Get(3)).real();

    return result;
  }

--- a/include/aocommon/image.h
+++ b/include/aocommon/image.h
@@ -42,11 +42,14 @@ inline NumT MedianWithCopyImplementation(const NumT* data, size_t size,
  }
 }

+/**
+ * Returns the median and the median-absolute-deviation (MAD).
+ */
 template <typename NumT>
-inline NumT MadImplementation(const NumT* data, size_t size) {
+inline std::pair<NumT, NumT> MadImplementation(const NumT* data, size_t size) {
  aocommon::UVector<NumT> copy;
-  NumT median = MedianWithCopyImplementation(data, size, copy);
-  if (copy.empty()) return 0.0;
+  const NumT median = MedianWithCopyImplementation(data, size, copy);
+  if (copy.empty()) return {0.0, 0.0};

  // Replace all values by the difference from the mean
  typename aocommon::UVector<NumT>::iterator mid =
@@ -59,13 +62,13 @@ inline NumT MadImplementation(const NumT* data, size_t size) {
    *i = *i - median;

  std::nth_element(copy.begin(), mid, copy.end());
-  median = *mid;
+  NumT mad = *mid;
  bool even = (copy.size() % 2) == 0;
  if (even) {
    std::nth_element(mid, mid + 1, copy.end());
-    median = (median + *(mid + 1)) * 0.5;
+    mad = (mad + *(mid + 1)) * 0.5;
  }
-  return median;
+  return {median, mad};
 }
 }  // namespace detail

@@ -414,7 +417,7 @@ class ImageBase {
      std::fill_n(ptr, start_x, 0.0);
      std::copy_n(&input[(y - start_y) * in_width], in_width,
                  &output[y * out_width + start_x]);
-      std::fill_n(ptr + end_x, out_width, 0.0);
+      std::fill_n(ptr + end_x, out_width - end_x, 0.0);
    }
    for (size_t y = end_y; y != out_height; ++y) {
      value_type* ptr = &output[y * out_width];
@@ -540,11 +543,34 @@ class ImageBase {
  value_type StdDevFromMAD() const {
    return StdDevFromMAD(data_, width_ * height_);
  }
+
  static value_type StdDevFromMAD(const value_type* data, size_t size) {
    // norminv(0.75) x MAD
    return value_type(1.48260221850560) * MAD(data, size);
  }

+  /**
+   * Calculates both median and median-absolute-deviation (MAD). Because
+   * calculating MAD requires calculating the median, this is more efficient
+   * than calling
+   * @ref Median() and @ref MAD() after each other.
+   * @returns {median, mad}
+   */
+  std::pair<value_type, value_type> MedianAndMAD() const;
+
+  /**
+   * Calculates both median and a standard deviation estimate from the
+   * median-absolute-deviation (MAD). Because calculating MAD requires
+   * calculating the median, this is more efficient than calling
+   * @ref Median() and @ref StdDevFromMAD() after each other.
+   * @returns {median, stddev_estimate}
+   */
+  std::pair<value_type, value_type> MedianAndStdDevFromMAD() const {
+    const std::pair<value_type, value_type> median_and_mad = MedianAndMAD();
+    return {median_and_mad.first,
+            value_type(1.48260221850560) * median_and_mad.second};
+  }
+
  value_type RMS() const { return RMS(data_, width_ * height_); }

  static value_type RMS(const value_type* data, size_t size) {
@@ -662,13 +688,13 @@ typename ImageBase<NumT>::value_type ImageBase<NumT>::Median(
 template <>
 inline typename ImageBase<float>::value_type ImageBase<float>::MAD(
    const value_type* data, size_t size) {
-  return detail::MadImplementation(data, size);
+  return detail::MadImplementation(data, size).second;
 }

 template <>
 inline typename ImageBase<double>::value_type ImageBase<double>::MAD(
    const value_type* data, size_t size) {
-  return detail::MadImplementation(data, size);
+  return detail::MadImplementation(data, size).second;
 }

 template <typename NumT>
@@ -677,5 +703,26 @@ typename ImageBase<NumT>::value_type ImageBase<NumT>::MAD(const value_type*,
  throw std::runtime_error("not implemented");
 }

+template <typename NumT>
+typename std::pair<typename ImageBase<NumT>::value_type,
+                   typename ImageBase<NumT>::value_type>
+ImageBase<NumT>::MedianAndMAD() const {
+  throw std::runtime_error("not implemented");
+}
+
+template <>
+inline typename std::pair<typename ImageBase<float>::value_type,
+                          typename ImageBase<float>::value_type>
+ImageBase<float>::MedianAndMAD() const {
+  return detail::MadImplementation(data_, width_ * height_);
+}
+
+template <>
+inline typename std::pair<typename ImageBase<double>::value_type,
+                          typename ImageBase<double>::value_type>
+ImageBase<double>::MedianAndMAD() const {
+  return detail::MadImplementation(data_, width_ * height_);
+}
+
 }  // namespace aocommon
 #endif
--- a/include/aocommon/matrix4x4.h
+++ b/include/aocommon/matrix4x4.h
@@ -217,10 +217,10 @@ class Matrix4x4 {
    Matrix4x4 result;
    const size_t posa[4] = {0, 2, 8, 10};
    for (size_t i = 0; i != 4; ++i) {
-      result[posa[i]] = veca[i] * vecb[0];
-      result[posa[i] + 1] = veca[i] * vecb[1];
-      result[posa[i] + 4] = veca[i] * vecb[2];
-      result[posa[i] + 5] = veca[i] * vecb[3];
+      result[posa[i]] = veca.Get(i) * vecb.Get(0);
+      result[posa[i] + 1] = veca.Get(i) * vecb.Get(1);
+      result[posa[i] + 4] = veca.Get(i) * vecb.Get(2);
+      result[posa[i] + 5] = veca.Get(i) * vecb.Get(3);
    }
    return result;
  }

--- a/include/aocommon/py/uniqueptr.h
+++ b/include/aocommon/py/uniqueptr.h
@@ -34,28 +34,28 @@ class PyUniquePointer {

  /// @return A reference to the wrapped object.
  /// @throw std::runtime_error If the PyUniquePointer holds a null pointer.
-  [[nodiscard]] T& operator*() {
+  T& operator*() {
    CheckPointer();
    return *pointer_;
  }

  /// @return A const reference to the wrapped object.
  /// @throw std::runtime_error If the PyUniquePointer holds a null pointer.
-  [[nodiscard]] const T& operator*() const {
+  const T& operator*() const {
    CheckPointer();
    return *pointer_;
  }

  /// @return A pointer to the wrapped object.
  /// @throw std::runtime_error If the PyUniquePointer holds a null pointer.
-  [[nodiscard]] T* operator->() {
+  T* operator->() {
    CheckPointer();
    return pointer_.get();
  }

  /// @return A const pointer to the wrapped object.
  /// @throw std::runtime_error If the PyUniquePointer holds a null pointer.
-  [[nodiscard]] const T* operator->() const {
+  const T* operator->() const {
    CheckPointer();
    return pointer_.get();
  }
@@ -84,4 +84,4 @@ class PyUniquePointer {

 }  // namespace aocommon::py

-#endif
\ No newline at end of file
+#endif
--- a/include/aocommon/scalar/matrix2x2.h
+++ b/include/aocommon/scalar/matrix2x2.h
--- a/include/aocommon/scalar/matrix2x2diag.h
+++ b/include/aocommon/scalar/matrix2x2diag.h
@@ -148,10 +148,16 @@ class MC2x2DiagBase {
    return *this;
  }

-  const std::complex<ValType>& operator[](size_t index) const {
-    return _values[index];
+  MC2x2DiagBase<ValType>& operator-=(const MC2x2DiagBase<ValType>& source) {
+    _values[0] -= source._values[0];
+    _values[1] -= source._values[1];
+    return *this;
+  }
+
+  std::complex<ValType> Get(size_t index) const { return _values[index]; }
+  void Set(size_t index, std::complex<ValType> value) {
+    _values[index] = value;
  }
-  std::complex<ValType>& operator[](size_t index) { return _values[index]; }

  /**
   * Return MC2x2Base matrix filled with zeros
@@ -186,7 +192,8 @@ class MC2x2DiagBase {
   * Matrix multiplication, alias for the overloaded * operator
   */
  MC2x2DiagBase<ValType> Multiply(const MC2x2DiagBase<ValType>& rhs) const {
-    return MC2x2DiagBase<ValType>(_values[0] * rhs[0], _values[1] * rhs[1]);
+    return MC2x2DiagBase<ValType>(_values[0] * rhs.Get(0),
+                                  _values[1] * rhs.Get(1));
  }

  /**
@@ -222,8 +229,9 @@ class MC2x2DiagBase {
 template <typename ValType>
 scalar::MC2x2Base<ValType> operator*(const MC2x2DiagBase<ValType>& lhs,
                                     const scalar::MC2x2Base<ValType>& rhs) {
-  return scalar::MC2x2Base<ValType>(lhs[0] * rhs[0], lhs[0] * rhs[1],
-                                    lhs[1] * rhs[2], lhs[1] * rhs[3]);
+  return scalar::MC2x2Base<ValType>(
+      lhs.Get(0) * rhs.Get(0), lhs.Get(0) * rhs.Get(1), lhs.Get(1) * rhs.Get(2),
+      lhs.Get(1) * rhs.Get(3));
 }

 /**
@@ -232,8 +240,9 @@ scalar::MC2x2Base<ValType> operator*(const MC2x2DiagBase<ValType>& lhs,
 template <typename ValType>
 scalar::MC2x2Base<ValType> operator*(const scalar::MC2x2Base<ValType>& lhs,
                                     const MC2x2DiagBase<ValType>& rhs) {
-  return scalar::MC2x2Base<ValType>(lhs[0] * rhs[0], lhs[1] * rhs[1],
-                                    lhs[2] * rhs[0], lhs[3] * rhs[1]);
+  return scalar::MC2x2Base<ValType>(
+      lhs.Get(0) * rhs.Get(0), lhs.Get(1) * rhs.Get(1), lhs.Get(2) * rhs.Get(0),
+      lhs.Get(3) * rhs.Get(1));
 }

 /**
@@ -242,8 +251,8 @@ scalar::MC2x2Base<ValType> operator*(const scalar::MC2x2Base<ValType>& lhs,
 template <typename ValType>
 std::ostream& operator<<(std::ostream& output,
                         const MC2x2DiagBase<ValType>& value) {
-  output << "[{" << value[0] << ", " << std::complex<float>{0.0} << "}, {"
-         << std::complex<float>{0.0} << ", " << value[1] << "}]";
+  output << "[{" << value.Get(0) << ", " << std::complex<float>{0.0} << "}, {"
+         << std::complex<float>{0.0} << ", " << value.Get(1) << "}]";
  return output;
 }

@@ -252,7 +261,32 @@ std::ostream& operator<<(std::ostream& output,
 */
 template <typename ValType>
 MC2x2DiagBase<ValType> Diagonal(const scalar::MC2x2Base<ValType>& matrix) {
-  return MC2x2DiagBase<ValType>(matrix[0], matrix[3]);
+  return MC2x2DiagBase<ValType>(matrix.Get(0), matrix.Get(3));
+}
+
+template <typename ValType>
+inline MC2x2DiagBase<ValType> HermTranspose(
+    MC2x2DiagBase<ValType> matrix) noexcept {
+  return matrix.HermTranspose();
+}
+
+/// @returns the sum of the diagonal elements.
+template <typename ValType>
+inline std::complex<float> Trace(MC2x2DiagBase<ValType> matrix) noexcept {
+  return matrix.Get(0) + matrix.Get(1);
+}
+
+/// @returns the Frobenius norm of the matrix.
+template <typename ValType>
+inline float Norm(MC2x2DiagBase<ValType> matrix) noexcept {
+  return std::norm(matrix.Get(0)) + std::norm(matrix.Get(1));
+}
+
+/// Returns the original diagonal matrix. Can be useful for templated code so
+/// that it can be used for both the diagonal and full matrix variants.
+template <typename ValType>
+inline MC2x2DiagBase<ValType> Diagonal(MC2x2DiagBase<ValType> matrix) noexcept {
+  return matrix;
 }

 }  // namespace aocommon::scalar

--- a/include/aocommon/staticfor.h
+++ b/include/aocommon/staticfor.h
@@ -90,10 +90,20 @@ class StaticFor {

  /**
   * Like @ref Run(), but with limited number of threads. This
-   * may be useful when each thread takes memory and the total
-   * memory needs to be constrained. In combination with
+   * may be useful when: i) each thread takes memory and the total
+   * memory needs to be constrained -- in combination with
   * a nested for (using the @ref RecursiveFor class), it may
-   * be possible to reduce the memory without affecting performance.
+   * be possible to reduce the memory without affecting performance;
+   * ii) if the run is part of a nested run, and synchronization
+   * of all threads causes too much overhead.
+   *
+   * If @p max_threads is set to 1, the provided @p function is
+   * directly called without any thread synchronization. This also
+   * implies that a nested run does not require a RecursiveFor to
+   * exist. This allows conditional nested parallelization without
+   * having to duplicate the function for the case that doing a
+   * nested run is not efficient (e.g. because the outer loop
+   * dimension is large enough to occupy all threads).
   */
  void ConstrainedRun(Iter start, Iter end, size_t max_threads,
                      std::function<void(Iter, Iter)> function) {
@@ -145,6 +155,7 @@ class StaticFor {
    }
  }

+  /// The number of chunks (parts) that the full range is divided into.
  size_t n_chunks_;
  Iter iter_start_;
  Iter iter_end_;
@@ -162,15 +173,15 @@ template <typename Iter>
 inline void StaticFor<Iter>::run(Iter start, Iter end, size_t max_chunks) {
  assert(start <= end);
  const size_t n_threads = ThreadPool::GetInstance().NThreads();
-  if (end == start + 1 || n_threads <= 1) {
+  n_chunks_ = std::min({n_threads, end - start, max_chunks});
+  if (n_chunks_ <= 1) {
    callFunction(start, end, 0);
  } else {
    iter_start_ = start;
    iter_end_ = end;
-    n_chunks_ = std::min({n_threads, end - start, max_chunks});
    if (RecursiveFor* recursive_for = RecursiveFor::GetInstance();
        recursive_for) {
-      recursive_for->Run(0, n_threads,
+      recursive_for->Run(0, n_chunks_,
                         [&](size_t thread_index) { loop(thread_index); });
    } else {
      ThreadPool::GetInstance().ExecuteInParallel(
@@ -179,6 +190,33 @@ inline void StaticFor<Iter>::run(Iter start, Iter end, size_t max_chunks) {
  }
 }

+template <typename Iter>
+inline void RunStaticFor(Iter start, Iter end,
+                         std::function<void(Iter, Iter)> function) {
+  StaticFor<Iter>().Run(start, end, std::move(function));
+}
+
+template <typename Iter>
+inline void RunStaticFor(Iter start, Iter end,
+                         std::function<void(Iter, Iter, size_t)> function) {
+  StaticFor<Iter>().Run(start, end, std::move(function));
+}
+
+template <typename Iter>
+inline void RunConstrainedStaticFor(Iter start, Iter end, size_t max_threads,
+                                    std::function<void(Iter, Iter)> function) {
+  StaticFor<Iter>().ConstrainedRun(start, end, max_threads,
+                                   std::move(function));
+}
+
+template <typename Iter>
+inline void RunConstrainedStaticFor(
+    Iter start, Iter end, size_t max_threads,
+    std::function<void(Iter, Iter, size_t)> function) {
+  StaticFor<Iter>().ConstrainedRun(start, end, max_threads,
+                                   std::move(function));
+}
+
 }  // namespace aocommon

 #endif
--- a/scripts/format.sh
+++ b/scripts/format.sh
@@ -78,6 +78,9 @@ PYTHON_FILES=$(git ls-files $PYTHON_FILES_EXCLUDEDIRS)
 # Use line length 79, which complies with PEP-8.
 BLACK="black -l 79"

+# Sort imports according to PEP-8
+ISORT="isort --profile black"
+
 if [[ "${CLANG_FORMAT_BINARY}" == "" ]] ; then
    CLANG_FORMAT_BINARY="clang-format"
 fi
@@ -87,7 +90,8 @@ if [ -n "$DRYRUN" ]; then
  if !(${CLANG_FORMAT_BINARY} -style=file --output-replacements-xml $CXX_FILES |
       grep -q "<replacement ") &&
       cmake-format --check $CMAKE_FILES &&
-       $BLACK --check $PYTHON_FILES; then
+       $BLACK --check $PYTHON_FILES &&
+       $ISORT --check $PYTHON_FILES; then
    # Print in bold-face green
    echo -e "\e[1m\e[32mGreat job, all files are properly formatted!\e[0m"
  else
@@ -100,6 +104,7 @@ else
  ${CLANG_FORMAT_BINARY} -i -style=file $CXX_FILES
  cmake-format -i $CMAKE_FILES
  $BLACK -q $PYTHON_FILES
+  $ISORT -q $PYTHON_FILES
  # Print in bold-face
  echo -e "\e[1mSuccessfully formatted all files.\e[0m"
 fi
--- a/tests/avx/tDiagonalMatrixComplexDouble2x2.cpp
+++ b/tests/avx/tDiagonalMatrixComplexDouble2x2.cpp
@@ -24,8 +24,8 @@ static_assert(std::is_nothrow_move_assignable_v<AvxDiagMC2x2>);

 BOOST_AUTO_TEST_CASE(construct_zero_initialized) {
  static_assert(std::is_nothrow_default_constructible_v<AvxDiagMC2x2>);
-  BOOST_TEST(AvxDiagMC2x2::Zero()[0] == (std::complex<double>{0.0, 0.0}));
-  BOOST_TEST(AvxDiagMC2x2::Zero()[1] == (std::complex<double>{0.0, 0.0}));
+  BOOST_TEST(AvxDiagMC2x2::Zero().Get(0) == (std::complex<double>{0.0, 0.0}));
+  BOOST_TEST(AvxDiagMC2x2::Zero().Get(1) == (std::complex<double>{0.0, 0.0}));
 }

 BOOST_AUTO_TEST_CASE(constructor_2_complex_double) {
@@ -36,8 +36,8 @@ BOOST_AUTO_TEST_CASE(constructor_2_complex_double) {
  const AvxDiagMC2x2 result{std::complex<double>{-1.0, 1.0},
                            std::complex<double>{3.75, -3.75}};

-  BOOST_TEST(result[0] == (std::complex<double>{-1.0, 1.0}));
-  BOOST_TEST(result[1] == (std::complex<double>{3.75, -3.75}));
+  BOOST_TEST(result.Get(0) == (std::complex<double>{-1.0, 1.0}));
+  BOOST_TEST(result.Get(1) == (std::complex<double>{3.75, -3.75}));
 }

 BOOST_AUTO_TEST_CASE(constructor_complex_double_pointer) {
@@ -47,8 +47,8 @@ BOOST_AUTO_TEST_CASE(constructor_complex_double_pointer) {
                                        std::complex<double>{3.75, -3.75}};
  const AvxDiagMC2x2 result(input);

-  BOOST_TEST((result[0] == std::complex<double>{-1.0, 1.0}));
-  BOOST_TEST((result[1] == std::complex<double>{3.75, -3.75}));
+  BOOST_TEST((result.Get(0) == std::complex<double>{-1.0, 1.0}));
+  BOOST_TEST((result.Get(1) == std::complex<double>{3.75, -3.75}));
 }

 BOOST_AUTO_TEST_CASE(data) {
@@ -58,8 +58,8 @@ BOOST_AUTO_TEST_CASE(data) {
                           std::complex<double>{3.75, -3.75}};
  const aocommon::avx::VectorComplexDouble2 result{input.Data()};

-  BOOST_TEST((result[0] == std::complex<double>{-1.0, 1.0}));
-  BOOST_TEST((result[1] == std::complex<double>{3.75, -3.75}));
+  BOOST_TEST((result.Get(0) == std::complex<double>{-1.0, 1.0}));
+  BOOST_TEST((result.Get(1) == std::complex<double>{3.75, -3.75}));
 }

 BOOST_AUTO_TEST_CASE(conjugate) {
@@ -128,8 +128,8 @@ BOOST_AUTO_TEST_CASE(operator_plus_equal) {
  r += value;
  static_assert(noexcept(r += value));

-  BOOST_TEST(r[0] == (std::complex<double>{5.0, 10.0}));
-  BOOST_TEST(r[1] == (std::complex<double>{50.0, 55.0}));
+  BOOST_TEST(r.Get(0) == (std::complex<double>{5.0, 10.0}));
+  BOOST_TEST(r.Get(1) == (std::complex<double>{50.0, 55.0}));
 }

 BOOST_AUTO_TEST_CASE(equal) {
No results found