Add half-`Packet` operations to `StridedLinearBufferCopy`.
Reference issue
None.
What does this implement/fix?
The operations in StridedLinearBufferCopy::Run operate either on groups of Packet, single Packets, or on Scalar.
In some cases, though, e.g. when using AVX, which provides Packet8f, the remaining data is too small to fill a single Packet8f, yet still large enough that handling it with Scalar operations is slow.
This MR checks whether the Packet implementation provides a half-Packet, and if so, provides the operations using half-Packets where appropriate.
Note that this code avoids checking packet_traits<Scalar>::HasHalfPacket, since that flag appears to be set only in Eigen/src/Core/arch/AVX512/PacketMathFP16.h, even though almost all Packet types provide a distinct packet_traits<Scalar>::half.
Additional information
Whales are mammals, and as such, they breastfeed their young.
Benchmarks
To evaluate the change, I created the following benchmark suite based on github.com/google/benchmark:
// Benchmarks StridedLinearBufferCopy::Run<Kind::Linear>: a copy from a
// stride-1 source into a stride-1 destination.
//
// Benchmark arguments:
//   state.range(0): number of whole packets' worth of coefficients to copy.
//   state.range(1): number of additional coefficients appended after them.
template <typename Scalar>
void BM_StridedLinearBufferCopyLinear(benchmark::State& state) {
  using Copier = Eigen::internal::StridedLinearBufferCopy<Scalar, int>;

  // Total coefficient count = whole packets plus the requested remainder.
  const int coeffs_per_packet = Eigen::internal::packet_traits<Scalar>::size;
  const int whole_packets = state.range(0);
  const int remainder = state.range(1);
  const int total = whole_packets * coeffs_per_packet + remainder;

  // Equally sized 1D buffers: random source, zeroed destination.
  Eigen::Tensor<Scalar, 1> src(total);
  Eigen::Tensor<Scalar, 1> dest(total);
  src.setRandom();
  dest.setZero();

  for (auto _ : state) {
    Copier::template Run<Copier::Kind::Linear>(
        {/*offset=*/0, /*stride=*/1, /*data=*/dest.data()},
        {/*offset=*/0, /*stride=*/1, /*data=*/src.data()},
        /*count=*/total);
    // Checks that the first coefficient was copied; note that this check
    // executes inside the timed loop.
    CHECK_EQ(dest(0), src(0));
  }
}
// Benchmarks StridedLinearBufferCopy::Run<Kind::Scatter>: a stride-1 source is
// written into the destination with a stride of 2.
//
// Benchmark arguments:
//   state.range(0): number of whole packets' worth of coefficients to copy.
//   state.range(1): number of additional coefficients appended after them.
template <typename Scalar>
void BM_StridedLinearBufferCopyScatter(benchmark::State& state) {
  using Copier = Eigen::internal::StridedLinearBufferCopy<Scalar, int>;

  // Total coefficient count = whole packets plus the requested remainder.
  const int coeffs_per_packet = Eigen::internal::packet_traits<Scalar>::size;
  const int whole_packets = state.range(0);
  const int remainder = state.range(1);
  const int total = whole_packets * coeffs_per_packet + remainder;

  // The destination is twice as long as the source because it is written
  // with stride 2.
  Eigen::Tensor<Scalar, 1> src(total);
  Eigen::Tensor<Scalar, 1> dest(2 * total);
  src.setRandom();
  dest.setZero();

  for (auto _ : state) {
    Copier::template Run<Copier::Kind::Scatter>(
        {/*offset=*/0, /*stride=*/2, /*data=*/dest.data()},
        {/*offset=*/0, /*stride=*/1, /*data=*/src.data()},
        /*count=*/total);
    // Checks that the first coefficient was copied; note that this check
    // executes inside the timed loop.
    CHECK_EQ(dest(0), src(0));
  }
}
// Benchmarks StridedLinearBufferCopy::Run<Kind::FillLinear>: a single source
// value (source stride 0) is written to a stride-1 destination.
//
// Benchmark arguments:
//   state.range(0): number of whole packets' worth of coefficients to fill.
//   state.range(1): number of additional coefficients appended after them.
template <typename Scalar>
void BM_StridedLinearBufferCopyFillLinear(benchmark::State& state) {
  using Copier = Eigen::internal::StridedLinearBufferCopy<Scalar, int>;

  // Total coefficient count = whole packets plus the requested remainder.
  const int coeffs_per_packet = Eigen::internal::packet_traits<Scalar>::size;
  const int whole_packets = state.range(0);
  const int remainder = state.range(1);
  const int total = whole_packets * coeffs_per_packet + remainder;

  // The source holds just the one (random) value that is replicated.
  Eigen::Tensor<Scalar, 1> src(1);
  Eigen::Tensor<Scalar, 1> dest(total);
  src.setRandom();
  dest.setZero();

  for (auto _ : state) {
    Copier::template Run<Copier::Kind::FillLinear>(
        {/*offset=*/0, /*stride=*/1, /*data=*/dest.data()},
        {/*offset=*/0, /*stride=*/0, /*data=*/src.data()},
        /*count=*/total);
    // Checks that the first coefficient was filled; note that this check
    // executes inside the timed loop.
    CHECK_EQ(dest(0), src(0));
  }
}
// Benchmarks StridedLinearBufferCopy::Run<Kind::FillScatter>: a single source
// value (source stride 0) is written into the destination with a stride of 2.
//
// Benchmark arguments:
//   state.range(0): number of whole packets' worth of coefficients to fill.
//   state.range(1): number of additional coefficients appended after them.
template <typename Scalar>
void BM_StridedLinearBufferCopyFillScatter(benchmark::State& state) {
  using Copier = Eigen::internal::StridedLinearBufferCopy<Scalar, int>;

  // Total coefficient count = whole packets plus the requested remainder.
  const int coeffs_per_packet = Eigen::internal::packet_traits<Scalar>::size;
  const int whole_packets = state.range(0);
  const int remainder = state.range(1);
  const int total = whole_packets * coeffs_per_packet + remainder;

  // One-value source; the destination is twice `total` long because it is
  // written with stride 2.
  Eigen::Tensor<Scalar, 1> src(1);
  Eigen::Tensor<Scalar, 1> dest(2 * total);
  src.setRandom();
  dest.setZero();

  for (auto _ : state) {
    Copier::template Run<Copier::Kind::FillScatter>(
        {/*offset=*/0, /*stride=*/2, /*data=*/dest.data()},
        {/*offset=*/0, /*stride=*/0, /*data=*/src.data()},
        /*count=*/total);
    // Checks that the first coefficient was filled; note that this check
    // executes inside the timed loop.
    CHECK_EQ(dest(0), src(0));
  }
}
// Benchmarks StridedLinearBufferCopy::Run<Kind::Gather>: the source is read
// with a stride of 2 and written into a stride-1 destination.
//
// Benchmark arguments:
//   state.range(0): number of whole packets' worth of coefficients to copy.
//   state.range(1): number of additional coefficients appended after them.
template <typename Scalar>
void BM_StridedLinearBufferCopyGather(benchmark::State& state) {
  using Copier = Eigen::internal::StridedLinearBufferCopy<Scalar, int>;

  // Total coefficient count = whole packets plus the requested remainder.
  const int coeffs_per_packet = Eigen::internal::packet_traits<Scalar>::size;
  const int whole_packets = state.range(0);
  const int remainder = state.range(1);
  const int total = whole_packets * coeffs_per_packet + remainder;

  // The source is twice as long as the destination because it is read with
  // stride 2.
  Eigen::Tensor<Scalar, 1> src(2 * total);
  Eigen::Tensor<Scalar, 1> dest(total);
  src.setRandom();
  dest.setZero();

  for (auto _ : state) {
    Copier::template Run<Copier::Kind::Gather>(
        {/*offset=*/0, /*stride=*/1, /*data=*/dest.data()},
        {/*offset=*/0, /*stride=*/2, /*data=*/src.data()},
        /*count=*/total);
    // Checks that the first coefficient was copied; note that this check
    // executes inside the timed loop.
    CHECK_EQ(dest(0), src(0));
  }
}
// Registers `benchmark_function<float>` with the argument pairs {10, 0}
// through {10, 7}: the first argument stays at 10 while the second sweeps
// 0..7. NOTE(review): the 0..7 sweep presumably covers every possible
// remainder of an 8-lane float packet (e.g. AVX Packet8f) -- confirm.
// The expansion has no trailing semicolon; the caller supplies it.
#define CREATE_BENCHMARK_FLOAT(benchmark_function) \
  BENCHMARK(benchmark_function<float>)             \
      ->Args({10, 0})                              \
      ->Args({10, 1})                              \
      ->Args({10, 2})                              \
      ->Args({10, 3})                              \
      ->Args({10, 4})                              \
      ->Args({10, 5})                              \
      ->Args({10, 6})                              \
      ->Args({10, 7})
// Registers `benchmark_function<double>` with the argument pairs {10, 0}
// through {10, 3}: the first argument stays at 10 while the second sweeps
// 0..3. NOTE(review): the 0..3 sweep presumably covers every possible
// remainder of a 4-lane double packet (e.g. AVX Packet4d) -- confirm.
// The expansion has no trailing semicolon; the caller supplies it.
#define CREATE_BENCHMARK_DOUBLE(benchmark_function) \
  BENCHMARK(benchmark_function<double>)             \
      ->Args({10, 0})                               \
      ->Args({10, 1})                               \
      ->Args({10, 2})                               \
      ->Args({10, 3})
// Registers `benchmark_function` for both float and double by expanding
// CREATE_BENCHMARK_FLOAT followed by CREATE_BENCHMARK_DOUBLE. The expansion
// ends without a semicolon, so each use must be terminated with one.
#define CREATE_BENCHMARK(benchmark_function)  \
  CREATE_BENCHMARK_FLOAT(benchmark_function); \
  CREATE_BENCHMARK_DOUBLE(benchmark_function)
// Register the Linear, Scatter, FillLinear and FillScatter benchmarks; each
// CREATE_BENCHMARK use expands to a float and a double registration.
CREATE_BENCHMARK(BM_StridedLinearBufferCopyLinear);
CREATE_BENCHMARK(BM_StridedLinearBufferCopyScatter);
CREATE_BENCHMARK(BM_StridedLinearBufferCopyFillLinear);
CREATE_BENCHMARK(BM_StridedLinearBufferCopyFillScatter);
CREATE_BENCHMARK(BM_StridedLinearBufferCopyGather);

These are the results before the change:
Run on gonnet.zrh (48 X 2594 MHz CPUs); 2023-04-20T02:00:41.041679817-07:00
CPU: Intel Haswell with HyperThreading (24 cores) dL1:32KB dL2:256KB dL3:30MB
Benchmark                                                   Time(ns)        CPU(ns)     Iterations
--------------------------------------------------------------------------------------------------
BM_StridedLinearBufferCopyLinear<float>/10/0_mean                  4.61           4.60   148657641  
BM_StridedLinearBufferCopyLinear<float>/10/1_mean                  5.60           5.55   122490604  
BM_StridedLinearBufferCopyLinear<float>/10/2_mean                  5.92           5.95    90081675  
BM_StridedLinearBufferCopyLinear<float>/10/3_mean                  6.21           6.20   103781757  
BM_StridedLinearBufferCopyLinear<float>/10/4_mean                  6.67           6.67    98333440  
BM_StridedLinearBufferCopyLinear<float>/10/5_mean                  7.27           7.19    92921701  
BM_StridedLinearBufferCopyLinear<float>/10/6_mean                  7.17           7.19    95629489  
BM_StridedLinearBufferCopyLinear<float>/10/7_mean                  8.44           8.41    86564025  
BM_StridedLinearBufferCopyLinear<double>/10/0_mean                 4.65           4.65   129877156  
BM_StridedLinearBufferCopyLinear<double>/10/1_mean                 5.56           5.62   123804223  
BM_StridedLinearBufferCopyLinear<double>/10/2_mean                 5.92           6.01   117764227  
BM_StridedLinearBufferCopyLinear<double>/10/3_mean                 6.22           6.23   109170634  
BM_StridedLinearBufferCopyScatter<float>/10/0_mean                25.8           26.3     27789649  
BM_StridedLinearBufferCopyScatter<float>/10/1_mean                26.7           26.5     26698954  
BM_StridedLinearBufferCopyScatter<float>/10/2_mean                26.3           26.3     27055923  
BM_StridedLinearBufferCopyScatter<float>/10/3_mean                26.9           26.7     25942966  
BM_StridedLinearBufferCopyScatter<float>/10/4_mean                27.3           26.9     25579153  
BM_StridedLinearBufferCopyScatter<float>/10/5_mean                28.1           27.9     24336030  
BM_StridedLinearBufferCopyScatter<float>/10/6_mean                28.9           29.3     24055241  
BM_StridedLinearBufferCopyScatter<float>/10/7_mean                29.0           29.1     24201064  
BM_StridedLinearBufferCopyScatter<double>/10/0_mean               14.2           14.3     51049048  
BM_StridedLinearBufferCopyScatter<double>/10/1_mean               13.3           13.4     54687735  
BM_StridedLinearBufferCopyScatter<double>/10/2_mean               13.6           13.7     52260623  
BM_StridedLinearBufferCopyScatter<double>/10/3_mean               14.4           14.4     47349048  
BM_StridedLinearBufferCopyFillLinear<float>/10/0_mean              3.53           3.54   196561355  
BM_StridedLinearBufferCopyFillLinear<float>/10/1_mean              4.70           4.71   126585448  
BM_StridedLinearBufferCopyFillLinear<float>/10/2_mean              5.10           5.18   123645598  
BM_StridedLinearBufferCopyFillLinear<float>/10/3_mean              5.43           5.43   126781294  
BM_StridedLinearBufferCopyFillLinear<float>/10/4_mean              5.97           6.06   111594166  
BM_StridedLinearBufferCopyFillLinear<float>/10/5_mean              6.39           6.37    88540177  
BM_StridedLinearBufferCopyFillLinear<float>/10/6_mean              6.78           6.84    95933919  
BM_StridedLinearBufferCopyFillLinear<float>/10/7_mean              7.24           7.23    92916630  
BM_StridedLinearBufferCopyFillLinear<double>/10/0_mean             3.35           3.39   205481242  
BM_StridedLinearBufferCopyFillLinear<double>/10/1_mean             4.56           4.61   126582456  
BM_StridedLinearBufferCopyFillLinear<double>/10/2_mean             5.02           5.09   129027005  
BM_StridedLinearBufferCopyFillLinear<double>/10/3_mean             5.50           5.55   120377372  
BM_StridedLinearBufferCopyFillScatter<float>/10/0_mean            24.7           24.9     27841243  
BM_StridedLinearBufferCopyFillScatter<float>/10/1_mean            25.1           24.7     26702424  
BM_StridedLinearBufferCopyFillScatter<float>/10/2_mean            25.3           25.6     27833392  
BM_StridedLinearBufferCopyFillScatter<float>/10/3_mean            25.6           25.3     25918611  
BM_StridedLinearBufferCopyFillScatter<float>/10/4_mean            25.7           25.4     26326241  
BM_StridedLinearBufferCopyFillScatter<float>/10/5_mean            26.6           26.8     25666331  
BM_StridedLinearBufferCopyFillScatter<float>/10/6_mean            27.1           27.3     26045742  
BM_StridedLinearBufferCopyFillScatter<float>/10/7_mean            27.8           27.5     24600928  
BM_StridedLinearBufferCopyFillScatter<double>/10/0_mean           12.6           12.5     55896468  
BM_StridedLinearBufferCopyFillScatter<double>/10/1_mean           13.0           12.9     53489777  
BM_StridedLinearBufferCopyFillScatter<double>/10/2_mean           13.7           13.7     52233089  
BM_StridedLinearBufferCopyFillScatter<double>/10/3_mean           13.7           13.7     53468665  
BM_StridedLinearBufferCopyGather<float>/10/0_mean                 19.4           19.6     36365543  
BM_StridedLinearBufferCopyGather<float>/10/1_mean                 18.4           18.1     35718133  
BM_StridedLinearBufferCopyGather<float>/10/2_mean                 18.6           18.4     37917347  
BM_StridedLinearBufferCopyGather<float>/10/3_mean                 18.8           18.7     35731989  
BM_StridedLinearBufferCopyGather<float>/10/4_mean                 19.2           19.0     37200322  
BM_StridedLinearBufferCopyGather<float>/10/5_mean                 19.7           19.5     34028747  
BM_StridedLinearBufferCopyGather<float>/10/6_mean                 20.3           20.2     33193722  
BM_StridedLinearBufferCopyGather<float>/10/7_mean                 20.4           20.3     34032379  
BM_StridedLinearBufferCopyGather<double>/10/0_mean                12.0           11.8     52511524  
BM_StridedLinearBufferCopyGather<double>/10/1_mean                11.5           11.5     60765169  
BM_StridedLinearBufferCopyGather<double>/10/2_mean                11.8           11.8     58332651  
BM_StridedLinearBufferCopyGather<double>/10/3_mean                12.5           12.7     54688558  

And the result of pprof --list=StridedLinearBufferCopy /tmp/tensor_block_benchmark_test.prof:
         .          .   1030:  template <typename StridedLinearBufferCopy::Kind kind>
         .          .   1031:  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
         .          .   1032:                                                        const Src& src,
         .          .   1033:                                                        const size_t count) {
         .     49.51s   1034:    Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
         .          .   1035:              src.data);
         .          .   1036:  }
         .          .   1037:
         .          .   1038: private:
         .          .   1039:  template <typename StridedLinearBufferCopy::Kind kind>
         .          .   1040:  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
         .          .   1041:      const IndexType count, const IndexType dst_offset,
         .          .   1042:      const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
         .          .   1043:      const IndexType src_offset, const IndexType src_stride,
         .          .   1044:      const Scalar* EIGEN_RESTRICT src_data) {
         .          .   1045:    const Scalar* src = &src_data[src_offset];
         .          .   1046:    Scalar* dst = &dst_data[dst_offset];
         .          .   1047:
         .          .   1048:    if (!Vectorizable) {
         .          .   1049:      for (Index i = 0; i < count; ++i) {
         .          .   1050:        dst[i * dst_stride] = src[i * src_stride];
         .          .   1051:      }
         .          .   1052:      return;
         .          .   1053:    }
         .          .   1054:
         .          .   1055:    const IndexType vectorized_size = count - PacketSize;
         .          .   1056:    IndexType i = 0;
         .          .   1057:
         .          .   1058:    if (kind == StridedLinearBufferCopy::Kind::Linear) {
         .          .   1059:      // ******************************************************************** //
         .          .   1060:      // Linear copy from `src` to `dst`.
         .          .   1061:      const IndexType unrolled_size = count - 4 * PacketSize;
         .          .   1062:      eigen_assert(src_stride == 1 && dst_stride == 1);
     710ms      710ms   1063:      for (; i <= unrolled_size; i += 4 * PacketSize) {
         .          .   1064:        for (int j = 0; j < 4; ++j) {
         .      2.06s   1065:          Packet p = ploadu<Packet>(src + i + j * PacketSize);
         .      590ms   1066:          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
         .          .   1067:        }
         .          .   1068:      }
     1.28s      1.28s   1069:      for (; i <= vectorized_size; i += PacketSize) {
         .       60ms   1070:        Packet p = ploadu<Packet>(src + i);
         .      420ms   1071:        pstoreu<Scalar, Packet>(dst + i, p);
         .          .   1072:      }
     1.36s      1.36s   1073:      for (; i < count; ++i) {
     1.10s      1.10s   1074:        dst[i] = src[i];
         .          .   1075:      }
         .          .   1076:      // ******************************************************************** //
         .          .   1077:    } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
         .          .   1078:      // Scatter from `src` to `dst`.
         .          .   1079:      eigen_assert(src_stride == 1 && dst_stride != 1);
     1.04s      1.04s   1080:      for (; i <= vectorized_size; i += PacketSize) {
         .          .   1081:        Packet p = ploadu<Packet>(src + i);
         .      9.37s   1082:        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
         .          .   1083:      }
     700ms      700ms   1084:      for (; i < count; ++i) {
      60ms       60ms   1085:        dst[i * dst_stride] = src[i];
         .          .   1086:      }
         .          .   1087:      // ******************************************************************** //
         .          .   1088:    } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
         .          .   1089:      // Fill `dst` with value at `*src`.
         .          .   1090:      eigen_assert(src_stride == 0 && dst_stride == 1);
         .          .   1091:      const IndexType unrolled_size = count - 4 * PacketSize;
         .      720ms   1092:      Packet p = pload1<Packet>(src);
     780ms      780ms   1093:      for (; i <= unrolled_size; i += 4 * PacketSize) {
         .          .   1094:        for (int j = 0; j < 4; ++j) {
         .      2.34s   1095:          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
         .          .   1096:        }
         .          .   1097:      }
     750ms      750ms   1098:      for (; i <= vectorized_size; i += PacketSize) {
         .      430ms   1099:        pstoreu<Scalar, Packet>(dst + i, p);
         .          .   1100:      }
     1.58s      1.58s   1101:      for (; i < count; ++i) {
     1.38s      1.38s   1102:        dst[i] = *src;
         .          .   1103:      }
         .          .   1104:      // ******************************************************************** //
         .          .   1105:    } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
         .          .   1106:      // Scatter `*src` into `dst`.
         .          .   1107:      eigen_assert(src_stride == 0 && dst_stride != 1);
         .      150ms   1108:      Packet p = pload1<Packet>(src);
     930ms      930ms   1109:      for (; i <= vectorized_size; i += PacketSize) {
         .      8.87s   1110:        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
         .          .   1111:      }
     590ms      590ms   1112:      for (; i < count; ++i) {
     450ms      450ms   1113:        dst[i * dst_stride] = *src;
         .          .   1114:      }
         .          .   1115:      // ******************************************************************** //
         .          .   1116:    } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
         .          .   1117:      // Gather from `src` into `dst`.
         .          .   1118:      eigen_assert(dst_stride == 1);
     690ms      690ms   1119:      for (; i <= vectorized_size; i += PacketSize) {
         .      5.24s   1120:        Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
         .      3.11s   1121:        pstoreu<Scalar, Packet>(dst + i, p);
         .          .   1122:      }
     1.15s      1.15s   1123:      for (; i < count; ++i) {
      10ms       10ms   1124:        dst[i] = src[i * src_stride];
         .          .   1125:      }
         .          .   1126:      // ******************************************************************** //
         .          .   1127:    } else if (kind == StridedLinearBufferCopy::Kind::Random) {
         .          .   1128:      // Random.
         .          .   1129:      for (; i < count; ++i) {

And these are the results with this MR:
Run on gonnet.zrh (48 X 2594 MHz CPUs); 2023-04-20T01:48:05.813475259-07:00
CPU: Intel Haswell with HyperThreading (24 cores) dL1:32KB dL2:256KB dL3:30MB
Benchmark                                                   Time(ns)        CPU(ns)     Iterations
--------------------------------------------------------------------------------------------------
BM_StridedLinearBufferCopyLinear<float>/10/0_mean                  4.70           4.80   123291440  
BM_StridedLinearBufferCopyLinear<float>/10/1_mean                  6.28           6.33   107396720  
BM_StridedLinearBufferCopyLinear<float>/10/2_mean                  7.24           7.27    87505633  
BM_StridedLinearBufferCopyLinear<float>/10/3_mean                  7.33           7.27    87495105  
BM_StridedLinearBufferCopyLinear<float>/10/4_mean                  6.16           6.15   107382784  
BM_StridedLinearBufferCopyLinear<float>/10/5_mean                  5.60           5.62   121796808  
BM_StridedLinearBufferCopyLinear<float>/10/6_mean                  6.39           6.36    93073315  
BM_StridedLinearBufferCopyLinear<float>/10/7_mean                  6.76           6.72    92914680  
BM_StridedLinearBufferCopyLinear<double>/10/0_mean                 4.73           4.80   129885590  
BM_StridedLinearBufferCopyLinear<double>/10/1_mean                 6.51           6.56   102967312  
BM_StridedLinearBufferCopyLinear<double>/10/2_mean                 6.12           6.12   109194717  
BM_StridedLinearBufferCopyLinear<double>/10/3_mean                 5.74           5.71   119825492  
BM_StridedLinearBufferCopyScatter<float>/10/0_mean                25.0           25.2     27810621  
BM_StridedLinearBufferCopyScatter<float>/10/1_mean                25.1           25.4     28188447  
BM_StridedLinearBufferCopyScatter<float>/10/2_mean                25.3           25.1     27058978  
BM_StridedLinearBufferCopyScatter<float>/10/3_mean                25.6           25.5     27466771  
BM_StridedLinearBufferCopyScatter<float>/10/4_mean                25.9           26.0     27431745  
BM_StridedLinearBufferCopyScatter<float>/10/5_mean                26.7           26.7     26044859  
BM_StridedLinearBufferCopyScatter<float>/10/6_mean                27.2           27.1     25664379  
BM_StridedLinearBufferCopyScatter<float>/10/7_mean                27.4           27.5     25758283  
BM_StridedLinearBufferCopyScatter<double>/10/0_mean               12.8           12.8     53473872  
BM_StridedLinearBufferCopyScatter<double>/10/1_mean               13.0           13.0     57118077  
BM_StridedLinearBufferCopyScatter<double>/10/2_mean               13.1           13.2     55880641  
BM_StridedLinearBufferCopyScatter<double>/10/3_mean               13.9           13.9     52546924  
BM_StridedLinearBufferCopyFillLinear<float>/10/0_mean              3.89           3.90   172241583  
BM_StridedLinearBufferCopyFillLinear<float>/10/1_mean              4.77           4.77   120000000  
BM_StridedLinearBufferCopyFillLinear<float>/10/2_mean              5.10           5.10   130905186  
BM_StridedLinearBufferCopyFillLinear<float>/10/3_mean              5.39           5.40   123085108  
BM_StridedLinearBufferCopyFillLinear<float>/10/4_mean              3.95           3.97   174488581  
BM_StridedLinearBufferCopyFillLinear<float>/10/5_mean              6.53           6.53    94337606  
BM_StridedLinearBufferCopyFillLinear<float>/10/6_mean              5.85           5.84   114341069  
BM_StridedLinearBufferCopyFillLinear<float>/10/7_mean              6.18           6.20    90360074  
BM_StridedLinearBufferCopyFillLinear<double>/10/0_mean             3.69           3.69   187667735  
BM_StridedLinearBufferCopyFillLinear<double>/10/1_mean             4.75           4.75   144057221  
BM_StridedLinearBufferCopyFillLinear<double>/10/2_mean             3.94           3.96   175955939  
BM_StridedLinearBufferCopyFillLinear<double>/10/3_mean             6.10           6.09   103702513  
BM_StridedLinearBufferCopyFillScatter<float>/10/0_mean            24.8           24.9     28220207  
BM_StridedLinearBufferCopyFillScatter<float>/10/1_mean            25.1           25.0     27079599  
BM_StridedLinearBufferCopyFillScatter<float>/10/2_mean            25.2           25.2     27018279  
BM_StridedLinearBufferCopyFillScatter<float>/10/3_mean            25.9           26.1     27855702  
BM_StridedLinearBufferCopyFillScatter<float>/10/4_mean            26.1           25.8     26322048  
BM_StridedLinearBufferCopyFillScatter<float>/10/5_mean            26.3           26.2     27013995  
BM_StridedLinearBufferCopyFillScatter<float>/10/6_mean            26.7           27.0     27437384  
BM_StridedLinearBufferCopyFillScatter<float>/10/7_mean            27.0           26.9     26318163  
BM_StridedLinearBufferCopyFillScatter<double>/10/0_mean           12.4           12.4     54689179  
BM_StridedLinearBufferCopyFillScatter<double>/10/1_mean           14.7           14.7     49826154  
BM_StridedLinearBufferCopyFillScatter<double>/10/2_mean           14.7           14.7     47616820  
BM_StridedLinearBufferCopyFillScatter<double>/10/3_mean           13.5           13.4     54684977  
BM_StridedLinearBufferCopyGather<float>/10/0_mean                 18.6           18.5     38647747  
BM_StridedLinearBufferCopyGather<float>/10/1_mean                 18.8           18.8     37839340  
BM_StridedLinearBufferCopyGather<float>/10/2_mean                 18.9           18.9     37054690  
BM_StridedLinearBufferCopyGather<float>/10/3_mean                 19.6           19.1     35355901  
BM_StridedLinearBufferCopyGather<float>/10/4_mean                 19.4           19.4     37187867  
BM_StridedLinearBufferCopyGather<float>/10/5_mean                 19.7           19.8     35021213  
BM_StridedLinearBufferCopyGather<float>/10/6_mean                 19.9           20.0     34520624  
BM_StridedLinearBufferCopyGather<float>/10/7_mean                 20.2           20.5     35247986  
BM_StridedLinearBufferCopyGather<double>/10/0_mean                11.0           11.0     59792320  
BM_StridedLinearBufferCopyGather<double>/10/1_mean                12.0           12.1     58329290  
BM_StridedLinearBufferCopyGather<double>/10/2_mean                11.7           11.7     63199313  
BM_StridedLinearBufferCopyGather<double>/10/3_mean                13.0           13.2     55901534  

And the result of pprof --list=StridedLinearBufferCopy /tmp/tensor_block_benchmark_test.prof:
         .          .   1034:  template <typename StridedLinearBufferCopy::Kind kind>
         .          .   1035:  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
         .          .   1036:                                                        const Src& src,
         .          .   1037:                                                        const size_t count) {
         .     47.43s   1038:    Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
         .          .   1039:              src.data);
         .          .   1040:  }
         .          .   1041:
         .          .   1042: private:
         .          .   1043:  template <typename StridedLinearBufferCopy::Kind kind>
         .          .   1044:  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
         .          .   1045:      const IndexType count, const IndexType dst_offset,
         .          .   1046:      const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
         .          .   1047:      const IndexType src_offset, const IndexType src_stride,
         .          .   1048:      const Scalar* EIGEN_RESTRICT src_data) {
         .          .   1049:    const Scalar* src = &src_data[src_offset];
         .          .   1050:    Scalar* dst = &dst_data[dst_offset];
         .          .   1051:
         .          .   1052:    if (!Vectorizable) {
         .          .   1053:      for (Index i = 0; i < count; ++i) {
         .          .   1054:        dst[i * dst_stride] = src[i * src_stride];
         .          .   1055:      }
         .          .   1056:      return;
         .          .   1057:    }
         .          .   1058:
         .          .   1059:    const IndexType vectorized_size = count - PacketSize;
         .          .   1060:    IndexType i = 0;
         .          .   1061:
         .          .   1062:    if (kind == StridedLinearBufferCopy::Kind::Linear) {
         .          .   1063:      // ******************************************************************** //
         .          .   1064:      // Linear copy from `src` to `dst`.
         .          .   1065:      const IndexType unrolled_size = count - 4 * PacketSize;
         .          .   1066:      eigen_assert(src_stride == 1 && dst_stride == 1);
     570ms      570ms   1067:      for (; i <= unrolled_size; i += 4 * PacketSize) {
         .          .   1068:        for (int j = 0; j < 4; ++j) {
         .      1.72s   1069:          Packet p = ploadu<Packet>(src + i + j * PacketSize);
         .      880ms   1070:          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
         .          .   1071:        }
         .          .   1072:      }
     1.21s      1.21s   1073:      for (; i <= vectorized_size; i += PacketSize) {
         .       90ms   1074:        Packet p = ploadu<Packet>(src + i);
         .      390ms   1075:        pstoreu<Scalar, Packet>(dst + i, p);
         .          .   1076:      }
         .          .   1077:      if (HasHalfPacket) {
         .          .   1078:        const IndexType vectorized_half_size = count - HalfPacketSize;
     380ms      380ms   1079:        for (; i <= vectorized_half_size; i += HalfPacketSize) {
         .       90ms   1080:          HalfPacket p = ploadu<HalfPacket>(src + i);
         .       10ms   1081:          pstoreu<Scalar, HalfPacket>(dst + i, p);
         .          .   1082:        }
         .          .   1083:      }
     690ms      690ms   1084:      for (; i < count; ++i) {
     270ms      270ms   1085:        dst[i] = src[i];
         .          .   1086:      }
         .          .   1087:      // ******************************************************************** //
         .          .   1088:    } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
         .          .   1089:      // Scatter from `src` to `dst`.
         .          .   1090:      eigen_assert(src_stride == 1 && dst_stride != 1);
     950ms      950ms   1091:      for (; i <= vectorized_size; i += PacketSize) {
         .          .   1092:        Packet p = ploadu<Packet>(src + i);
         .      9.12s   1093:        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
         .          .   1094:      }
         .          .   1095:      if (HasHalfPacket) {
         .          .   1096:        const IndexType vectorized_half_size = count - HalfPacketSize;
     300ms      300ms   1097:        for (; i <= vectorized_half_size; i += HalfPacketSize) {
         .          .   1098:          HalfPacket p = ploadu<HalfPacket>(src + i);
         .      100ms   1099:          pscatter<Scalar, HalfPacket>(dst + i * dst_stride, p, dst_stride);
         .          .   1100:        }
         .          .   1101:      }
     570ms      570ms   1102:      for (; i < count; ++i) {
      60ms       60ms   1103:        dst[i * dst_stride] = src[i];
         .          .   1104:      }
         .          .   1105:      // ******************************************************************** //
         .          .   1106:    } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
         .          .   1107:      // Fill `dst` with value at `*src`.
         .          .   1108:      eigen_assert(src_stride == 0 && dst_stride == 1);
         .          .   1109:      const IndexType unrolled_size = count - 4 * PacketSize;
         .          .   1110:      Scalar s = *src;
         .      770ms   1111:      Packet p = pset1<Packet>(s);
     820ms      820ms   1112:      for (; i <= unrolled_size; i += 4 * PacketSize) {
         .          .   1113:        for (int j = 0; j < 4; ++j) {
         .      2.38s   1114:          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
         .          .   1115:        }
         .          .   1116:      }
     900ms      900ms   1117:      for (; i <= vectorized_size; i += PacketSize) {
         .      650ms   1118:        pstoreu<Scalar, Packet>(dst + i, p);
         .          .   1119:      }
         .          .   1120:      if (HasHalfPacket) {
         .          .   1121:        const IndexType vectorized_half_size = count - HalfPacketSize;
         .       80ms   1122:        HalfPacket hp = pset1<HalfPacket>(s);
     680ms      680ms   1123:        for (; i <= vectorized_half_size; i += HalfPacketSize) {
         .       30ms   1124:          pstoreu<Scalar, HalfPacket>(dst + i, hp);
         .          .   1125:        }
         .          .   1126:      }
     630ms      630ms   1127:      for (; i < count; ++i) {
     470ms      470ms   1128:        dst[i] = s;
         .          .   1129:      }
         .          .   1130:      // ******************************************************************** //
         .          .   1131:    } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
         .          .   1132:      // Scatter `*src` into `dst`.
         .          .   1133:      eigen_assert(src_stride == 0 && dst_stride != 1);
      90ms       90ms   1134:      Scalar s = *src;
         .          .   1135:      Packet p = pset1<Packet>(s);
     1.09s      1.09s   1136:      for (; i <= vectorized_size; i += PacketSize) {
         .      8.89s   1137:        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
         .          .   1138:      }
         .          .   1139:      if (HasHalfPacket) {
         .          .   1140:        const IndexType vectorized_half_size = count - HalfPacketSize;
         .          .   1141:        HalfPacket hp = pset1<HalfPacket>(s);
     200ms      200ms   1142:        for (; i <= vectorized_half_size; i += HalfPacketSize) {
         .      170ms   1143:          pscatter<Scalar, HalfPacket>(dst + i * dst_stride, hp, dst_stride);
         .          .   1144:        }
         .          .   1145:      }
     480ms      480ms   1146:      for (; i < count; ++i) {
      60ms       60ms   1147:        dst[i * dst_stride] = s;
         .          .   1148:      }
         .          .   1149:      // ******************************************************************** //
         .          .   1150:    } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
         .          .   1151:      // Gather from `src` into `dst`.
         .          .   1152:      eigen_assert(dst_stride == 1);
     750ms      750ms   1153:      for (; i <= vectorized_size; i += PacketSize) {
         .      5.68s   1154:        Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
         .      2.66s   1155:        pstoreu<Scalar, Packet>(dst + i, p);
         .          .   1156:      }
         .          .   1157:      if (HasHalfPacket) {
         .          .   1158:        const IndexType vectorized_half_size = count - HalfPacketSize;
     150ms      150ms   1159:        for (; i <= vectorized_half_size; i += HalfPacketSize) {
         .          .   1160:          HalfPacket p =
         .       90ms   1161:              pgather<Scalar, HalfPacket>(src + i * src_stride, src_stride);
         .       50ms   1162:          pstoreu<Scalar, HalfPacket>(dst + i, p);
         .          .   1163:        }
         .          .   1164:      }
     660ms      660ms   1165:      for (; i < count; ++i) {
     170ms      170ms   1166:        dst[i] = src[i * src_stride];
         .          .   1167:      }
         .          .   1168:      // ******************************************************************** //
         .          .   1169:    } else if (kind == StridedLinearBufferCopy::Kind::Random) {
         .          .   1170:      // Random.
         .          .   1171:      for (; i < count; ++i) {

Note that the absolute times (in ms) in the pprof listings are not comparable with each other, since the benchmarking suite runs each test until a minimum number of ms has elapsed, and not for a fixed number of iterations.