Add half-`Packet` operations to `StridedLinearBufferCopy`.
Reference issue
None.
What does this implement/fix?
The operations in StridedLinearBufferCopy::Run operate either on groups of Packet, single Packets, or on Scalar.
In some cases, though, e.g. when using AVX, which provides Packet8f, we don't have enough data to fill a single Packet8f, but still enough to make the Scalar operations slow.
This MR checks whether the Packet implementation provides a half-Packet, and if so, provides the operations using half-Packets where appropriate.
Note that this code avoids checking packet_traits<Scalar>::HasHalfPacket since this appears to only ever be set in Eigen/src/Core/arch/AVX512/PacketMathFP16.h, even though almost all Packet provide a distinct packet_traits<Scalar>::half.
Additional information
None.
Benchmarks
To evaluate the change, I created the following benchmark suite based on github.com/google/benchmark:
template <typename Scalar>
void BM_StridedLinearBufferCopyLinear(benchmark::State& state) {
  // Benchmarks a stride-1 to stride-1 copy of `range(0)` full packets plus
  // `range(1)` trailing scalars (the remainder the half-packet path targets).
  const int packet_size = Eigen::internal::packet_traits<Scalar>::size;
  const int total_size = state.range(0) * packet_size + state.range(1);
  // Dense 1D source and destination of identical length.
  Eigen::Tensor<Scalar, 1> source(total_size);
  Eigen::Tensor<Scalar, 1> destination(total_size);
  source.setRandom();
  destination.setZero();
  using Copier = Eigen::internal::StridedLinearBufferCopy<Scalar, int>;
  for (auto s : state) {
    Copier::template Run<Copier::Kind::Linear>(
        {/*offset=*/0, /*stride=*/1, /*data=*/destination.data()},
        {/*offset=*/0, /*stride=*/1, /*data=*/source.data()},
        /*count=*/total_size);
    // Touch the output so the copy cannot be optimized away.
    CHECK_EQ(destination(0), source(0));
  }
}
template <typename Scalar>
void BM_StridedLinearBufferCopyScatter(benchmark::State& state) {
  // Benchmarks a scatter: contiguous source written to a stride-2 destination,
  // over `range(0)` full packets plus `range(1)` trailing scalars.
  const int packet_size = Eigen::internal::packet_traits<Scalar>::size;
  const int total_size = state.range(0) * packet_size + state.range(1);
  // Destination is twice as long so that stride-2 writes stay in bounds.
  Eigen::Tensor<Scalar, 1> source(total_size);
  Eigen::Tensor<Scalar, 1> destination(2 * total_size);
  source.setRandom();
  destination.setZero();
  using Copier = Eigen::internal::StridedLinearBufferCopy<Scalar, int>;
  for (auto s : state) {
    Copier::template Run<Copier::Kind::Scatter>(
        {/*offset=*/0, /*stride=*/2, /*data=*/destination.data()},
        {/*offset=*/0, /*stride=*/1, /*data=*/source.data()},
        /*count=*/total_size);
    // Touch the output so the copy cannot be optimized away.
    CHECK_EQ(destination(0), source(0));
  }
}
template <typename Scalar>
void BM_StridedLinearBufferCopyFillLinear(benchmark::State& state) {
  // Benchmarks broadcasting a single value (stride-0 source) into a dense
  // destination of `range(0)` full packets plus `range(1)` trailing scalars.
  const int packet_size = Eigen::internal::packet_traits<Scalar>::size;
  const int total_size = state.range(0) * packet_size + state.range(1);
  // A one-element source provides the fill value.
  Eigen::Tensor<Scalar, 1> source(1);
  Eigen::Tensor<Scalar, 1> destination(total_size);
  source.setRandom();
  destination.setZero();
  using Copier = Eigen::internal::StridedLinearBufferCopy<Scalar, int>;
  for (auto s : state) {
    Copier::template Run<Copier::Kind::FillLinear>(
        {/*offset=*/0, /*stride=*/1, /*data=*/destination.data()},
        {/*offset=*/0, /*stride=*/0, /*data=*/source.data()},
        /*count=*/total_size);
    // Touch the output so the fill cannot be optimized away.
    CHECK_EQ(destination(0), source(0));
  }
}
template <typename Scalar>
void BM_StridedLinearBufferCopyFillScatter(benchmark::State& state) {
  // Benchmarks broadcasting a single value (stride-0 source) into a stride-2
  // destination, over `range(0)` full packets plus `range(1)` extra scalars.
  const int packet_size = Eigen::internal::packet_traits<Scalar>::size;
  const int total_size = state.range(0) * packet_size + state.range(1);
  // One-element source; destination is twice as long for the stride-2 writes.
  Eigen::Tensor<Scalar, 1> source(1);
  Eigen::Tensor<Scalar, 1> destination(2 * total_size);
  source.setRandom();
  destination.setZero();
  using Copier = Eigen::internal::StridedLinearBufferCopy<Scalar, int>;
  for (auto s : state) {
    Copier::template Run<Copier::Kind::FillScatter>(
        {/*offset=*/0, /*stride=*/2, /*data=*/destination.data()},
        {/*offset=*/0, /*stride=*/0, /*data=*/source.data()},
        /*count=*/total_size);
    // Touch the output so the fill cannot be optimized away.
    CHECK_EQ(destination(0), source(0));
  }
}
template <typename Scalar>
void BM_StridedLinearBufferCopyGather(benchmark::State& state) {
  // Benchmarks a gather: stride-2 source read into a contiguous destination,
  // over `range(0)` full packets plus `range(1)` trailing scalars.
  const int packet_size = Eigen::internal::packet_traits<Scalar>::size;
  const int total_size = state.range(0) * packet_size + state.range(1);
  // Source is twice as long so that stride-2 reads stay in bounds.
  Eigen::Tensor<Scalar, 1> source(2 * total_size);
  Eigen::Tensor<Scalar, 1> destination(total_size);
  source.setRandom();
  destination.setZero();
  using Copier = Eigen::internal::StridedLinearBufferCopy<Scalar, int>;
  for (auto s : state) {
    Copier::template Run<Copier::Kind::Gather>(
        {/*offset=*/0, /*stride=*/1, /*data=*/destination.data()},
        {/*offset=*/0, /*stride=*/2, /*data=*/source.data()},
        /*count=*/total_size);
    // Touch the output so the copy cannot be optimized away.
    CHECK_EQ(destination(0), source(0));
  }
}
// Registers `benchmark_function<float>` with 10 full packets and 0..7 extra
// scalars — one case per possible remainder of an 8-wide AVX float packet.
// NOTE: no comments may appear on the continuation lines themselves, since
// `//` would swallow the trailing backslash.
#define CREATE_BENCHMARK_FLOAT(benchmark_function) \
BENCHMARK(benchmark_function<float>) \
->ArgPair(10, 0) \
->ArgPair(10, 1) \
->ArgPair(10, 2) \
->ArgPair(10, 3) \
->ArgPair(10, 4) \
->ArgPair(10, 5) \
->ArgPair(10, 6) \
->ArgPair(10, 7)
// Registers `benchmark_function<double>` with 10 full packets and 0..3 extra
// scalars — one case per possible remainder of a 4-wide AVX double packet.
#define CREATE_BENCHMARK_DOUBLE(benchmark_function) \
BENCHMARK(benchmark_function<double>) \
->ArgPair(10, 0) \
->ArgPair(10, 1) \
->ArgPair(10, 2) \
->ArgPair(10, 3)
// Registers both the float and the double variants of `benchmark_function`.
#define CREATE_BENCHMARK(benchmark_function) \
CREATE_BENCHMARK_FLOAT(benchmark_function); \
CREATE_BENCHMARK_DOUBLE(benchmark_function)
// Register all five copy-kind benchmarks for float and double.
CREATE_BENCHMARK(BM_StridedLinearBufferCopyLinear);
CREATE_BENCHMARK(BM_StridedLinearBufferCopyScatter);
CREATE_BENCHMARK(BM_StridedLinearBufferCopyFillLinear);
CREATE_BENCHMARK(BM_StridedLinearBufferCopyFillScatter);
CREATE_BENCHMARK(BM_StridedLinearBufferCopyGather);
These are the results before the change:
Run on gonnet.zrh (48 X 2594 MHz CPUs); 2023-04-20T02:00:41.041679817-07:00
CPU: Intel Haswell with HyperThreading (24 cores) dL1:32KB dL2:256KB dL3:30MB
Benchmark Time(ns) CPU(ns) Iterations
--------------------------------------------------------------------------------------------------
BM_StridedLinearBufferCopyLinear<float>/10/0_mean 4.61 4.60 148657641
BM_StridedLinearBufferCopyLinear<float>/10/1_mean 5.60 5.55 122490604
BM_StridedLinearBufferCopyLinear<float>/10/2_mean 5.92 5.95 90081675
BM_StridedLinearBufferCopyLinear<float>/10/3_mean 6.21 6.20 103781757
BM_StridedLinearBufferCopyLinear<float>/10/4_mean 6.67 6.67 98333440
BM_StridedLinearBufferCopyLinear<float>/10/5_mean 7.27 7.19 92921701
BM_StridedLinearBufferCopyLinear<float>/10/6_mean 7.17 7.19 95629489
BM_StridedLinearBufferCopyLinear<float>/10/7_mean 8.44 8.41 86564025
BM_StridedLinearBufferCopyLinear<double>/10/0_mean 4.65 4.65 129877156
BM_StridedLinearBufferCopyLinear<double>/10/1_mean 5.56 5.62 123804223
BM_StridedLinearBufferCopyLinear<double>/10/2_mean 5.92 6.01 117764227
BM_StridedLinearBufferCopyLinear<double>/10/3_mean 6.22 6.23 109170634
BM_StridedLinearBufferCopyScatter<float>/10/0_mean 25.8 26.3 27789649
BM_StridedLinearBufferCopyScatter<float>/10/1_mean 26.7 26.5 26698954
BM_StridedLinearBufferCopyScatter<float>/10/2_mean 26.3 26.3 27055923
BM_StridedLinearBufferCopyScatter<float>/10/3_mean 26.9 26.7 25942966
BM_StridedLinearBufferCopyScatter<float>/10/4_mean 27.3 26.9 25579153
BM_StridedLinearBufferCopyScatter<float>/10/5_mean 28.1 27.9 24336030
BM_StridedLinearBufferCopyScatter<float>/10/6_mean 28.9 29.3 24055241
BM_StridedLinearBufferCopyScatter<float>/10/7_mean 29.0 29.1 24201064
BM_StridedLinearBufferCopyScatter<double>/10/0_mean 14.2 14.3 51049048
BM_StridedLinearBufferCopyScatter<double>/10/1_mean 13.3 13.4 54687735
BM_StridedLinearBufferCopyScatter<double>/10/2_mean 13.6 13.7 52260623
BM_StridedLinearBufferCopyScatter<double>/10/3_mean 14.4 14.4 47349048
BM_StridedLinearBufferCopyFillLinear<float>/10/0_mean 3.53 3.54 196561355
BM_StridedLinearBufferCopyFillLinear<float>/10/1_mean 4.70 4.71 126585448
BM_StridedLinearBufferCopyFillLinear<float>/10/2_mean 5.10 5.18 123645598
BM_StridedLinearBufferCopyFillLinear<float>/10/3_mean 5.43 5.43 126781294
BM_StridedLinearBufferCopyFillLinear<float>/10/4_mean 5.97 6.06 111594166
BM_StridedLinearBufferCopyFillLinear<float>/10/5_mean 6.39 6.37 88540177
BM_StridedLinearBufferCopyFillLinear<float>/10/6_mean 6.78 6.84 95933919
BM_StridedLinearBufferCopyFillLinear<float>/10/7_mean 7.24 7.23 92916630
BM_StridedLinearBufferCopyFillLinear<double>/10/0_mean 3.35 3.39 205481242
BM_StridedLinearBufferCopyFillLinear<double>/10/1_mean 4.56 4.61 126582456
BM_StridedLinearBufferCopyFillLinear<double>/10/2_mean 5.02 5.09 129027005
BM_StridedLinearBufferCopyFillLinear<double>/10/3_mean 5.50 5.55 120377372
BM_StridedLinearBufferCopyFillScatter<float>/10/0_mean 24.7 24.9 27841243
BM_StridedLinearBufferCopyFillScatter<float>/10/1_mean 25.1 24.7 26702424
BM_StridedLinearBufferCopyFillScatter<float>/10/2_mean 25.3 25.6 27833392
BM_StridedLinearBufferCopyFillScatter<float>/10/3_mean 25.6 25.3 25918611
BM_StridedLinearBufferCopyFillScatter<float>/10/4_mean 25.7 25.4 26326241
BM_StridedLinearBufferCopyFillScatter<float>/10/5_mean 26.6 26.8 25666331
BM_StridedLinearBufferCopyFillScatter<float>/10/6_mean 27.1 27.3 26045742
BM_StridedLinearBufferCopyFillScatter<float>/10/7_mean 27.8 27.5 24600928
BM_StridedLinearBufferCopyFillScatter<double>/10/0_mean 12.6 12.5 55896468
BM_StridedLinearBufferCopyFillScatter<double>/10/1_mean 13.0 12.9 53489777
BM_StridedLinearBufferCopyFillScatter<double>/10/2_mean 13.7 13.7 52233089
BM_StridedLinearBufferCopyFillScatter<double>/10/3_mean 13.7 13.7 53468665
BM_StridedLinearBufferCopyGather<float>/10/0_mean 19.4 19.6 36365543
BM_StridedLinearBufferCopyGather<float>/10/1_mean 18.4 18.1 35718133
BM_StridedLinearBufferCopyGather<float>/10/2_mean 18.6 18.4 37917347
BM_StridedLinearBufferCopyGather<float>/10/3_mean 18.8 18.7 35731989
BM_StridedLinearBufferCopyGather<float>/10/4_mean 19.2 19.0 37200322
BM_StridedLinearBufferCopyGather<float>/10/5_mean 19.7 19.5 34028747
BM_StridedLinearBufferCopyGather<float>/10/6_mean 20.3 20.2 33193722
BM_StridedLinearBufferCopyGather<float>/10/7_mean 20.4 20.3 34032379
BM_StridedLinearBufferCopyGather<double>/10/0_mean 12.0 11.8 52511524
BM_StridedLinearBufferCopyGather<double>/10/1_mean 11.5 11.5 60765169
BM_StridedLinearBufferCopyGather<double>/10/2_mean 11.8 11.8 58332651
BM_StridedLinearBufferCopyGather<double>/10/3_mean 12.5 12.7 54688558
And the result of pprof --list=StridedLinearBufferCopy /tmp/tensor_block_benchmark_test.prof:
. . 1030: template <typename StridedLinearBufferCopy::Kind kind>
. . 1031: static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
. . 1032: const Src& src,
. . 1033: const size_t count) {
. 49.51s 1034: Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
. . 1035: src.data);
. . 1036: }
. . 1037:
. . 1038: private:
. . 1039: template <typename StridedLinearBufferCopy::Kind kind>
. . 1040: static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
. . 1041: const IndexType count, const IndexType dst_offset,
. . 1042: const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
. . 1043: const IndexType src_offset, const IndexType src_stride,
. . 1044: const Scalar* EIGEN_RESTRICT src_data) {
. . 1045: const Scalar* src = &src_data[src_offset];
. . 1046: Scalar* dst = &dst_data[dst_offset];
. . 1047:
. . 1048: if (!Vectorizable) {
. . 1049: for (Index i = 0; i < count; ++i) {
. . 1050: dst[i * dst_stride] = src[i * src_stride];
. . 1051: }
. . 1052: return;
. . 1053: }
. . 1054:
. . 1055: const IndexType vectorized_size = count - PacketSize;
. . 1056: IndexType i = 0;
. . 1057:
. . 1058: if (kind == StridedLinearBufferCopy::Kind::Linear) {
. . 1059: // ******************************************************************** //
. . 1060: // Linear copy from `src` to `dst`.
. . 1061: const IndexType unrolled_size = count - 4 * PacketSize;
. . 1062: eigen_assert(src_stride == 1 && dst_stride == 1);
710ms 710ms 1063: for (; i <= unrolled_size; i += 4 * PacketSize) {
. . 1064: for (int j = 0; j < 4; ++j) {
. 2.06s 1065: Packet p = ploadu<Packet>(src + i + j * PacketSize);
. 590ms 1066: pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
. . 1067: }
. . 1068: }
1.28s 1.28s 1069: for (; i <= vectorized_size; i += PacketSize) {
. 60ms 1070: Packet p = ploadu<Packet>(src + i);
. 420ms 1071: pstoreu<Scalar, Packet>(dst + i, p);
. . 1072: }
1.36s 1.36s 1073: for (; i < count; ++i) {
1.10s 1.10s 1074: dst[i] = src[i];
. . 1075: }
. . 1076: // ******************************************************************** //
. . 1077: } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
. . 1078: // Scatter from `src` to `dst`.
. . 1079: eigen_assert(src_stride == 1 && dst_stride != 1);
1.04s 1.04s 1080: for (; i <= vectorized_size; i += PacketSize) {
. . 1081: Packet p = ploadu<Packet>(src + i);
. 9.37s 1082: pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
. . 1083: }
700ms 700ms 1084: for (; i < count; ++i) {
60ms 60ms 1085: dst[i * dst_stride] = src[i];
. . 1086: }
. . 1087: // ******************************************************************** //
. . 1088: } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
. . 1089: // Fill `dst` with value at `*src`.
. . 1090: eigen_assert(src_stride == 0 && dst_stride == 1);
. . 1091: const IndexType unrolled_size = count - 4 * PacketSize;
. 720ms 1092: Packet p = pload1<Packet>(src);
780ms 780ms 1093: for (; i <= unrolled_size; i += 4 * PacketSize) {
. . 1094: for (int j = 0; j < 4; ++j) {
. 2.34s 1095: pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
. . 1096: }
. . 1097: }
750ms 750ms 1098: for (; i <= vectorized_size; i += PacketSize) {
. 430ms 1099: pstoreu<Scalar, Packet>(dst + i, p);
. . 1100: }
1.58s 1.58s 1101: for (; i < count; ++i) {
1.38s 1.38s 1102: dst[i] = *src;
. . 1103: }
. . 1104: // ******************************************************************** //
. . 1105: } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
. . 1106: // Scatter `*src` into `dst`.
. . 1107: eigen_assert(src_stride == 0 && dst_stride != 1);
. 150ms 1108: Packet p = pload1<Packet>(src);
930ms 930ms 1109: for (; i <= vectorized_size; i += PacketSize) {
. 8.87s 1110: pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
. . 1111: }
590ms 590ms 1112: for (; i < count; ++i) {
450ms 450ms 1113: dst[i * dst_stride] = *src;
. . 1114: }
. . 1115: // ******************************************************************** //
. . 1116: } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
. . 1117: // Gather from `src` into `dst`.
. . 1118: eigen_assert(dst_stride == 1);
690ms 690ms 1119: for (; i <= vectorized_size; i += PacketSize) {
. 5.24s 1120: Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
. 3.11s 1121: pstoreu<Scalar, Packet>(dst + i, p);
. . 1122: }
1.15s 1.15s 1123: for (; i < count; ++i) {
10ms 10ms 1124: dst[i] = src[i * src_stride];
. . 1125: }
. . 1126: // ******************************************************************** //
. . 1127: } else if (kind == StridedLinearBufferCopy::Kind::Random) {
. . 1128: // Random.
. . 1129: for (; i < count; ++i) {
And these are the results with this MR:
Run on gonnet.zrh (48 X 2594 MHz CPUs); 2023-04-20T01:48:05.813475259-07:00
CPU: Intel Haswell with HyperThreading (24 cores) dL1:32KB dL2:256KB dL3:30MB
Benchmark Time(ns) CPU(ns) Iterations
--------------------------------------------------------------------------------------------------
BM_StridedLinearBufferCopyLinear<float>/10/0_mean 4.70 4.80 123291440
BM_StridedLinearBufferCopyLinear<float>/10/1_mean 6.28 6.33 107396720
BM_StridedLinearBufferCopyLinear<float>/10/2_mean 7.24 7.27 87505633
BM_StridedLinearBufferCopyLinear<float>/10/3_mean 7.33 7.27 87495105
BM_StridedLinearBufferCopyLinear<float>/10/4_mean 6.16 6.15 107382784
BM_StridedLinearBufferCopyLinear<float>/10/5_mean 5.60 5.62 121796808
BM_StridedLinearBufferCopyLinear<float>/10/6_mean 6.39 6.36 93073315
BM_StridedLinearBufferCopyLinear<float>/10/7_mean 6.76 6.72 92914680
BM_StridedLinearBufferCopyLinear<double>/10/0_mean 4.73 4.80 129885590
BM_StridedLinearBufferCopyLinear<double>/10/1_mean 6.51 6.56 102967312
BM_StridedLinearBufferCopyLinear<double>/10/2_mean 6.12 6.12 109194717
BM_StridedLinearBufferCopyLinear<double>/10/3_mean 5.74 5.71 119825492
BM_StridedLinearBufferCopyScatter<float>/10/0_mean 25.0 25.2 27810621
BM_StridedLinearBufferCopyScatter<float>/10/1_mean 25.1 25.4 28188447
BM_StridedLinearBufferCopyScatter<float>/10/2_mean 25.3 25.1 27058978
BM_StridedLinearBufferCopyScatter<float>/10/3_mean 25.6 25.5 27466771
BM_StridedLinearBufferCopyScatter<float>/10/4_mean 25.9 26.0 27431745
BM_StridedLinearBufferCopyScatter<float>/10/5_mean 26.7 26.7 26044859
BM_StridedLinearBufferCopyScatter<float>/10/6_mean 27.2 27.1 25664379
BM_StridedLinearBufferCopyScatter<float>/10/7_mean 27.4 27.5 25758283
BM_StridedLinearBufferCopyScatter<double>/10/0_mean 12.8 12.8 53473872
BM_StridedLinearBufferCopyScatter<double>/10/1_mean 13.0 13.0 57118077
BM_StridedLinearBufferCopyScatter<double>/10/2_mean 13.1 13.2 55880641
BM_StridedLinearBufferCopyScatter<double>/10/3_mean 13.9 13.9 52546924
BM_StridedLinearBufferCopyFillLinear<float>/10/0_mean 3.89 3.90 172241583
BM_StridedLinearBufferCopyFillLinear<float>/10/1_mean 4.77 4.77 120000000
BM_StridedLinearBufferCopyFillLinear<float>/10/2_mean 5.10 5.10 130905186
BM_StridedLinearBufferCopyFillLinear<float>/10/3_mean 5.39 5.40 123085108
BM_StridedLinearBufferCopyFillLinear<float>/10/4_mean 3.95 3.97 174488581
BM_StridedLinearBufferCopyFillLinear<float>/10/5_mean 6.53 6.53 94337606
BM_StridedLinearBufferCopyFillLinear<float>/10/6_mean 5.85 5.84 114341069
BM_StridedLinearBufferCopyFillLinear<float>/10/7_mean 6.18 6.20 90360074
BM_StridedLinearBufferCopyFillLinear<double>/10/0_mean 3.69 3.69 187667735
BM_StridedLinearBufferCopyFillLinear<double>/10/1_mean 4.75 4.75 144057221
BM_StridedLinearBufferCopyFillLinear<double>/10/2_mean 3.94 3.96 175955939
BM_StridedLinearBufferCopyFillLinear<double>/10/3_mean 6.10 6.09 103702513
BM_StridedLinearBufferCopyFillScatter<float>/10/0_mean 24.8 24.9 28220207
BM_StridedLinearBufferCopyFillScatter<float>/10/1_mean 25.1 25.0 27079599
BM_StridedLinearBufferCopyFillScatter<float>/10/2_mean 25.2 25.2 27018279
BM_StridedLinearBufferCopyFillScatter<float>/10/3_mean 25.9 26.1 27855702
BM_StridedLinearBufferCopyFillScatter<float>/10/4_mean 26.1 25.8 26322048
BM_StridedLinearBufferCopyFillScatter<float>/10/5_mean 26.3 26.2 27013995
BM_StridedLinearBufferCopyFillScatter<float>/10/6_mean 26.7 27.0 27437384
BM_StridedLinearBufferCopyFillScatter<float>/10/7_mean 27.0 26.9 26318163
BM_StridedLinearBufferCopyFillScatter<double>/10/0_mean 12.4 12.4 54689179
BM_StridedLinearBufferCopyFillScatter<double>/10/1_mean 14.7 14.7 49826154
BM_StridedLinearBufferCopyFillScatter<double>/10/2_mean 14.7 14.7 47616820
BM_StridedLinearBufferCopyFillScatter<double>/10/3_mean 13.5 13.4 54684977
BM_StridedLinearBufferCopyGather<float>/10/0_mean 18.6 18.5 38647747
BM_StridedLinearBufferCopyGather<float>/10/1_mean 18.8 18.8 37839340
BM_StridedLinearBufferCopyGather<float>/10/2_mean 18.9 18.9 37054690
BM_StridedLinearBufferCopyGather<float>/10/3_mean 19.6 19.1 35355901
BM_StridedLinearBufferCopyGather<float>/10/4_mean 19.4 19.4 37187867
BM_StridedLinearBufferCopyGather<float>/10/5_mean 19.7 19.8 35021213
BM_StridedLinearBufferCopyGather<float>/10/6_mean 19.9 20.0 34520624
BM_StridedLinearBufferCopyGather<float>/10/7_mean 20.2 20.5 35247986
BM_StridedLinearBufferCopyGather<double>/10/0_mean 11.0 11.0 59792320
BM_StridedLinearBufferCopyGather<double>/10/1_mean 12.0 12.1 58329290
BM_StridedLinearBufferCopyGather<double>/10/2_mean 11.7 11.7 63199313
BM_StridedLinearBufferCopyGather<double>/10/3_mean 13.0 13.2 55901534
And the result of pprof --list=StridedLinearBufferCopy /tmp/tensor_block_benchmark_test.prof:
. . 1034: template <typename StridedLinearBufferCopy::Kind kind>
. . 1035: static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
. . 1036: const Src& src,
. . 1037: const size_t count) {
. 47.43s 1038: Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
. . 1039: src.data);
. . 1040: }
. . 1041:
. . 1042: private:
. . 1043: template <typename StridedLinearBufferCopy::Kind kind>
. . 1044: static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
. . 1045: const IndexType count, const IndexType dst_offset,
. . 1046: const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
. . 1047: const IndexType src_offset, const IndexType src_stride,
. . 1048: const Scalar* EIGEN_RESTRICT src_data) {
. . 1049: const Scalar* src = &src_data[src_offset];
. . 1050: Scalar* dst = &dst_data[dst_offset];
. . 1051:
. . 1052: if (!Vectorizable) {
. . 1053: for (Index i = 0; i < count; ++i) {
. . 1054: dst[i * dst_stride] = src[i * src_stride];
. . 1055: }
. . 1056: return;
. . 1057: }
. . 1058:
. . 1059: const IndexType vectorized_size = count - PacketSize;
. . 1060: IndexType i = 0;
. . 1061:
. . 1062: if (kind == StridedLinearBufferCopy::Kind::Linear) {
. . 1063: // ******************************************************************** //
. . 1064: // Linear copy from `src` to `dst`.
. . 1065: const IndexType unrolled_size = count - 4 * PacketSize;
. . 1066: eigen_assert(src_stride == 1 && dst_stride == 1);
570ms 570ms 1067: for (; i <= unrolled_size; i += 4 * PacketSize) {
. . 1068: for (int j = 0; j < 4; ++j) {
. 1.72s 1069: Packet p = ploadu<Packet>(src + i + j * PacketSize);
. 880ms 1070: pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
. . 1071: }
. . 1072: }
1.21s 1.21s 1073: for (; i <= vectorized_size; i += PacketSize) {
. 90ms 1074: Packet p = ploadu<Packet>(src + i);
. 390ms 1075: pstoreu<Scalar, Packet>(dst + i, p);
. . 1076: }
. . 1077: if (HasHalfPacket) {
. . 1078: const IndexType vectorized_half_size = count - HalfPacketSize;
380ms 380ms 1079: for (; i <= vectorized_half_size; i += HalfPacketSize) {
. 90ms 1080: HalfPacket p = ploadu<HalfPacket>(src + i);
. 10ms 1081: pstoreu<Scalar, HalfPacket>(dst + i, p);
. . 1082: }
. . 1083: }
690ms 690ms 1084: for (; i < count; ++i) {
270ms 270ms 1085: dst[i] = src[i];
. . 1086: }
. . 1087: // ******************************************************************** //
. . 1088: } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
. . 1089: // Scatter from `src` to `dst`.
. . 1090: eigen_assert(src_stride == 1 && dst_stride != 1);
950ms 950ms 1091: for (; i <= vectorized_size; i += PacketSize) {
. . 1092: Packet p = ploadu<Packet>(src + i);
. 9.12s 1093: pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
. . 1094: }
. . 1095: if (HasHalfPacket) {
. . 1096: const IndexType vectorized_half_size = count - HalfPacketSize;
300ms 300ms 1097: for (; i <= vectorized_half_size; i += HalfPacketSize) {
. . 1098: HalfPacket p = ploadu<HalfPacket>(src + i);
. 100ms 1099: pscatter<Scalar, HalfPacket>(dst + i * dst_stride, p, dst_stride);
. . 1100: }
. . 1101: }
570ms 570ms 1102: for (; i < count; ++i) {
60ms 60ms 1103: dst[i * dst_stride] = src[i];
. . 1104: }
. . 1105: // ******************************************************************** //
. . 1106: } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
. . 1107: // Fill `dst` with value at `*src`.
. . 1108: eigen_assert(src_stride == 0 && dst_stride == 1);
. . 1109: const IndexType unrolled_size = count - 4 * PacketSize;
. . 1110: Scalar s = *src;
. 770ms 1111: Packet p = pset1<Packet>(s);
820ms 820ms 1112: for (; i <= unrolled_size; i += 4 * PacketSize) {
. . 1113: for (int j = 0; j < 4; ++j) {
. 2.38s 1114: pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
. . 1115: }
. . 1116: }
900ms 900ms 1117: for (; i <= vectorized_size; i += PacketSize) {
. 650ms 1118: pstoreu<Scalar, Packet>(dst + i, p);
. . 1119: }
. . 1120: if (HasHalfPacket) {
. . 1121: const IndexType vectorized_half_size = count - HalfPacketSize;
. 80ms 1122: HalfPacket hp = pset1<HalfPacket>(s);
680ms 680ms 1123: for (; i <= vectorized_half_size; i += HalfPacketSize) {
. 30ms 1124: pstoreu<Scalar, HalfPacket>(dst + i, hp);
. . 1125: }
. . 1126: }
630ms 630ms 1127: for (; i < count; ++i) {
470ms 470ms 1128: dst[i] = s;
. . 1129: }
. . 1130: // ******************************************************************** //
. . 1131: } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
. . 1132: // Scatter `*src` into `dst`.
. . 1133: eigen_assert(src_stride == 0 && dst_stride != 1);
90ms 90ms 1134: Scalar s = *src;
. . 1135: Packet p = pset1<Packet>(s);
1.09s 1.09s 1136: for (; i <= vectorized_size; i += PacketSize) {
. 8.89s 1137: pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
. . 1138: }
. . 1139: if (HasHalfPacket) {
. . 1140: const IndexType vectorized_half_size = count - HalfPacketSize;
. . 1141: HalfPacket hp = pset1<HalfPacket>(s);
200ms 200ms 1142: for (; i <= vectorized_half_size; i += HalfPacketSize) {
. 170ms 1143: pscatter<Scalar, HalfPacket>(dst + i * dst_stride, hp, dst_stride);
. . 1144: }
. . 1145: }
480ms 480ms 1146: for (; i < count; ++i) {
60ms 60ms 1147: dst[i * dst_stride] = s;
. . 1148: }
. . 1149: // ******************************************************************** //
. . 1150: } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
. . 1151: // Gather from `src` into `dst`.
. . 1152: eigen_assert(dst_stride == 1);
750ms 750ms 1153: for (; i <= vectorized_size; i += PacketSize) {
. 5.68s 1154: Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
. 2.66s 1155: pstoreu<Scalar, Packet>(dst + i, p);
. . 1156: }
. . 1157: if (HasHalfPacket) {
. . 1158: const IndexType vectorized_half_size = count - HalfPacketSize;
150ms 150ms 1159: for (; i <= vectorized_half_size; i += HalfPacketSize) {
. . 1160: HalfPacket p =
. 90ms 1161: pgather<Scalar, HalfPacket>(src + i * src_stride, src_stride);
. 50ms 1162: pstoreu<Scalar, HalfPacket>(dst + i, p);
. . 1163: }
. . 1164: }
660ms 660ms 1165: for (; i < count; ++i) {
170ms 170ms 1166: dst[i] = src[i * src_stride];
. . 1167: }
. . 1168: // ******************************************************************** //
. . 1169: } else if (kind == StridedLinearBufferCopy::Kind::Random) {
. . 1170: // Random.
. . 1171: for (; i < count; ++i) {
Note that the absolute ms in the pprof listings are not comparable since the benchmarking suite will run each test until a minimum number of ms have been reached, and not for a fixed number of iterations.