Enable packet segment in partial redux
Reference issue
What does this implement/fix?
Additional information
Compiler and flags: Ubuntu clang version 18.1.3, `-DNDEBUG -O3 -mavx2`
| Benchmark | Time | Time change | CPU | CPU change | Iterations | Iterations change |
|---|---|---|---|---|---|---|
| test_segment_rowwise_control/1 | 1.12 | -8.93% | 1.12 | -8.93% | 623303252 | 8.73% |
| test_segment_rowwise_control/8 | 3.16 | -12.03% | 3.16 | -12.03% | 220053279 | 14.35% |
| test_segment_rowwise_control/64 | 72.3 | -1.24% | 72.2 | -1.11% | 9705963 | 1.11% |
| test_segment_rowwise_control/512 | 14123 | -0.57% | 14121 | -0.59% | 49681 | 0.36% |
| test_segment_rowwise_control/1024 | 245572 | -0.83% | 245503 | -0.82% | 2796 | 1.25% |
| test_segment_rowwise_1/1 | 1.13 | -9.73% | 1.13 | -9.73% | 624569605 | 9.44% |
| test_segment_rowwise_1/8 | 3.16 | -11.71% | 3.16 | -11.71% | 221901482 | 13.36% |
| test_segment_rowwise_1/64 | 72.3 | -1.38% | 72.3 | -1.38% | 9686812 | 0.65% |
| test_segment_rowwise_1/512 | 14104 | -0.50% | 14101 | -0.48% | 49521 | 0.64% |
| test_segment_rowwise_1/1024 | 245312 | -0.70% | 245256 | -0.68% | 2834 | -0.88% |
| test_segment_rowwise_half/1 | 4.5 | -42.00% | 4.5 | -42.00% | 156829481 | 70.60% |
| test_segment_rowwise_half/8 | 11.8 | -47.80% | 11.8 | -47.80% | 58620389 | 94.46% |
| test_segment_rowwise_half/64 | 120 | -22.08% | 120 | -22.08% | 5777833 | 29.63% |
| test_segment_rowwise_half/512 | 9379 | -4.62% | 9376 | -4.61% | 74755 | 4.94% |
| test_segment_rowwise_half/1024 | 64958 | -3.75% | 64956 | -3.77% | 10817 | 2.12% |
| test_segment_rowwise_m1/1 | 9.64 | -65.25% | 9.63 | -65.21% | 72592615 | 188.59% |
| test_segment_rowwise_m1/8 | 20 | -64.90% | 20 | -64.90% | 35250256 | 183.58% |
| test_segment_rowwise_m1/64 | 174 | -35.63% | 174 | -35.63% | 3999856 | 55.68% |
| test_segment_rowwise_m1/512 | 13056 | -6.36% | 13055 | -6.37% | 53592 | 7.75% |
| test_segment_rowwise_m1/1024 | 76313 | -2.59% | 76298 | -2.61% | 9102 | 1.90% |
#include <benchmark/benchmark.h>
#include <Eigen/Core>
using namespace Eigen;
using T = float;
using Mat = MatrixX<T>;
using Vec = VectorX<T>;
// Control case: times a row-wise sum of a square matrix whose dimension is
// state.range(0) rounded down to a multiple of the packet size of T, with no
// extra offset added to the dimension.
static void test_segment_rowwise_control(benchmark::State& state) {
  constexpr Index kPacket = internal::packet_traits<T>::size;
  const Index dim = numext::round_down(state.range(0), kPacket);

  Mat input(dim, dim);
  input.setRandom();
  Vec rowSums(dim);

  for (auto _ : state) {
    rowSums = input.rowwise().sum();
    // Keep both operands observable so the reduction is not optimized away.
    benchmark::DoNotOptimize(input);
    benchmark::DoNotOptimize(rowSums);
  }
}
// "+1" case: times a row-wise sum of a square matrix whose dimension is one
// more than state.range(0) rounded down to a multiple of the packet size of T.
// The `+ 1` is what distinguishes this case from test_segment_rowwise_control;
// without it the two benchmarks measure exactly the same problem size.
static void test_segment_rowwise_1(benchmark::State& state) {
  constexpr Index PacketSize = internal::packet_traits<T>::size;
  // Follows the same pattern as the `_half` (+ PacketSize / 2) and
  // `_m1` (+ PacketSize - 1) cases, with an offset of a single element.
  const Index n = numext::round_down(state.range(0), PacketSize) + 1;

  Mat A(n, n);
  Vec b(n);
  A.setRandom();

  for (auto _ : state) {
    b = A.rowwise().sum();
    // Keep both operands observable so the reduction is not optimized away.
    benchmark::DoNotOptimize(A);
    benchmark::DoNotOptimize(b);
  }
}
// "Half packet" case: times a row-wise sum of a square matrix whose dimension
// is state.range(0) rounded down to a multiple of the packet size of T, plus
// half a packet (PacketSize / 2, integer division).
static void test_segment_rowwise_half(benchmark::State& state) {
  constexpr Index kPacket = internal::packet_traits<T>::size;
  const Index alignedDim = numext::round_down(state.range(0), kPacket);
  const Index dim = alignedDim + kPacket / 2;

  Mat input(dim, dim);
  input.setRandom();
  Vec rowSums(dim);

  for (auto _ : state) {
    rowSums = input.rowwise().sum();
    // Keep both operands observable so the reduction is not optimized away.
    benchmark::DoNotOptimize(input);
    benchmark::DoNotOptimize(rowSums);
  }
}
// "Minus one" case: times a row-wise sum of a square matrix whose dimension is
// state.range(0) rounded down to a multiple of the packet size of T, plus
// PacketSize - 1 (one element short of the next multiple).
static void test_segment_rowwise_m1(benchmark::State& state) {
  constexpr Index kPacket = internal::packet_traits<T>::size;
  const Index alignedDim = numext::round_down(state.range(0), kPacket);
  const Index dim = alignedDim + kPacket - 1;

  Mat input(dim, dim);
  input.setRandom();
  Vec rowSums(dim);

  for (auto _ : state) {
    rowSums = input.rowwise().sum();
    // Keep both operands observable so the reduction is not optimized away.
    benchmark::DoNotOptimize(input);
    benchmark::DoNotOptimize(rowSums);
  }
}
// Register every case with the same argument sweep: state.range(0) runs from
// 1 up to 1024 (1 << 10).
BENCHMARK(test_segment_rowwise_control)->Range(1, 1024);
BENCHMARK(test_segment_rowwise_1)->Range(1, 1024);
BENCHMARK(test_segment_rowwise_half)->Range(1, 1024);
BENCHMARK(test_segment_rowwise_m1)->Range(1, 1024);

// Provides the program entry point that runs the registered benchmarks.
BENCHMARK_MAIN();
Edited by Charles Schlosser