Avoid atomic false sharing in RunQueue
Add EIGEN_ALIGN_TO_AVOID_FALSE_SHARING macro to align data to a cache line boundary.
Benchmark:
// Thread pool benchmark:
//
// 1. Thread pool of size `n_threads`
// 2. Main thread launches `n0` tasks into the thread pool.
// 3. Each task launches `n1` tasks into the thread pool.
//
// Overall we end up running `n0 * n1` tasks in the thread pool in each
// benchmark iteration. We use `n1` separate atomic counters for inner tasks
// to minimize races between separate tasks. Benchmark iteration waits for the
// completion of all `n0 * n1` tasks.
// Benchmarks task scheduling throughput of an Eigen::ThreadPool.
//
// Template parameters:
//   n_threads - number of threads the pool is constructed with.
//   n0        - number of tasks that increment each counter per iteration.
//   n1        - number of counters, and the number of barrier notifications
//               awaited per iteration.
//
// Every benchmark iteration schedules `n0 * n1` tasks from the thread that
// runs the benchmark loop and then waits on a barrier armed with `n1`.
template <size_t n_threads, size_t n0, size_t n1>
static void BM_ThreadPool(benchmark::State& state) {
  Eigen::ThreadPool pool(n_threads);

  // Each counter is aligned to the destructive interference size so that
  // tasks updating different counters do not share a cache line.
  struct Counter {
    alignas(std::hardware_destructive_interference_size)
        std::atomic<int32_t> counter;
  };

  std::array<Counter, n1> n1_cnt;
  for (size_t i = 0; i < n1; ++i) n1_cnt[i].counter.store(0);

  for (auto _ : state) {
    Eigen::Barrier barrier(n1);

    for (size_t i = 0; i < n0; ++i) {
      for (size_t j = 0; j < n1; ++j) {
        // Only `j` is needed inside the task; the outer index merely repeats
        // the submission `n0` times.
        pool.Schedule([j, &n1_cnt, &barrier]() {
          const auto cnt =
              n1_cnt[j].counter.fetch_add(1, std::memory_order_relaxed);
          // The counters are never reset between iterations: each iteration
          // adds exactly `n0` to every counter, so the task that brings a
          // counter to a multiple of `n0` is the one that notifies.
          if (static_cast<size_t>(cnt + 1) % n0 == 0) barrier.Notify();
        });
      }
    }

    barrier.Wait();
  }
}
// Registers BM_ThreadPool for every combination of n_threads in
// {8, 16, 32, 64} and n1 in {10, 100, 1000}, with n0 fixed at 10.
BENCHMARK(BM_ThreadPool</*n_threads=*/8, /*n0=*/10, /*n1=*/10>);
BENCHMARK(BM_ThreadPool</*n_threads=*/8, /*n0=*/10, /*n1=*/100>);
BENCHMARK(BM_ThreadPool</*n_threads=*/8, /*n0=*/10, /*n1=*/1000>);
BENCHMARK(BM_ThreadPool</*n_threads=*/16, /*n0=*/10, /*n1=*/10>);
BENCHMARK(BM_ThreadPool</*n_threads=*/16, /*n0=*/10, /*n1=*/100>);
BENCHMARK(BM_ThreadPool</*n_threads=*/16, /*n0=*/10, /*n1=*/1000>);
BENCHMARK(BM_ThreadPool</*n_threads=*/32, /*n0=*/10, /*n1=*/10>);
BENCHMARK(BM_ThreadPool</*n_threads=*/32, /*n0=*/10, /*n1=*/100>);
BENCHMARK(BM_ThreadPool</*n_threads=*/32, /*n0=*/10, /*n1=*/1000>);
BENCHMARK(BM_ThreadPool</*n_threads=*/64, /*n0=*/10, /*n1=*/10>);
BENCHMARK(BM_ThreadPool</*n_threads=*/64, /*n0=*/10, /*n1=*/100>);
BENCHMARK(BM_ThreadPool</*n_threads=*/64, /*n0=*/10, /*n1=*/1000>);
Benchmark results:
AMD Rome (Zen2) CPU
name old cpu/op new cpu/op delta
BM_ThreadPool< 8, 10, 10> 1.12ms ± 6% 1.11ms ± 6% ~ (p=0.095 n=118+119)
BM_ThreadPool< 8, 10, 100> 11.2ms ± 6% 11.2ms ± 6% ~ (p=0.552 n=117+117)
BM_ThreadPool< 8, 10, 1000> 112ms ± 5% 112ms ± 8% ~ (p=0.768 n=119+119)
BM_ThreadPool< 16, 10, 10> 1.10ms ± 7% 1.09ms ± 9% -0.83% (p=0.026 n=119+120)
BM_ThreadPool< 16, 10, 100> 11.0ms ± 8% 11.0ms ± 8% ~ (p=0.486 n=120+117)
BM_ThreadPool< 16, 10, 1000> 111ms ± 7% 111ms ± 7% ~ (p=0.838 n=118+118)
BM_ThreadPool< 32, 10, 10> 1.11ms ± 8% 1.11ms ± 9% ~ (p=0.262 n=119+119)
BM_ThreadPool< 32, 10, 100> 11.2ms ± 9% 11.1ms ± 9% ~ (p=0.413 n=120+120)
BM_ThreadPool< 32, 10, 1000> 111ms ± 8% 110ms ± 9% ~ (p=0.056 n=120+120)
BM_ThreadPool< 64, 10, 10> 1.14ms ± 9% 1.14ms ± 9% ~ (p=0.247 n=119+118)
BM_ThreadPool< 64, 10, 100> 11.5ms ± 9% 11.3ms ± 9% -1.85% (p=0.000 n=120+120)
BM_ThreadPool< 64, 10, 1000> 114ms ±11% 113ms ± 8% -1.09% (p=0.002 n=120+114)
name old time/op new time/op delta
BM_ThreadPool< 8, 10, 10> 314µs ± 7% 311µs ± 8% ~ (p=0.054 n=118+119)
BM_ThreadPool< 8, 10, 100> 3.14ms ± 8% 3.13ms ± 8% ~ (p=0.498 n=119+119)
BM_ThreadPool< 8, 10, 1000> 31.4ms ± 5% 31.2ms ± 9% ~ (p=0.431 n=118+117)
BM_ThreadPool< 16, 10, 10> 307µs ±10% 304µs ±12% -1.06% (p=0.042 n=120+120)
BM_ThreadPool< 16, 10, 100> 3.06ms ±12% 3.04ms ±11% ~ (p=0.142 n=120+119)
BM_ThreadPool< 16, 10, 1000> 30.8ms ±10% 30.7ms ±10% ~ (p=0.465 n=120+117)
BM_ThreadPool< 32, 10, 10> 307µs ±13% 304µs ±14% ~ (p=0.244 n=119+120)
BM_ThreadPool< 32, 10, 100> 3.08ms ±12% 3.05ms ±12% ~ (p=0.146 n=120+120)
BM_ThreadPool< 32, 10, 1000> 30.6ms ±11% 30.1ms ±12% -1.62% (p=0.026 n=120+120)
BM_ThreadPool< 64, 10, 10> 311µs ±13% 309µs ±14% ~ (p=0.331 n=119+120)
BM_ThreadPool< 64, 10, 100> 3.14ms ±14% 3.05ms ±15% -2.84% (p=0.000 n=118+120)
BM_ThreadPool< 64, 10, 1000> 31.2ms ±15% 30.7ms ±13% -1.51% (p=0.006 n=120+120)
name old INSTRUCTIONS/op new INSTRUCTIONS/op delta
BM_ThreadPool< 8, 10, 10> 3.18M ± 7% 2.94M ± 8% -7.73% (p=0.000 n=119+118)
BM_ThreadPool< 8, 10, 100> 31.4M ± 8% 28.7M ± 9% -8.68% (p=0.000 n=118+120)
BM_ThreadPool< 8, 10, 1000> 316M ± 7% 291M ± 9% -7.67% (p=0.000 n=118+114)
BM_ThreadPool< 16, 10, 10> 3.22M ±10% 3.04M ±13% -5.75% (p=0.000 n=120+120)
BM_ThreadPool< 16, 10, 100> 31.7M ±12% 29.8M ±12% -5.97% (p=0.000 n=120+120)
BM_ThreadPool< 16, 10, 1000> 322M ±12% 303M ±12% -5.84% (p=0.000 n=120+118)
BM_ThreadPool< 32, 10, 10> 3.16M ±16% 3.02M ±12% -4.63% (p=0.000 n=119+119)
BM_ThreadPool< 32, 10, 100> 31.7M ±13% 30.0M ±13% -5.31% (p=0.000 n=120+119)
BM_ThreadPool< 32, 10, 1000> 314M ±13% 295M ±14% -5.90% (p=0.000 n=120+120)
BM_ThreadPool< 64, 10, 10> 3.61M ±13% 3.42M ±15% -5.29% (p=0.000 n=119+118)
BM_ThreadPool< 64, 10, 100> 36.2M ±17% 33.5M ±15% -7.38% (p=0.000 n=119+120)
BM_ThreadPool< 64, 10, 1000> 362M ±15% 337M ±14% -6.67% (p=0.000 n=118+120)
name old CYCLES/op new CYCLES/op delta
BM_ThreadPool< 8, 10, 10> 1.59M ± 7% 1.57M ± 9% -1.35% (p=0.000 n=117+120)
BM_ThreadPool< 8, 10, 100> 15.9M ± 7% 15.9M ± 8% ~ (p=0.242 n=116+119)
BM_ThreadPool< 8, 10, 1000> 159M ± 6% 158M ± 9% -0.93% (p=0.033 n=118+116)
BM_ThreadPool< 16, 10, 10> 1.59M ± 9% 1.55M ± 9% -2.90% (p=0.000 n=118+118)
BM_ThreadPool< 16, 10, 100> 15.9M ± 8% 15.6M ±10% -2.25% (p=0.000 n=115+117)
BM_ThreadPool< 16, 10, 1000> 160M ± 9% 157M ± 9% -1.94% (p=0.000 n=117+116)
BM_ThreadPool< 32, 10, 10> 1.63M ± 8% 1.58M ± 9% -2.98% (p=0.000 n=116+116)
BM_ThreadPool< 32, 10, 100> 16.4M ±11% 15.9M ±10% -2.92% (p=0.000 n=118+117)
BM_ThreadPool< 32, 10, 1000> 163M ±10% 157M ±12% -3.51% (p=0.000 n=120+120)
BM_ThreadPool< 64, 10, 10> 1.73M ± 8% 1.67M ± 9% -3.81% (p=0.000 n=112+117)
BM_ThreadPool< 64, 10, 100> 17.5M ± 9% 16.6M ±10% -4.94% (p=0.000 n=115+119)
BM_ThreadPool< 64, 10, 1000> 174M ±10% 166M ± 8% -4.58% (p=0.000 n=118+113)
Intel CascadeLake
name old cpu/op new cpu/op delta
BM_ThreadPool< 8, 10, 10> 670µs ±18% 659µs ±16% ~ (p=0.060 n=118+117)
BM_ThreadPool< 8, 10, 100> 6.64ms ±17% 6.46ms ±13% -2.78% (p=0.000 n=114+105)
BM_ThreadPool< 8, 10, 1000> 67.5ms ±20% 65.8ms ±19% -2.45% (p=0.003 n=120+116)
BM_ThreadPool< 16, 10, 10> 678µs ±23% 660µs ±16% -2.62% (p=0.004 n=120+115)
BM_ThreadPool< 16, 10, 100> 6.77ms ±19% 6.68ms ±17% ~ (p=0.095 n=119+119)
BM_ThreadPool< 16, 10, 1000> 67.8ms ±21% 64.7ms ±14% -4.66% (p=0.000 n=119+104)
BM_ThreadPool< 32, 10, 10> 684µs ±28% 661µs ±18% -3.47% (p=0.001 n=120+114)
BM_ThreadPool< 32, 10, 100> 6.85ms ±22% 6.62ms ±20% -3.31% (p=0.000 n=120+117)
BM_ThreadPool< 32, 10, 1000> 68.3ms ±24% 66.6ms ±23% -2.39% (p=0.005 n=120+118)
BM_ThreadPool< 64, 10, 10> 712µs ±39% 695µs ±21% ~ (p=0.102 n=120+119)
BM_ThreadPool< 64, 10, 100> 7.22ms ±41% 6.89ms ±27% -4.52% (p=0.039 n=120+114)
BM_ThreadPool< 64, 10, 1000> 69.9ms ±26% 70.4ms ±34% ~ (p=0.565 n=115+116)
name old time/op new time/op delta
BM_ThreadPool< 8, 10, 10> 174µs ± 6% 172µs ± 5% -0.89% (p=0.002 n=113+116)
BM_ThreadPool< 8, 10, 100> 1.75ms ± 8% 1.72ms ± 6% -1.56% (p=0.000 n=117+113)
BM_ThreadPool< 8, 10, 1000> 17.4ms ± 7% 17.1ms ± 5% -1.82% (p=0.000 n=116+112)
BM_ThreadPool< 16, 10, 10> 174µs ± 7% 172µs ± 6% -1.46% (p=0.000 n=116+115)
BM_ThreadPool< 16, 10, 100> 1.74ms ± 8% 1.72ms ± 6% -1.53% (p=0.000 n=118+115)
BM_ThreadPool< 16, 10, 1000> 17.4ms ± 6% 17.1ms ± 8% -1.49% (p=0.000 n=119+118)
BM_ThreadPool< 32, 10, 10> 171µs ± 8% 168µs ± 8% -1.67% (p=0.000 n=118+117)
BM_ThreadPool< 32, 10, 100> 1.70ms ± 9% 1.68ms ± 8% -1.67% (p=0.000 n=119+119)
BM_ThreadPool< 32, 10, 1000> 17.0ms ± 8% 16.7ms ± 5% -2.16% (p=0.000 n=119+113)
BM_ThreadPool< 64, 10, 10> 171µs ±14% 166µs ±13% -3.01% (p=0.000 n=119+115)
BM_ThreadPool< 64, 10, 100> 1.70ms ±15% 1.65ms ±12% -3.10% (p=0.000 n=117+114)
BM_ThreadPool< 64, 10, 1000> 16.9ms ±14% 16.5ms ±14% -2.47% (p=0.000 n=117+113)
name old INSTRUCTIONS/op new INSTRUCTIONS/op delta
BM_ThreadPool< 8, 10, 10> 1.59M ± 7% 1.28M ± 4% -19.24% (p=0.000 n=114+111)
BM_ThreadPool< 8, 10, 100> 15.0M ± 6% 12.8M ± 4% -14.14% (p=0.000 n=113+111)
BM_ThreadPool< 8, 10, 1000> 160M ± 8% 129M ± 4% -19.31% (p=0.000 n=112+112)
BM_ThreadPool< 16, 10, 10> 1.70M ± 6% 1.35M ± 4% -20.59% (p=0.000 n=109+108)
BM_ThreadPool< 16, 10, 100> 16.4M ± 5% 13.5M ± 4% -17.71% (p=0.000 n=103+105)
BM_ThreadPool< 16, 10, 1000> 172M ± 6% 135M ± 5% -21.39% (p=0.000 n=109+107)
BM_ThreadPool< 32, 10, 10> 1.68M ± 6% 1.39M ± 4% -16.93% (p=0.000 n=115+104)
BM_ThreadPool< 32, 10, 100> 16.7M ± 6% 13.9M ± 5% -16.75% (p=0.000 n=113+102)
BM_ThreadPool< 32, 10, 1000> 169M ± 7% 140M ± 5% -16.98% (p=0.000 n=111+104)
BM_ThreadPool< 64, 10, 10> 1.98M ±10% 1.63M ± 7% -17.62% (p=0.000 n=120+106)
BM_ThreadPool< 64, 10, 100> 19.7M ±10% 16.4M ± 8% -17.09% (p=0.000 n=111+112)
BM_ThreadPool< 64, 10, 1000> 198M ± 9% 163M ± 7% -17.50% (p=0.000 n=114+104)
name old CYCLES/op new CYCLES/op delta
BM_ThreadPool< 8, 10, 10> 1.05M ± 5% 1.06M ± 4% +0.42% (p=0.030 n=113+111)
BM_ThreadPool< 8, 10, 100> 10.5M ± 5% 10.6M ± 5% +1.18% (p=0.000 n=110+115)
BM_ThreadPool< 8, 10, 1000> 105M ± 5% 105M ± 5% ~ (p=0.064 n=115+112)
BM_ThreadPool< 16, 10, 10> 1.07M ± 4% 1.07M ± 4% ~ (p=0.795 n=109+112)
BM_ThreadPool< 16, 10, 100> 10.7M ± 6% 10.7M ± 5% ~ (p=0.273 n=113+114)
BM_ThreadPool< 16, 10, 1000> 107M ± 5% 106M ± 5% -0.49% (p=0.029 n=115+112)
BM_ThreadPool< 32, 10, 10> 1.10M ± 5% 1.09M ± 5% -0.65% (p=0.016 n=110+112)
BM_ThreadPool< 32, 10, 100> 11.0M ± 5% 10.9M ± 5% -1.10% (p=0.000 n=115+111)
BM_ThreadPool< 32, 10, 1000> 110M ± 6% 108M ± 6% -1.08% (p=0.000 n=116+110)
BM_ThreadPool< 64, 10, 10> 1.18M ± 5% 1.20M ± 4% +1.26% (p=0.000 n=106+109)
BM_ThreadPool< 64, 10, 100> 11.8M ± 5% 11.9M ± 4% +0.89% (p=0.000 n=103+108)
BM_ThreadPool< 64, 10, 1000> 118M ± 6% 119M ± 6% +0.66% (p=0.012 n=112+109)
Edited by Eugene Zhulenev