Avoid atomic false sharing in RunQueue

Add EIGEN_ALIGN_TO_AVOID_FALSE_SHARING macro to align data to a cache line boundary.

Benchmark:

// Thread pool benchmark:
//
// 1. Thread pool of size `n_threads`.
// 2. The benchmark thread schedules `n0 * n1` tasks into the thread pool
//    (`n0` rounds of `n1` tasks each).
// 3. Task `(i, j)` increments the `j`-th atomic counter.
//
// Overall we end up running `n0 * n1` tasks in the thread pool in each
// benchmark iteration. We use `n1` separate atomic counters, each aligned to
// `std::hardware_destructive_interference_size`, to minimize contention
// between separate tasks. Each benchmark iteration waits for the completion of
// all `n0 * n1` tasks.
//
// Template parameters:
//   n_threads - number of threads in the pool.
//   n0        - number of tasks that increment each counter per iteration.
//   n1        - number of counters (and barrier notifications per iteration).
template <size_t n_threads, size_t n0, size_t n1>
static void BM_ThreadPool(benchmark::State& state) {
  Eigen::ThreadPool pool(n_threads);

  // Every counter is aligned to the destructive interference size, so
  // neighbouring array elements are not packed next to each other.
  struct Counter {
    alignas(std::hardware_destructive_interference_size)
        std::atomic<int32_t> counter;
  };

  std::array<Counter, n1> n1_cnt;
  for (Counter& c : n1_cnt) c.counter.store(0);

  for (auto _ : state) {
    // One notification per counter: counter `j` notifies once it has received
    // all `n0` increments of the current iteration.
    Eigen::Barrier barrier(n1);

    for (size_t i = 0; i < n0; ++i) {
      for (size_t j = 0; j < n1; ++j) {
        pool.Schedule([j, &n1_cnt, &barrier]() {
          const auto cnt =
              n1_cnt[j].counter.fetch_add(1, std::memory_order_relaxed);
          // Counters are never reset: each iteration adds exactly `n0` to
          // every counter, so a multiple of `n0` marks the last task of the
          // iteration for that counter.
          if ((cnt + 1) % n0 == 0) barrier.Notify();
        });
      }
    }

    // Blocks until all `n1` counters have reported completion.
    barrier.Wait();
  }
}

// Register the benchmark for pools of 8, 16, 32 and 64 threads. For every pool
// size, n0 is fixed at 10 and n1 sweeps 10, 100 and 1000, i.e. 100, 1000 and
// 10000 scheduled tasks per benchmark iteration.
BENCHMARK(BM_ThreadPool</*n_threads=*/8, /*n0=*/10, /*n1=*/10>);
BENCHMARK(BM_ThreadPool</*n_threads=*/8, /*n0=*/10, /*n1=*/100>);
BENCHMARK(BM_ThreadPool</*n_threads=*/8, /*n0=*/10, /*n1=*/1000>);

BENCHMARK(BM_ThreadPool</*n_threads=*/16, /*n0=*/10, /*n1=*/10>);
BENCHMARK(BM_ThreadPool</*n_threads=*/16, /*n0=*/10, /*n1=*/100>);
BENCHMARK(BM_ThreadPool</*n_threads=*/16, /*n0=*/10, /*n1=*/1000>);

BENCHMARK(BM_ThreadPool</*n_threads=*/32, /*n0=*/10, /*n1=*/10>);
BENCHMARK(BM_ThreadPool</*n_threads=*/32, /*n0=*/10, /*n1=*/100>);
BENCHMARK(BM_ThreadPool</*n_threads=*/32, /*n0=*/10, /*n1=*/1000>);

BENCHMARK(BM_ThreadPool</*n_threads=*/64, /*n0=*/10, /*n1=*/10>);
BENCHMARK(BM_ThreadPool</*n_threads=*/64, /*n0=*/10, /*n1=*/100>);
BENCHMARK(BM_ThreadPool</*n_threads=*/64, /*n0=*/10, /*n1=*/1000>);

Benchmark results:

AMD Rome (Zen2) CPU

name                           old cpu/op   new cpu/op   delta
BM_ThreadPool< 8, 10, 10>      1.12ms ± 6%  1.11ms ± 6%    ~     (p=0.095 n=118+119)
BM_ThreadPool< 8, 10, 100>     11.2ms ± 6%  11.2ms ± 6%    ~     (p=0.552 n=117+117)
BM_ThreadPool< 8, 10, 1000>     112ms ± 5%   112ms ± 8%    ~     (p=0.768 n=119+119)
BM_ThreadPool< 16, 10, 10>     1.10ms ± 7%  1.09ms ± 9%  -0.83%  (p=0.026 n=119+120)
BM_ThreadPool< 16, 10, 100>    11.0ms ± 8%  11.0ms ± 8%    ~     (p=0.486 n=120+117)
BM_ThreadPool< 16, 10, 1000>    111ms ± 7%   111ms ± 7%    ~     (p=0.838 n=118+118)
BM_ThreadPool< 32, 10, 10>     1.11ms ± 8%  1.11ms ± 9%    ~     (p=0.262 n=119+119)
BM_ThreadPool< 32, 10, 100>    11.2ms ± 9%  11.1ms ± 9%    ~     (p=0.413 n=120+120)
BM_ThreadPool< 32, 10, 1000>    111ms ± 8%   110ms ± 9%    ~     (p=0.056 n=120+120)
BM_ThreadPool< 64, 10, 10>     1.14ms ± 9%  1.14ms ± 9%    ~     (p=0.247 n=119+118)
BM_ThreadPool< 64, 10, 100>    11.5ms ± 9%  11.3ms ± 9%  -1.85%  (p=0.000 n=120+120)
BM_ThreadPool< 64, 10, 1000>    114ms ±11%   113ms ± 8%  -1.09%  (p=0.002 n=120+114)

name                           old time/op             new time/op             delta
BM_ThreadPool< 8, 10, 10>       314µs ± 7%              311µs ± 8%      ~         (p=0.054 n=118+119)
BM_ThreadPool< 8, 10, 100>     3.14ms ± 8%             3.13ms ± 8%      ~         (p=0.498 n=119+119)
BM_ThreadPool< 8, 10, 1000>    31.4ms ± 5%             31.2ms ± 9%      ~         (p=0.431 n=118+117)
BM_ThreadPool< 16, 10, 10>      307µs ±10%              304µs ±12%    -1.06%      (p=0.042 n=120+120)
BM_ThreadPool< 16, 10, 100>    3.06ms ±12%             3.04ms ±11%      ~         (p=0.142 n=120+119)
BM_ThreadPool< 16, 10, 1000>   30.8ms ±10%             30.7ms ±10%      ~         (p=0.465 n=120+117)
BM_ThreadPool< 32, 10, 10>      307µs ±13%              304µs ±14%      ~         (p=0.244 n=119+120)
BM_ThreadPool< 32, 10, 100>    3.08ms ±12%             3.05ms ±12%      ~         (p=0.146 n=120+120)
BM_ThreadPool< 32, 10, 1000>   30.6ms ±11%             30.1ms ±12%    -1.62%      (p=0.026 n=120+120)
BM_ThreadPool< 64, 10, 10>      311µs ±13%              309µs ±14%      ~         (p=0.331 n=119+120)
BM_ThreadPool< 64, 10, 100>    3.14ms ±14%             3.05ms ±15%    -2.84%      (p=0.000 n=118+120)
BM_ThreadPool< 64, 10, 1000>   31.2ms ±15%             30.7ms ±13%    -1.51%      (p=0.006 n=120+120)

name                           old INSTRUCTIONS/op     new INSTRUCTIONS/op     delta
BM_ThreadPool< 8, 10, 10>       3.18M ± 7%              2.94M ± 8%    -7.73%      (p=0.000 n=119+118)
BM_ThreadPool< 8, 10, 100>      31.4M ± 8%              28.7M ± 9%    -8.68%      (p=0.000 n=118+120)
BM_ThreadPool< 8, 10, 1000>      316M ± 7%               291M ± 9%    -7.67%      (p=0.000 n=118+114)
BM_ThreadPool< 16, 10, 10>      3.22M ±10%              3.04M ±13%    -5.75%      (p=0.000 n=120+120)
BM_ThreadPool< 16, 10, 100>     31.7M ±12%              29.8M ±12%    -5.97%      (p=0.000 n=120+120)
BM_ThreadPool< 16, 10, 1000>     322M ±12%               303M ±12%    -5.84%      (p=0.000 n=120+118)
BM_ThreadPool< 32, 10, 10>      3.16M ±16%              3.02M ±12%    -4.63%      (p=0.000 n=119+119)
BM_ThreadPool< 32, 10, 100>     31.7M ±13%              30.0M ±13%    -5.31%      (p=0.000 n=120+119)
BM_ThreadPool< 32, 10, 1000>     314M ±13%               295M ±14%    -5.90%      (p=0.000 n=120+120)
BM_ThreadPool< 64, 10, 10>      3.61M ±13%              3.42M ±15%    -5.29%      (p=0.000 n=119+118)
BM_ThreadPool< 64, 10, 100>     36.2M ±17%              33.5M ±15%    -7.38%      (p=0.000 n=119+120)
BM_ThreadPool< 64, 10, 1000>     362M ±15%               337M ±14%    -6.67%      (p=0.000 n=118+120)

name                           old CYCLES/op           new CYCLES/op           delta
BM_ThreadPool< 8, 10, 10>       1.59M ± 7%              1.57M ± 9%    -1.35%      (p=0.000 n=117+120)
BM_ThreadPool< 8, 10, 100>      15.9M ± 7%              15.9M ± 8%      ~         (p=0.242 n=116+119)
BM_ThreadPool< 8, 10, 1000>      159M ± 6%               158M ± 9%    -0.93%      (p=0.033 n=118+116)
BM_ThreadPool< 16, 10, 10>      1.59M ± 9%              1.55M ± 9%    -2.90%      (p=0.000 n=118+118)
BM_ThreadPool< 16, 10, 100>     15.9M ± 8%              15.6M ±10%    -2.25%      (p=0.000 n=115+117)
BM_ThreadPool< 16, 10, 1000>     160M ± 9%               157M ± 9%    -1.94%      (p=0.000 n=117+116)
BM_ThreadPool< 32, 10, 10>      1.63M ± 8%              1.58M ± 9%    -2.98%      (p=0.000 n=116+116)
BM_ThreadPool< 32, 10, 100>     16.4M ±11%              15.9M ±10%    -2.92%      (p=0.000 n=118+117)
BM_ThreadPool< 32, 10, 1000>     163M ±10%               157M ±12%    -3.51%      (p=0.000 n=120+120)
BM_ThreadPool< 64, 10, 10>      1.73M ± 8%              1.67M ± 9%    -3.81%      (p=0.000 n=112+117)
BM_ThreadPool< 64, 10, 100>     17.5M ± 9%              16.6M ±10%    -4.94%      (p=0.000 n=115+119)
BM_ThreadPool< 64, 10, 1000>     174M ±10%               166M ± 8%    -4.58%      (p=0.000 n=118+113)

Intel CascadeLake

name                          old cpu/op   new cpu/op   delta
BM_ThreadPool< 8, 10, 10>      670µs ±18%   659µs ±16%    ~     (p=0.060 n=118+117)
BM_ThreadPool< 8, 10, 100>    6.64ms ±17%  6.46ms ±13%  -2.78%  (p=0.000 n=114+105)
BM_ThreadPool< 8, 10, 1000>   67.5ms ±20%  65.8ms ±19%  -2.45%  (p=0.003 n=120+116)
BM_ThreadPool< 16, 10, 10>     678µs ±23%   660µs ±16%  -2.62%  (p=0.004 n=120+115)
BM_ThreadPool< 16, 10, 100>   6.77ms ±19%  6.68ms ±17%    ~     (p=0.095 n=119+119)
BM_ThreadPool< 16, 10, 1000>  67.8ms ±21%  64.7ms ±14%  -4.66%  (p=0.000 n=119+104)
BM_ThreadPool< 32, 10, 10>     684µs ±28%   661µs ±18%  -3.47%  (p=0.001 n=120+114)
BM_ThreadPool< 32, 10, 100>   6.85ms ±22%  6.62ms ±20%  -3.31%  (p=0.000 n=120+117)
BM_ThreadPool< 32, 10, 1000>  68.3ms ±24%  66.6ms ±23%  -2.39%  (p=0.005 n=120+118)
BM_ThreadPool< 64, 10, 10>     712µs ±39%   695µs ±21%    ~     (p=0.102 n=120+119)
BM_ThreadPool< 64, 10, 100>   7.22ms ±41%  6.89ms ±27%  -4.52%  (p=0.039 n=120+114)
BM_ThreadPool< 64, 10, 1000>  69.9ms ±26%  70.4ms ±34%    ~     (p=0.565 n=115+116)

name                          old time/op             new time/op             delta
BM_ThreadPool< 8, 10, 10>      174µs ± 6%              172µs ± 5%   -0.89%      (p=0.002 n=113+116)
BM_ThreadPool< 8, 10, 100>    1.75ms ± 8%             1.72ms ± 6%   -1.56%      (p=0.000 n=117+113)
BM_ThreadPool< 8, 10, 1000>   17.4ms ± 7%             17.1ms ± 5%   -1.82%      (p=0.000 n=116+112)
BM_ThreadPool< 16, 10, 10>     174µs ± 7%              172µs ± 6%   -1.46%      (p=0.000 n=116+115)
BM_ThreadPool< 16, 10, 100>   1.74ms ± 8%             1.72ms ± 6%   -1.53%      (p=0.000 n=118+115)
BM_ThreadPool< 16, 10, 1000>  17.4ms ± 6%             17.1ms ± 8%   -1.49%      (p=0.000 n=119+118)
BM_ThreadPool< 32, 10, 10>     171µs ± 8%              168µs ± 8%   -1.67%      (p=0.000 n=118+117)
BM_ThreadPool< 32, 10, 100>   1.70ms ± 9%             1.68ms ± 8%   -1.67%      (p=0.000 n=119+119)
BM_ThreadPool< 32, 10, 1000>  17.0ms ± 8%             16.7ms ± 5%   -2.16%      (p=0.000 n=119+113)
BM_ThreadPool< 64, 10, 10>     171µs ±14%              166µs ±13%   -3.01%      (p=0.000 n=119+115)
BM_ThreadPool< 64, 10, 100>   1.70ms ±15%             1.65ms ±12%   -3.10%      (p=0.000 n=117+114)
BM_ThreadPool< 64, 10, 1000>  16.9ms ±14%             16.5ms ±14%   -2.47%      (p=0.000 n=117+113)

name                          old INSTRUCTIONS/op     new INSTRUCTIONS/op     delta
BM_ThreadPool< 8, 10, 10>      1.59M ± 7%              1.28M ± 4%  -19.24%      (p=0.000 n=114+111)
BM_ThreadPool< 8, 10, 100>     15.0M ± 6%              12.8M ± 4%  -14.14%      (p=0.000 n=113+111)
BM_ThreadPool< 8, 10, 1000>     160M ± 8%               129M ± 4%  -19.31%      (p=0.000 n=112+112)
BM_ThreadPool< 16, 10, 10>     1.70M ± 6%              1.35M ± 4%  -20.59%      (p=0.000 n=109+108)
BM_ThreadPool< 16, 10, 100>    16.4M ± 5%              13.5M ± 4%  -17.71%      (p=0.000 n=103+105)
BM_ThreadPool< 16, 10, 1000>    172M ± 6%               135M ± 5%  -21.39%      (p=0.000 n=109+107)
BM_ThreadPool< 32, 10, 10>     1.68M ± 6%              1.39M ± 4%  -16.93%      (p=0.000 n=115+104)
BM_ThreadPool< 32, 10, 100>    16.7M ± 6%              13.9M ± 5%  -16.75%      (p=0.000 n=113+102)
BM_ThreadPool< 32, 10, 1000>    169M ± 7%               140M ± 5%  -16.98%      (p=0.000 n=111+104)
BM_ThreadPool< 64, 10, 10>     1.98M ±10%              1.63M ± 7%  -17.62%      (p=0.000 n=120+106)
BM_ThreadPool< 64, 10, 100>    19.7M ±10%              16.4M ± 8%  -17.09%      (p=0.000 n=111+112)
BM_ThreadPool< 64, 10, 1000>    198M ± 9%               163M ± 7%  -17.50%      (p=0.000 n=114+104)

name                          old CYCLES/op           new CYCLES/op           delta
BM_ThreadPool< 8, 10, 10>      1.05M ± 5%              1.06M ± 4%   +0.42%      (p=0.030 n=113+111)
BM_ThreadPool< 8, 10, 100>     10.5M ± 5%              10.6M ± 5%   +1.18%      (p=0.000 n=110+115)
BM_ThreadPool< 8, 10, 1000>     105M ± 5%               105M ± 5%     ~         (p=0.064 n=115+112)
BM_ThreadPool< 16, 10, 10>     1.07M ± 4%              1.07M ± 4%     ~         (p=0.795 n=109+112)
BM_ThreadPool< 16, 10, 100>    10.7M ± 6%              10.7M ± 5%     ~         (p=0.273 n=113+114)
BM_ThreadPool< 16, 10, 1000>    107M ± 5%               106M ± 5%   -0.49%      (p=0.029 n=115+112)
BM_ThreadPool< 32, 10, 10>     1.10M ± 5%              1.09M ± 5%   -0.65%      (p=0.016 n=110+112)
BM_ThreadPool< 32, 10, 100>    11.0M ± 5%              10.9M ± 5%   -1.10%      (p=0.000 n=115+111)
BM_ThreadPool< 32, 10, 1000>    110M ± 6%               108M ± 6%   -1.08%      (p=0.000 n=116+110)
BM_ThreadPool< 64, 10, 10>     1.18M ± 5%              1.20M ± 4%   +1.26%      (p=0.000 n=106+109)
BM_ThreadPool< 64, 10, 100>    11.8M ± 5%              11.9M ± 4%   +0.89%      (p=0.000 n=103+108)
BM_ThreadPool< 64, 10, 1000>    118M ± 6%               119M ± 6%   +0.66%      (p=0.012 n=112+109)
Edited by Eugene Zhulenev

Merge request reports

Loading