Commit 8564d218 authored by Christoph Conrads's avatar Christoph Conrads

Clean up RANLUX code

* have exactly one AWC/SWB-based generator for each word size
* use SWB(2^64,62,3) instead of SWB(2^64,4,26) because I know now that
  the period is at least 2^165
* add manually optimized RANLUX8 variants
* only use 128-bit integers if available
parent 655aa420
......@@ -9,13 +9,15 @@ CXXFLAGS := -O3 -march=native -DNDEBUG
.PHONY: run
all: run
run: benchmark
time -p stdbuf --output=L -- ./benchmark | tee benchmark.log
all: benchmark rlxt-vs-std
benchmark: benchmark.cpp random-number-engine.hpp
$(CXX) -Wextra -Wall -std=c++11 -pedantic \
$(CXXFLAGS) \
$< -o $@
rlxt-vs-std: rlxt-vs-std.cpp random-number-engine.hpp
$(CXX) -Wextra -Wall -std=c++11 -pedantic \
$(CXXFLAGS) \
$< -o $@
......@@ -15,7 +15,7 @@
#include <type_traits>
using namespace rademacher_fpl;
namespace rlxt = ranlux_tools;
......@@ -55,9 +55,11 @@ dummy_engine::result_type dummy_engine::operator() () { return state_++; }
std::string get_name(const dummy_engine&) { return "dummy-prng"; }
std::string get_name(const std::mt19937&) { return "mt19937"; }
std::string get_name(const std::mt19937_64&) { return "mt19937_64"; }
std::string get_name(const rademacher_fpl::xoshiro128plus&)
std::string get_name(const std::mt19937&) { return "std::mt19937"; }
std::string get_name(const std::mt19937_64&) { return "std::mt19937_64"; }
std::string get_name(const std::ranlux24&) { return "std::ranlux24"; }
std::string get_name(const ranlux_tools::xoshiro128plus&)
{
return "xoshiro128+";
}
......@@ -65,11 +67,10 @@ std::string get_name(const rademacher_fpl::xoshiro128plus&)
template<typename T, std::size_t W, std::size_t S, std::size_t R>
std::string get_name(
const add_with_carry_engine<T, W, S, R>&
std::string get_name(const ranlux_tools::add_with_carry_engine<T, W, S, R>&
)
{
constexpr auto FORMAT = "AWC(2^%zu, %2zu, %2zu)";
constexpr auto FORMAT = "AWC(2^%-2zu, %2zu, %1zu)";
char buffer[80] = { 0 };
snprintf(buffer, sizeof(buffer), FORMAT, W, R, S);
......@@ -79,26 +80,14 @@ std::string get_name(
template<typename T, std::size_t W, std::size_t P, std::size_t Q>
std::string get_name(const subtract_with_borrow_engine<T, W, P, Q>&)
{
constexpr auto FORMAT = "SWB(2^%zu, %2zu, %2zu)";
char buffer[80] = { 0 };
snprintf(buffer, sizeof(buffer), FORMAT, W, P, Q);
return buffer;
}
template<typename T, std::size_t W, std::size_t S, std::size_t R>
std::string get_name(
const std::subtract_with_carry_engine<T, W, S, R>&
const ranlux_tools::subtract_with_borrow_engine<T, W, P, Q>&
)
{
constexpr auto FORMAT = "SWC(2^%zu, %2zu, %2zu)";
constexpr auto FORMAT = "SWB(2^%-2zu, %2zu, %1zu)";
char buffer[80] = { 0 };
snprintf(buffer, sizeof(buffer), FORMAT, W, S, R);
snprintf(buffer, sizeof(buffer), FORMAT, W, P, Q);
return buffer;
}
......@@ -109,7 +98,7 @@ std::string get_name(const std::discard_block_engine<Generator, P, Q>& gen)
{
auto name = get_name(gen.base());
constexpr auto FORMAT = "[%3zu,%zu]";
constexpr auto FORMAT = "[%4zu,%2zu]";
char buffer[80] = { 0 };
snprintf(buffer, sizeof(buffer), FORMAT, P, Q);
......@@ -141,7 +130,6 @@ void run(std::uintmax_t num_draws)
Generator::max() == std::numeric_limits<std::uint8_t>::max() ? 1u : 0u
;
auto gen = Generator();
auto t_0 = get_cpu_time();
......@@ -162,32 +150,29 @@ void run(std::uintmax_t num_draws)
}
// The C++11 standard library ranlux has only luxury level 3, meaning it
// discards considerably fewer values than theoretically required.
using ranlux24 =
std::discard_block_engine<std::ranlux24_base, 389u, 24u>;
int main()
{
constexpr auto num_draws = std::uintmax_t{1000} * 1000u * 1000u;
std::printf(
"%-25s | %10s | %20s | %s\n",
"%-28s | %10s | %20s | %s\n",
"generator", "time(sec)", "throughput(byte/sec)", "dummy"
);
run<ranlux16_awc_base>(num_draws);
run<std::ranlux24_base>(num_draws);
run<ranlux32_awc_base>(num_draws);
run<ranlux64_swc_base>(num_draws);
run<fast_ranlux16_awc>(num_draws);
run<fast_ranlux32_awc>(num_draws);
run<fast_ranlux64_swc>(num_draws);
run<ranlux16_awc>(num_draws);
run<ranlux24>(num_draws);
run<ranlux32_awc>(num_draws);
run<ranlux64_swc>(num_draws);
run<rlxt::ranlux8_base>(num_draws);
run<rlxt::ranlux16_base>(num_draws);
run<rlxt::ranlux32_base>(num_draws);
run<rlxt::ranlux64_base>(num_draws);
run<rlxt::fast_ranlux8>(num_draws);
run<rlxt::fast_ranlux16>(num_draws);
run<rlxt::fast_ranlux32>(num_draws);
run<rlxt::fast_ranlux64>(num_draws);
run<rlxt::ranlux8>(num_draws);
run<rlxt::ranlux16>(num_draws);
run<rlxt::ranlux32>(num_draws);
run<rlxt::ranlux64>(num_draws);
run<std::ranlux24>(num_draws);
run<std::mt19937>(num_draws);
run<xoshiro128plus>(num_draws);
run<std::mt19937_64>(num_draws);
run<rlxt::xoshiro128plus>(num_draws);
}
......@@ -14,14 +14,44 @@
#include <cstring>
#include <type_traits>
#if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ == 16
#define RANLUX_TOOLS_HAS_INT128 1
#else
#define RANLUX_TOOLS_HAS_INT128 0
#endif
namespace rademacher_fpl {
namespace ranlux_tools {
namespace impl_random_number_engine
{
/**
 * Rotate the 32-bit value `x` left by `k` bit positions.
 *
 * Both shift counts are reduced modulo 32. Without the masks, `k == 0`
 * would evaluate `x >> 32`, and shifting a 32-bit operand by its full width
 * is undefined behavior. For `k` in 1..31 the result is the same as before.
 *
 * @param x value to rotate
 * @param k rotation distance in bits (taken modulo 32)
 * @return `x` rotated left by `k mod 32` bits
 */
constexpr std::uint32_t rotl(std::uint32_t x, unsigned k)
{
    return (x << (k & 31u)) | (x >> ((32u - k) & 31u));
}
// big_integer_t<w> is an unsigned integer type twice as wide as a w-bit word.
// The engines use it to compute the sum or difference of two words plus the
// carry without losing the overflow bit.
#if RANLUX_TOOLS_HAS_INT128
// `unsigned __int128` is a compiler extension; silence -Wpedantic around it.
#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
template<std::size_t w>
using big_integer_t = typename std::conditional<
    w == 16u, std::uint32_t, typename std::conditional<
    w == 32u, std::uint64_t, typename std::conditional<
    w == 64u, unsigned __int128, void
>::type>::type>::type;
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif
#else
// No 128-bit integer type is available, so it must not be named here.
// A word size of 64 maps to `void` like every other unsupported size.
template<std::size_t w>
using big_integer_t = typename std::conditional<
    w == 16u, std::uint32_t, typename std::conditional<
    w == 32u, std::uint64_t, void
>::type>::type;
#endif
}
/**
......@@ -147,18 +177,14 @@ struct add_with_carry_engine
T operator() ()
{
using UInt = typename std::conditional<
w == 16u, std::uint32_t, typename std::conditional<
w == 32u, std::uint64_t, typename std::conditional<
w == 64u, unsigned __int128, void
>::type>::type>::type;
using BigInt = impl_random_number_engine::big_integer_t<w>;
auto i = index_;
auto j = index_ >= s ? index_ - s : index_ + r - s;
assert(carry_ == 0 or carry_ == 1);
auto x = UInt{xs_[i]} + xs_[j] + carry_;
auto x = BigInt{xs_[i]} + xs_[j] + carry_;
xs_[i] = x;
carry_ = x >> w;
......@@ -315,11 +341,7 @@ struct subtract_with_borrow_engine
T operator() ()
{
using UInt = typename std::conditional<
w == 16u, std::uint32_t, typename std::conditional<
w == 32u, std::uint64_t, typename std::conditional<
w == 64u, unsigned __int128, void
>::type>::type>::type;
using BigInt = impl_random_number_engine::big_integer_t<w>;
auto i = index_;
auto j = index_ >= s ? index_ - s : index_ + r - s;
......@@ -328,7 +350,7 @@ struct subtract_with_borrow_engine
assert(carry_ == 0 or carry_ == 1);
auto x = p > q ? UInt{u} - v - carry_ : UInt{v} - u - carry_;
auto x = p > q ? BigInt{u} - v - carry_ : BigInt{v} - u - carry_;
auto c = -T(x >> w);
xs_[i] = x;
......@@ -373,19 +395,23 @@ struct subtract_with_borrow_engine<std::uint8_t, 8u, 4u, 7u>
static constexpr T min() { return std::numeric_limits<T>::min(); }
explicit subtract_with_borrow_engine(std::uint32_t seed=default_seed)
explicit subtract_with_borrow_engine(std::uint64_t seed=default_seed)
{
auto gen = xoshiro128plus(seed);
auto init = (std::uint64_t{gen()} << 32) | gen();
for(auto& x : xs_)
x = gen();
std::memcpy(xs_, &init, r);
auto carry = xs_[0] == 0 and xs_[1] == 0 ? 1u : 0u;
xs_[r] = carry;
// ensure entering a periodic sequence
discard(r);
}
T operator() ()
result_type operator() ()
{
if(index_ > 0)
{
......@@ -401,26 +427,27 @@ struct subtract_with_borrow_engine<std::uint8_t, 8u, 4u, 7u>
auto y = std::uint32_t{0};
auto p_y = reinterpret_cast<char*>(xs_ + r - s);
static_assert(sizeof(x) == s * sizeof(T), "");
static_assert(sizeof(x) == s * sizeof(result_type), "");
std::memcpy(&x, p_x, sizeof(x));
std::memcpy(&y, p_y, sizeof(y));
auto z = std::uint64_t{y} - x - carry_;
auto carry = std::uint32_t{xs_[r]};
auto z = std::uint64_t{y} - x - carry;
std::memcpy(p_x, &z, sizeof(x));
carry_ = -T(z >> 32);
carry = -std::uint32_t(z >> 32);
y = std::uint32_t(z) & ((std::uint32_t{1} << 24) - 1u);
xs_[r] = 0;
std::memcpy(&x, p_x + s, sizeof(x));
z = std::uint64_t{y} - x - carry_;
z = std::uint64_t{y} - x - carry;
std::memcpy(p_x + s, &z, sizeof(x));
carry_ = -T(z >> 24);
xs_[r] = -std::uint32_t(z >> 24);
index_ = 1u;
return xs_[0];
......@@ -436,74 +463,118 @@ struct subtract_with_borrow_engine<std::uint8_t, 8u, 4u, 7u>
std::size_t index_ = 0;
T carry_ = 0;
T xs_[8] = { 0 };
std::uint8_t xs_[r+1] = { 0 };
};
template<>
struct subtract_with_borrow_engine<std::uint8_t, 8u, 5u, 8u>
{
static constexpr auto w = std::size_t{8};
static constexpr auto r = std::size_t{8};
static constexpr auto s = std::size_t{5};
static constexpr auto long_lag = r;
static constexpr auto short_lag = s;
static constexpr auto word_size = w;
using T = std::uint8_t;
using result_type = T;
static constexpr auto default_seed = std::uint32_t{387853};
static constexpr T max() { return std::numeric_limits<T>::max(); }
static constexpr T min() { return std::numeric_limits<T>::min(); }
/**
 * Seed the engine.
 *
 * The seed is fed into a xoshiro128plus generator. Two of its outputs are
 * combined into one 64-bit value whose bytes become the initial state.
 *
 * @param seed value passed to the auxiliary xoshiro128plus generator
 */
explicit subtract_with_borrow_engine(std::uint64_t seed=default_seed)
{
    auto gen = xoshiro128plus(seed);

    // Draw the two halves in separate statements. The operands of `|` may be
    // evaluated in either order, so writing both gen() calls in a single
    // expression would let the initial state differ between compilers.
    const auto high = std::uint64_t{gen()};
    const auto low = std::uint64_t{gen()};
    const auto init = (high << 32) | low;

    static_assert(sizeof(init) == r, "");
    std::memcpy(xs_, &init, r);

    // Set the carry when all state bytes are zero so that the complete
    // state (words plus carry) is never entirely zero.
    carry_ = init == 0 ? 1u : 0u;

    // ensure entering a periodic sequence
    discard(r);
}
result_type operator() ()
{
if(index_ > 0)
{
auto ret = xs_[index_];
index_ = (index_ + 1u) & 7u;
return ret;
}
// bytes 0..3
auto x = std::uint32_t{0};
auto p_x = reinterpret_cast<char*>(xs_ + 0);
auto y = std::uint32_t{0};
auto p_y = reinterpret_cast<char*>(xs_ + r - s);
std::memcpy(&x, p_x, sizeof(x));
std::memcpy(&y, p_y, sizeof(y));
auto z = std::uint64_t{y} - x - carry_;
std::memcpy(p_x, &z, sizeof(x));
// RANLUX subtract-with-borrow(2^w, s, r)
using ranlux16_base =
std::subtract_with_carry_engine<std::uint16_t, 16u, 3u, 11u>
;
using ranlux32_base =
std::subtract_with_carry_engine<std::uint32_t, 32u, 3u, 17u>
;
using ranlux64_base =
std::subtract_with_carry_engine<std::uint64_t, 64u, 4u, 26u>
;
carry_ = -std::uint32_t(z >> 32);
using ranlux16 = std::discard_block_engine<ranlux16_base, 127u, 11u>;
using ranlux32 = std::discard_block_engine<ranlux32_base, 293u, 17u>;
using ranlux64 = std::discard_block_engine<ranlux64_base, 787u, 26u>;
// bytes 4..7
std::memcpy(&x, p_x + 4, sizeof(x));
std::memcpy(&y, p_x + 0, sizeof(y));
using fast_ranlux16 = std::discard_block_engine<ranlux16_base, 37u, 11u>;
using fast_ranlux32 = std::discard_block_engine<ranlux32_base, 73u, 17u>;
using fast_ranlux64 = std::discard_block_engine<ranlux64_base, 197u, 26u>;
y = (y << 8) | xs_[r-1];
z = std::uint64_t{y} - x - carry_;
std::memcpy(p_x + 4, &z, sizeof(x));
carry_ = -std::uint32_t(z >> 32);
index_ = 1u;
// RANLUX add-with-carry(2^w, r, s)
using ranlux16_awc_base = add_with_carry_engine<std::uint16_t, 16u, 2u, 9u>;
using ranlux32_awc_base = add_with_carry_engine<std::uint32_t, 32u, 3u, 16u>;
using ranlux64_awc_base = add_with_carry_engine<std::uint64_t, 64u, 14u, 25u>;
return xs_[0];
}
using ranlux16_awc = std::discard_block_engine<ranlux16_awc_base, 97u, 9u>;
using ranlux32_awc = std::discard_block_engine<ranlux32_awc_base, 277u, 16u>;
using fast_ranlux16_awc = std::discard_block_engine<ranlux16_awc_base, 23, 9u>;
using fast_ranlux32_awc = std::discard_block_engine<ranlux32_awc_base, 71u, 16u>;
/**
 * Advance the engine by `n` steps.
 *
 * Each step draws one value with operator() and drops it.
 *
 * @param n number of values to skip
 */
void discard(unsigned long long n)
{
    while(n > 0)
    {
        (*this)();
        --n;
    }
}
// RANLUX subtract-with-borrow(2^w, p, q)
using ranlux8_base =
std::subtract_with_carry_engine<std::uint8_t, 8u, 4u, 7u>;
std::size_t index_ = 0;
std::uint8_t xs_[8] = { 0 };
std::uint32_t carry_ = 0;
};
using ranlux8_swc_base =
subtract_with_borrow_engine<std::uint8_t, 8u, 4u, 7u>;
using ranlux16_swb_base =
subtract_with_borrow_engine<std::uint16_t, 16u, 33u, 5u>;
using ranlux32_swb_base =
subtract_with_borrow_engine<std::uint32_t, 32u, 30u, 4u>;
using ranlux64_swb_base =
subtract_with_borrow_engine<std::uint64_t, 64u, 15u, 2u>;
using ranlux8_base = subtract_with_borrow_engine<std::uint8_t, 8u, 5u, 8u>;
using ranlux8 = std::discard_block_engine<ranlux8_base, 67u, 8u>;
using fast_ranlux8 = std::discard_block_engine<ranlux8_base, 17u, 8u>;
using ranlux16_swc_base =
subtract_with_borrow_engine<std::uint16_t, 16u, 3u, 11u>;
using ranlux32_swc_base =
subtract_with_borrow_engine<std::uint32_t, 32u, 3u, 17u>;
using ranlux64_swc_base =
subtract_with_borrow_engine<std::uint64_t, 64u, 4u, 26u>;
using ranlux16_base = add_with_carry_engine<std::uint16_t, 16u, 2u, 9u>;
using ranlux16 = std::discard_block_engine<ranlux16_base, 97u, 9u>;
using fast_ranlux16 = std::discard_block_engine<ranlux16_base, 23u, 9u>;
using ranlux64_swc = std::discard_block_engine<ranlux64_swc_base, 787u, 26u>;
using fast_ranlux64_swc =
std::discard_block_engine<ranlux64_swc_base, 197u, 26u>;
using ranlux32_base = add_with_carry_engine<std::uint32_t, 32u, 3u, 16u>;
using ranlux32 = std::discard_block_engine<ranlux32_base, 277u, 16u>;
using fast_ranlux32 = std::discard_block_engine<ranlux32_base, 71u, 16u>;
/**
* Use this engine if you have no clue about pseudo-random number generators.
*/
using default_engine = ranlux32_awc;
#if RANLUX_TOOLS_HAS_INT128
using ranlux64_base = subtract_with_borrow_engine<std::uint64_t, 64u, 62u, 3u>;
using ranlux64 = std::discard_block_engine<ranlux64_base, 1303u, 62u>;
using fast_ranlux64 = std::discard_block_engine<ranlux64_base, 331u, 62u>;
#endif
}
......
......@@ -11,17 +11,40 @@
// library class `std::subtract_with_carry_engine`.
#include <cassert>
#include "random-number-engine.hpp"
#include <cstdio>
#include <cstdint>
#include <ctime>
#include <limits>
#include <random>
#include "random-number-engine.hpp"
#include <string>
#include <type_traits>
using namespace rademacher_fpl;
namespace rlxt = ranlux_tools;
using ranlux8_base =
std::subtract_with_carry_engine<std::uint8_t, 8u, 4u, 7u>;
using ranlux16_base =
std::subtract_with_carry_engine<std::uint16_t, 16u, 3u, 11u>;
using ranlux32_base =
std::subtract_with_carry_engine<std::uint32_t, 32u, 3u, 17u>;
using ranlux64_base =
std::subtract_with_carry_engine<std::uint64_t, 64u, 4u, 26u>;
using ranlux8_swb47_base =
rlxt::subtract_with_borrow_engine<std::uint8_t, 8u, 4u, 7u>;
using ranlux8_swb58_base =
rlxt::subtract_with_borrow_engine<std::uint8_t, 8u, 5u, 8u>;
using ranlux16_swb_base =
rlxt::subtract_with_borrow_engine<std::uint16_t, 16u, 3u, 11u>;
using ranlux32_swb_base =
rlxt::subtract_with_borrow_engine<std::uint32_t, 32u, 3u, 17u>;
#if RANLUX_TOOLS_HAS_INT128
using ranlux64_swb_base =
rlxt::subtract_with_borrow_engine<std::uint64_t, 64u, 4u, 26u>;
#endif
......@@ -41,10 +64,14 @@ std::uintmax_t get_time_nsec(const timespec& tm)
}
std::string get_name(const std::ranlux24_base&) { return "std::ranlux24_base"; }
template<typename T, std::size_t W, std::size_t P, std::size_t Q>
std::string get_name(const subtract_with_borrow_engine<T, W, P, Q>&)
std::string get_name(const rlxt::subtract_with_borrow_engine<T, W, P, Q>&)
{
constexpr auto FORMAT = "RLX-SWB(2^%zu, %2zu, %2zu)";
constexpr auto FORMAT = "RLX-SWB(2^%-2zu, %1zu, %2zu)";
char buffer[80] = { 0 };
snprintf(buffer, sizeof(buffer), FORMAT, W, P, Q);
......@@ -58,7 +85,7 @@ std::string get_name(
const std::subtract_with_carry_engine<T, W, S, R>&
)
{
constexpr auto FORMAT = "STD-SWC(2^%zu, %2zu, %2zu)";
constexpr auto FORMAT = "STD-SWC(2^%-2zu, %1zu, %2zu)";
char buffer[80] = { 0 };
snprintf(buffer, sizeof(buffer), FORMAT, W, S, R);
......@@ -86,10 +113,10 @@ void run(std::uintmax_t num_draws)
Generator::max() == std::numeric_limits<std::uint64_t>::max() ? 8u :
Generator::max() == std::numeric_limits<std::uint32_t>::max() ? 4u :
Generator::max() == (1u<<24) - 1u ? 3u :
Generator::max() == std::numeric_limits<std::uint16_t>::max() ? 2u : 0u
Generator::max() == std::numeric_limits<std::uint16_t>::max() ? 2u :
Generator::max() == std::numeric_limits<std::uint8_t>::max() ? 1u : 0u
;
auto gen = Generator();
auto t_0 = get_cpu_time();
......@@ -125,10 +152,16 @@ int main()
"generator", "time(sec)", "throughput(byte/sec)", "dummy"
);
run<ranlux8_base>(num_draws);
run<ranlux8_swb47_base>(num_draws);
run<ranlux8_swb58_base>(num_draws);
run<ranlux16_base>(num_draws);
run<ranlux16_swc_base>(num_draws);
run<ranlux16_swb_base>(num_draws);
run<std::ranlux24_base>(num_draws);
run<ranlux32_base>(num_draws);
run<ranlux32_swc_base>(num_draws);
run<ranlux32_swb_base>(num_draws);
#if RANLUX_TOOLS_HAS_INT128
run<ranlux64_base>(num_draws);
run<ranlux64_swc_base>(num_draws);
run<ranlux64_swb_base>(num_draws);
#endif
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment