Commit 2c505e66 authored by Ondrej Mosnáček's avatar Ondrej Mosnáček

Tune both lanes and jobs per block

This commit extends block size tuning to both lanes and jobs (for oneshot
kernel, only jobs per block are tuned). For now, we only tune jobs per block
if lanes per block was tuned to its maximum value.
parent 5af39451
......@@ -28,20 +28,18 @@ private:
void precomputeRefs();
void runKernelSegment(std::uint32_t blockSize,
void runKernelSegment(std::uint32_t lanesPerBlock,
std::uint32_t jobsPerBlock,
std::uint32_t pass, std::uint32_t slice);
void runKernelOneshot(std::uint32_t blockSize);
static uint32_t checkPowerOf2(uint32_t v)
{
return (v & (v - 1)) == 0 ? v : 1;
}
void runKernelOneshot(std::uint32_t lanesPerBlock,
std::uint32_t jobsPerBlock);
public:
std::uint32_t getMaxBlockSize() const
{
return checkPowerOf2(bySegment ? lanes : batchSize);
}
std::uint32_t getMinLanesPerBlock() const { return bySegment ? 1 : lanes; }
std::uint32_t getMaxLanesPerBlock() const { return lanes; }
std::uint32_t getMinJobsPerBlock() const { return 1; }
std::uint32_t getMaxJobsPerBlock() const { return batchSize; }
std::uint32_t getBatchSize() const { return batchSize; }
void *getMemory() const { return memory; }
......@@ -52,7 +50,7 @@ public:
bool bySegment, bool precompute);
~Argon2KernelRunner();
void run(std::uint32_t blockSize);
void run(std::uint32_t lanesPerBlock, std::uint32_t jobsPerBlock);
float finish();
};
......
......@@ -18,7 +18,8 @@ private:
const Device *device;
Argon2KernelRunner runner;
uint32_t bestBlockSize;
uint32_t bestLanesPerBlock;
uint32_t bestJobsPerBlock;
public:
class PasswordWriter
......
......@@ -793,16 +793,22 @@ Argon2KernelRunner::~Argon2KernelRunner()
}
}
void Argon2KernelRunner::runKernelSegment(uint32_t blockSize,
void Argon2KernelRunner::runKernelSegment(uint32_t lanesPerBlock,
uint32_t jobsPerBlock,
uint32_t pass, uint32_t slice)
{
if (blockSize > lanes || lanes % blockSize != 0) {
throw std::logic_error("Invalid blockSize!");
if (lanesPerBlock > lanes || lanes % lanesPerBlock != 0) {
throw std::logic_error("Invalid lanesPerBlock!");
}
if (jobsPerBlock > batchSize || batchSize % jobsPerBlock != 0) {
throw std::logic_error("Invalid jobsPerBlock!");
}
struct block_g *memory_blocks = (struct block_g *)memory;
dim3 blocks = dim3(1, lanes / blockSize, batchSize);
dim3 threads = dim3(THREADS_PER_LANE, blockSize);
dim3 blocks = dim3(1, lanes / lanesPerBlock, batchSize / jobsPerBlock);
dim3 threads = dim3(THREADS_PER_LANE, lanesPerBlock, jobsPerBlock);
uint32_t blockSize = lanesPerBlock * jobsPerBlock;
if (type == ARGON2_I) {
if (precompute) {
uint32_t shared_size = blockSize * ARGON2_BLOCK_SIZE * 2;
......@@ -848,18 +854,24 @@ void Argon2KernelRunner::runKernelSegment(uint32_t blockSize,
}
}
void Argon2KernelRunner::runKernelOneshot(uint32_t blockSize)
void Argon2KernelRunner::runKernelOneshot(uint32_t lanesPerBlock,
uint32_t jobsPerBlock)
{
if (blockSize > batchSize || batchSize % blockSize != 0) {
throw std::logic_error("Invalid blockSize!");
if (lanesPerBlock != lanes) {
throw std::logic_error("Invalid lanesPerBlock!");
}
if (jobsPerBlock > batchSize || batchSize % jobsPerBlock != 0) {
throw std::logic_error("Invalid jobsPerBlock!");
}
struct block_g *memory_blocks = (struct block_g *)memory;
dim3 blocks = dim3(1, 1, batchSize / blockSize);
dim3 threads = dim3(THREADS_PER_LANE, lanes, blockSize);
dim3 blocks = dim3(1, 1, batchSize / jobsPerBlock);
dim3 threads = dim3(THREADS_PER_LANE, lanes, jobsPerBlock);
uint32_t blockSize = lanesPerBlock * jobsPerBlock;
if (type == ARGON2_I) {
if (precompute) {
uint32_t shared_size = lanes * ARGON2_BLOCK_SIZE * 2;
uint32_t shared_size = blockSize * ARGON2_BLOCK_SIZE * 2;
struct ref *refs = (struct ref *)this->refs;
if (version == ARGON2_VERSION_10) {
argon2i_kernel_oneshot_precompute<ARGON2_VERSION_10>
......@@ -871,7 +883,7 @@ void Argon2KernelRunner::runKernelOneshot(uint32_t blockSize)
memory_blocks, refs, passes, lanes, segmentBlocks);
}
} else {
uint32_t shared_size = lanes * ARGON2_BLOCK_SIZE * 3;
uint32_t shared_size = blockSize * ARGON2_BLOCK_SIZE * 3;
if (version == ARGON2_VERSION_10) {
argon2_kernel_oneshot<ARGON2_I, ARGON2_VERSION_10>
<<<blocks, threads, shared_size, stream>>>(
......@@ -883,7 +895,7 @@ void Argon2KernelRunner::runKernelOneshot(uint32_t blockSize)
}
}
} else {
uint32_t shared_size = lanes * ARGON2_BLOCK_SIZE * 2;
uint32_t shared_size = blockSize * ARGON2_BLOCK_SIZE * 2;
if (version == ARGON2_VERSION_10) {
argon2_kernel_oneshot<ARGON2_D, ARGON2_VERSION_10>
<<<blocks, threads, shared_size, stream>>>(
......@@ -896,18 +908,18 @@ void Argon2KernelRunner::runKernelOneshot(uint32_t blockSize)
}
}
void Argon2KernelRunner::run(uint32_t blockSize)
void Argon2KernelRunner::run(uint32_t lanesPerBlock, uint32_t jobsPerBlock)
{
CudaException::check(cudaEventRecord(start, stream));
if (bySegment) {
for (uint32_t pass = 0; pass < passes; pass++) {
for (uint32_t slice = 0; slice < ARGON2_SYNC_POINTS; slice++) {
runKernelSegment(blockSize, pass, slice);
runKernelSegment(lanesPerBlock, jobsPerBlock, pass, slice);
}
}
} else {
runKernelOneshot(blockSize);
runKernelOneshot(lanesPerBlock, jobsPerBlock);
}
CudaException::check(cudaGetLastError());
......
......@@ -20,43 +20,84 @@ ProcessingUnit::ProcessingUnit(
programContext->getArgon2Version(), params->getTimeCost(),
params->getLanes(), params->getSegmentBlocks(), batchSize,
bySegment, precomputeRefs),
bestBlockSize(1)
bestLanesPerBlock(runner.getMinLanesPerBlock()),
bestJobsPerBlock(runner.getMinJobsPerBlock())
{
CudaException::check(cudaSetDevice(device->getDeviceIndex()));
if (runner.getMaxBlockSize() > 1) {
if (runner.getMaxLanesPerBlock() > runner.getMinLanesPerBlock()) {
#ifndef NDEBUG
std::cerr << "[INFO] Benchmarking block size..." << std::endl;
std::cerr << "[INFO] Tuning lanes per block..." << std::endl;
#endif
float bestTime = std::numeric_limits<float>::infinity();
for (std::uint32_t blockSize = 1; blockSize <= runner.getMaxBlockSize();
blockSize *= 2)
for (std::uint32_t lpb = 1; lpb <= runner.getMaxLanesPerBlock();
lpb *= 2)
{
float time;
try {
runner.run(blockSize);
runner.run(lpb, bestJobsPerBlock);
time = runner.finish();
} catch(CudaException &ex) {
#ifndef NDEBUG
std::cerr << "[WARN] Exception on block size " << blockSize
<< ": " << ex.what() << std::endl;
std::cerr << "[WARN] CUDA error on " << lpb
<< "lanes per block: " << ex.what() << std::endl;
#endif
break;
}
#ifndef NDEBUG
std::cerr << "[INFO] Block size " << blockSize << ": "
std::cerr << "[INFO] " << lpb << " lanes per block: "
<< time << " ms" << std::endl;
#endif
if (time < bestTime) {
bestTime = time;
bestBlockSize = blockSize;
bestLanesPerBlock = lpb;
}
}
#ifndef NDEBUG
std::cerr << "[INFO] Picked block size: " << bestBlockSize << std::endl;
std::cerr << "[INFO] Picked " << bestLanesPerBlock
<< " lanes per block." << std::endl;
#endif
}
/* Only tune jobs per block if we hit maximum lanes per block: */
if (bestLanesPerBlock == runner.getMaxLanesPerBlock()
&& runner.getMaxJobsPerBlock() > runner.getMinJobsPerBlock()) {
#ifndef NDEBUG
std::cerr << "[INFO] Tuning jobs per block..." << std::endl;
#endif
float bestTime = std::numeric_limits<float>::infinity();
for (std::uint32_t jpb = 1; jpb <= runner.getMaxJobsPerBlock();
jpb *= 2)
{
float time;
try {
runner.run(bestLanesPerBlock, jpb);
time = runner.finish();
} catch(CudaException &ex) {
#ifndef NDEBUG
std::cerr << "[WARN] CUDA error on " << jpb
<< " jobs per block: " << ex.what() << std::endl;
#endif
break;
}
#ifndef NDEBUG
std::cerr << "[INFO] " << jpb << " jobs per block: "
<< time << " ms" << std::endl;
#endif
if (time < bestTime) {
bestTime = time;
bestJobsPerBlock = jpb;
}
}
#ifndef NDEBUG
std::cerr << "[INFO] Picked " << bestJobsPerBlock
<< " jobs per block." << std::endl;
#endif
}
}
......@@ -115,7 +156,7 @@ const void *ProcessingUnit::HashReader::getHash() const
void ProcessingUnit::beginProcessing()
{
CudaException::check(cudaSetDevice(device->getDeviceIndex()));
runner.run(bestBlockSize);
runner.run(bestLanesPerBlock, bestJobsPerBlock);
}
void ProcessingUnit::endProcessing()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment