Commit 58141506 authored by Ondrej Mosnáček's avatar Ondrej Mosnáček Committed by Ondrej Mosnáček

[CUDA,OpenCL] Use RAM buffer for in/out blocks

This allows the CPU pre-/post-processing of a password batch to
be done in parallel with the GPU computation. This means we can now
assume the BLAKE2 computation cost to be hidden behind the GPU
computation time (for real).

This only adds the overhead of copying the data from/to the RAM buffer
to the GPU computation time, but this is fast thanks to the rectangular
copy operations that are used. This should significantly affect only
hashes with low cost parameters. For these the benchmark tool was
reporting too optimistic times before this commit.
parent cb20c9f1
Pipeline #52558231 passed with stages
in 58 minutes and 1 second
......@@ -5,6 +5,7 @@
#include <cuda_runtime.h>
#include <cstdint>
#include <memory>
/* workaround weird CMake/CUDA bug: */
#ifdef argon2
......@@ -23,11 +24,17 @@ private:
bool bySegment;
bool precompute;
cudaEvent_t start, end;
cudaEvent_t start, end, kernelStart, kernelEnd;
cudaStream_t stream;
void *memory;
void *refs;
std::unique_ptr<std::uint8_t[]> blocksIn;
std::unique_ptr<std::uint8_t[]> blocksOut;
void copyInputBlocks();
void copyOutputBlocks();
void precomputeRefs();
void runKernelSegment(std::uint32_t lanesPerBlock,
......@@ -51,8 +58,8 @@ public:
bool bySegment, bool precompute);
~KernelRunner();
void writeInputMemory(std::size_t jobId, const void *buffer);
void readOutputMemory(std::size_t jobId, void *buffer);
void *getInputMemory(std::size_t jobId) const;
const void *getOutputMemory(std::size_t jobId) const;
void run(std::uint32_t lanesPerBlock, std::uint32_t jobsPerBlock);
float finish();
......
......@@ -31,7 +31,11 @@ public:
const Device *device, std::size_t batchSize,
bool bySegment = true, bool precomputeRefs = false);
/* You can safely call this function after the beginProcessing() call to
* prepare the next batch: */
void setPassword(std::size_t index, const void *pw, std::size_t pwSize);
/* You can safely call this function after the beginProcessing() call to
* process the previous batch: */
void getHash(std::size_t index, void *hash);
void beginProcessing();
......
......@@ -4,6 +4,8 @@
#include "programcontext.h"
#include "argon2-gpu-common/argon2params.h"
#include <memory>
namespace argon2 {
namespace opencl {
......@@ -20,10 +22,16 @@ private:
cl::CommandQueue queue;
cl::Kernel kernel;
cl::Buffer memoryBuffer, refsBuffer;
cl::Event start, end;
cl::Event start, end, kernelStart, kernelEnd;
std::size_t memorySize;
std::unique_ptr<std::uint8_t[]> blocksIn;
std::unique_ptr<std::uint8_t[]> blocksOut;
void copyInputBlocks();
void copyOutputBlocks();
void precomputeRefs();
public:
......@@ -38,16 +46,21 @@ public:
std::size_t getBatchSize() const { return batchSize; }
/* Host-side pointer to job `jobId`'s slot in the input staging buffer
 * (blocksIn). Each job's slot is lanes * 2 * ARGON2_BLOCK_SIZE bytes —
 * the two initial blocks per lane that the caller fills (via
 * fillFirstBlocks) before the next run(). */
void *getInputMemory(std::uint32_t jobId) const
{
std::size_t copySize = params->getLanes() * 2 * ARGON2_BLOCK_SIZE;
return blocksIn.get() + jobId * copySize;
}
/* Host-side pointer to job `jobId`'s slot in the output staging buffer
 * (blocksOut). Each job's slot is lanes * ARGON2_BLOCK_SIZE bytes — the
 * final block of each lane, copied back from the device by run(); read
 * it (e.g. for finalize) only after the batch has finished. */
const void *getOutputMemory(std::uint32_t jobId) const
{
std::size_t copySize = params->getLanes() * ARGON2_BLOCK_SIZE;
return blocksOut.get() + jobId * copySize;
}
KernelRunner(const ProgramContext *programContext,
const Argon2Params *params, const Device *device,
std::size_t batchSize, bool bySegment, bool precompute);
void *mapInputMemory(std::size_t jobId);
void unmapInputMemory(void *memory);
void *mapOutputMemory(std::size_t jobId);
void unmapOutputMemory(void *memory);
void run(std::uint32_t lanesPerBlock, std::uint32_t jobsPerBlock);
float finish();
};
......
......@@ -27,7 +27,11 @@ public:
const Device *device, std::size_t batchSize,
bool bySegment = true, bool precomputeRefs = false);
/* You can safely call this function after the beginProcessing() call to
* prepare the next batch: */
void setPassword(std::size_t index, const void *pw, std::size_t pwSize);
/* You can safely call this function after the beginProcessing() call to
* process the previous batch: */
void getHash(std::size_t index, void *hash);
void beginProcessing();
......
......@@ -802,8 +802,10 @@ KernelRunner::KernelRunner(uint32_t type, uint32_t version, uint32_t passes,
size_t batchSize, bool bySegment, bool precompute)
: type(type), version(version), passes(passes), lanes(lanes),
segmentBlocks(segmentBlocks), batchSize(batchSize), bySegment(bySegment),
precompute(precompute), stream(nullptr), memory(nullptr),
refs(nullptr), start(nullptr), end(nullptr)
precompute(precompute), stream(), memory(), refs(),
start(), end(), kernelStart(), kernelEnd(),
blocksIn(new uint8_t[batchSize * lanes * 2 * ARGON2_BLOCK_SIZE]),
blocksOut(new uint8_t[batchSize * lanes * ARGON2_BLOCK_SIZE])
{
// FIXME: check overflow:
size_t memorySize = batchSize * lanes * segmentBlocks
......@@ -818,6 +820,8 @@ KernelRunner::KernelRunner(uint32_t type, uint32_t version, uint32_t passes,
CudaException::check(cudaEventCreate(&start));
CudaException::check(cudaEventCreate(&end));
CudaException::check(cudaEventCreate(&kernelStart));
CudaException::check(cudaEventCreate(&kernelEnd));
CudaException::check(cudaStreamCreate(&stream));
......@@ -886,28 +890,42 @@ KernelRunner::~KernelRunner()
}
}
void KernelRunner::writeInputMemory(size_t jobId, const void *buffer)
void *KernelRunner::getInputMemory(size_t jobId) const
{
std::size_t memorySize = static_cast<size_t>(lanes) * segmentBlocks
size_t copySize = lanes * 2 * ARGON2_BLOCK_SIZE;
return blocksIn.get() + jobId * copySize;
}
const void *KernelRunner::getOutputMemory(size_t jobId) const
{
size_t copySize = lanes * ARGON2_BLOCK_SIZE;
return blocksOut.get() + jobId * copySize;
}
void KernelRunner::copyInputBlocks()
{
size_t jobSize = static_cast<size_t>(lanes) * segmentBlocks
* ARGON2_SYNC_POINTS * ARGON2_BLOCK_SIZE;
std::size_t size = static_cast<size_t>(lanes) * 2 * ARGON2_BLOCK_SIZE;
std::size_t offset = memorySize * jobId;
auto mem = static_cast<uint8_t *>(memory) + offset;
CudaException::check(cudaMemcpyAsync(mem, buffer, size,
cudaMemcpyHostToDevice, stream));
CudaException::check(cudaStreamSynchronize(stream));
size_t copySize = lanes * 2 * ARGON2_BLOCK_SIZE;
CudaException::check(cudaMemcpy2DAsync(
memory, jobSize,
blocksIn.get(), copySize,
copySize, batchSize, cudaMemcpyHostToDevice,
stream));
}
void KernelRunner::readOutputMemory(size_t jobId, void *buffer)
void KernelRunner::copyOutputBlocks()
{
std::size_t memorySize = static_cast<size_t>(lanes) * segmentBlocks
size_t jobSize = static_cast<size_t>(lanes) * segmentBlocks
* ARGON2_SYNC_POINTS * ARGON2_BLOCK_SIZE;
std::size_t size = static_cast<size_t>(lanes) * ARGON2_BLOCK_SIZE;
std::size_t offset = memorySize * (jobId + 1) - size;
auto mem = static_cast<uint8_t *>(memory) + offset;
CudaException::check(cudaMemcpyAsync(buffer, mem, size,
cudaMemcpyDeviceToHost, stream));
CudaException::check(cudaStreamSynchronize(stream));
size_t copySize = lanes * ARGON2_BLOCK_SIZE;
uint8_t *mem = static_cast<uint8_t *>(memory);
CudaException::check(cudaMemcpy2DAsync(
blocksOut.get(), copySize,
mem + (jobSize - copySize), jobSize,
copySize, batchSize, cudaMemcpyDeviceToHost,
stream));
}
void KernelRunner::runKernelSegment(uint32_t lanesPerBlock,
......@@ -1075,6 +1093,10 @@ void KernelRunner::run(uint32_t lanesPerBlock, uint32_t jobsPerBlock)
{
CudaException::check(cudaEventRecord(start, stream));
copyInputBlocks();
CudaException::check(cudaEventRecord(kernelStart, stream));
if (bySegment) {
for (uint32_t pass = 0; pass < passes; pass++) {
for (uint32_t slice = 0; slice < ARGON2_SYNC_POINTS; slice++) {
......@@ -1087,15 +1109,27 @@ void KernelRunner::run(uint32_t lanesPerBlock, uint32_t jobsPerBlock)
CudaException::check(cudaGetLastError());
CudaException::check(cudaEventRecord(kernelEnd, stream));
copyOutputBlocks();
CudaException::check(cudaEventRecord(end, stream));
}
float KernelRunner::finish()
{
float time = 0.0;
CudaException::check(cudaStreamSynchronize(stream));
float time = 0.0;
CudaException::check(cudaEventElapsedTime(&time, start, end));
#ifndef NDEBUG
CudaException::check(cudaEventElapsedTime(&time, start, kernelStart));
std::cerr << "[INFO] Copy to device took " << time << " ms." << std::endl;
CudaException::check(cudaEventElapsedTime(&time, kernelEnd, end));
std::cerr << "[INFO] Copy from device took " << time << " ms." << std::endl;
#endif
CudaException::check(cudaEventElapsedTime(&time, kernelStart, kernelEnd));
return time;
}
......
......@@ -125,20 +125,14 @@ ProcessingUnit::ProcessingUnit(
void ProcessingUnit::setPassword(std::size_t index, const void *pw,
std::size_t pwSize)
{
std::size_t size = params->getLanes() * 2 * ARGON2_BLOCK_SIZE;
auto buffer = std::unique_ptr<uint8_t[]>(new uint8_t[size]);
params->fillFirstBlocks(buffer.get(), pw, pwSize,
params->fillFirstBlocks(runner.getInputMemory(index), pw, pwSize,
programContext->getArgon2Type(),
programContext->getArgon2Version());
runner.writeInputMemory(index, buffer.get());
}
void ProcessingUnit::getHash(std::size_t index, void *hash)
{
std::size_t size = params->getLanes() * ARGON2_BLOCK_SIZE;
auto buffer = std::unique_ptr<uint8_t[]>(new uint8_t[size]);
runner.readOutputMemory(index, buffer.get());
params->finalize(hash, buffer.get());
params->finalize(hash, runner.getOutputMemory(index));
}
void ProcessingUnit::beginProcessing()
......
......@@ -15,12 +15,31 @@ enum {
ARGON2_REFS_PER_BLOCK = ARGON2_BLOCK_SIZE / (2 * sizeof(cl_uint)),
};
/* Elapsed time in milliseconds between two profiled OpenCL events.
 * NOTE(review): both events are queried with CL_PROFILING_COMMAND_END,
 * so the interval measured runs from the *completion* of `start` to the
 * *completion* of `end` (matching the pre-existing finish() timing
 * pattern) — confirm this is the intended measurement window. */
static float getDurationInMs(const cl::Event &start, const cl::Event &end)
{
cl_ulong nsStart = start.getProfilingInfo<CL_PROFILING_COMMAND_END>();
cl_ulong nsEnd = end.getProfilingInfo<CL_PROFILING_COMMAND_END>();
/* nanoseconds -> milliseconds */
return (nsEnd - nsStart) / (1000.0F * 1000.0F);
}
/* Convenience constructor for a 3-component cl::size_t<3>, used to
 * build the origin/region arguments of the rectangular buffer copies
 * (enqueueReadBufferRect / enqueueWriteBufferRect). */
static cl::size_t<3> makeSize3(std::size_t x, std::size_t y, std::size_t z)
{
cl::size_t<3> dims;
dims[0] = x;
dims[1] = y;
dims[2] = z;
return dims;
}
KernelRunner::KernelRunner(const ProgramContext *programContext,
const Argon2Params *params, const Device *device,
std::size_t batchSize, bool bySegment, bool precompute)
: programContext(programContext), params(params), batchSize(batchSize),
bySegment(bySegment), precompute(precompute),
memorySize(params->getMemorySize() * batchSize)
memorySize(params->getMemorySize() * batchSize),
blocksIn(new std::uint8_t[batchSize * params->getLanes() * 2 * ARGON2_BLOCK_SIZE]),
blocksOut(new std::uint8_t[batchSize * params->getLanes() * ARGON2_BLOCK_SIZE])
{
auto context = programContext->getContext();
std::uint32_t passes = params->getTimeCost();
......@@ -110,32 +129,27 @@ void KernelRunner::precomputeRefs()
queue.finish();
}
void *KernelRunner::mapInputMemory(std::size_t jobId)
void KernelRunner::copyInputBlocks()
{
std::size_t memorySize = params->getMemorySize();
std::size_t mappedSize = params->getLanes() * 2 * ARGON2_BLOCK_SIZE;
return queue.enqueueMapBuffer(memoryBuffer, true, CL_MAP_WRITE,
memorySize * jobId, mappedSize);
}
std::size_t jobSize = params->getMemorySize();
std::size_t copySize = params->getLanes() * 2 * ARGON2_BLOCK_SIZE;
void KernelRunner::unmapInputMemory(void *memory)
{
queue.enqueueUnmapMemObject(memoryBuffer, memory);
queue.enqueueWriteBufferRect(memoryBuffer, false,
makeSize3(0, 0, 0), makeSize3(0, 0, 0),
makeSize3(copySize, batchSize, 1),
jobSize, 0, copySize, 0, blocksIn.get());
}
void *KernelRunner::mapOutputMemory(std::size_t jobId)
void KernelRunner::copyOutputBlocks()
{
std::size_t memorySize = params->getMemorySize();
std::size_t mappedSize = static_cast<std::size_t>(params->getLanes())
* ARGON2_BLOCK_SIZE;
std::size_t mappedOffset = memorySize * (jobId + 1) - mappedSize;
return queue.enqueueMapBuffer(memoryBuffer, true, CL_MAP_READ,
mappedOffset, mappedSize);
}
void KernelRunner::unmapOutputMemory(void *memory)
{
queue.enqueueUnmapMemObject(memoryBuffer, memory);
std::size_t jobSize = params->getMemorySize();
std::size_t copySize = params->getLanes() * ARGON2_BLOCK_SIZE;
queue.enqueueReadBufferRect(memoryBuffer, false,
makeSize3(jobSize - copySize, 0, 0),
makeSize3(0, 0, 0),
makeSize3(copySize, batchSize, 1),
jobSize, 0, copySize, 0, blocksOut.get());
}
void KernelRunner::run(std::uint32_t lanesPerBlock, std::uint32_t jobsPerBlock)
......@@ -162,6 +176,10 @@ void KernelRunner::run(std::uint32_t lanesPerBlock, std::uint32_t jobsPerBlock)
queue.enqueueMarker(&start);
copyInputBlocks();
queue.enqueueMarker(&kernelStart);
std::size_t shmemSize = THREADS_PER_LANE * lanesPerBlock * jobsPerBlock
* sizeof(cl_uint) * 2;
kernel.setArg<cl::LocalSpaceArg>(0, { shmemSize });
......@@ -179,6 +197,10 @@ void KernelRunner::run(std::uint32_t lanesPerBlock, std::uint32_t jobsPerBlock)
globalRange, localRange);
}
queue.enqueueMarker(&kernelEnd);
copyOutputBlocks();
queue.enqueueMarker(&end);
}
......@@ -186,10 +208,15 @@ float KernelRunner::finish()
{
end.wait();
cl_ulong nsStart = start.getProfilingInfo<CL_PROFILING_COMMAND_END>();
cl_ulong nsEnd = end.getProfilingInfo<CL_PROFILING_COMMAND_END>();
#ifndef NDEBUG
std::cerr << "[INFO] Copy to device took "
<< getDurationInMs(start, kernelStart) << " ms." << std::endl;
return (nsEnd - nsStart) / (1000.0F * 1000.0F);
std::cerr << "[INFO] Copy from device took "
<< getDurationInMs(kernelEnd, end) << " ms." << std::endl;
#endif
return getDurationInMs(start, end);
}
} // namespace opencl
......
......@@ -110,18 +110,16 @@ ProcessingUnit::ProcessingUnit(
void ProcessingUnit::setPassword(std::size_t index, const void *pw,
std::size_t pwSize)
{
void *memory = runner.mapInputMemory(index);
void *memory = runner.getInputMemory(index);
params->fillFirstBlocks(memory, pw, pwSize,
programContext->getArgon2Type(),
programContext->getArgon2Version());
runner.unmapInputMemory(memory);
}
void ProcessingUnit::getHash(std::size_t index, void *hash)
{
void *memory = runner.mapOutputMemory(index);
const void *memory = runner.getOutputMemory(index);
params->finalize(hash, memory);
runner.unmapOutputMemory(memory);
}
void ProcessingUnit::beginProcessing()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment