Commit 03e96e0d authored by Ondrej Mosnáček's avatar Ondrej Mosnáček

Update OpenCL backend to match the CUDA kernel

parent 5ce99e0d
......@@ -59,6 +59,7 @@ add_library(argon2-opencl SHARED
lib/argon2-opencl/kernelloader.cpp
lib/argon2-opencl/programcontext.cpp
lib/argon2-opencl/processingunit.cpp
lib/argon2-opencl/kernelrunner.cpp
)
target_include_directories(argon2-opencl INTERFACE
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
......@@ -108,6 +109,7 @@ install(FILES
include/argon2-opencl/globalcontext.h
include/argon2-opencl/programcontext.h
include/argon2-opencl/processingunit.h
include/argon2-opencl/kernelrunner.h
include/argon2-cuda/cudaexception.h
include/argon2-cuda/kernels.h
include/argon2-cuda/device.h
......
This diff is collapsed.
#ifndef ARGON2_OPENCL_KERNELRUNNER_H
#define ARGON2_OPENCL_KERNELRUNNER_H
#include "programcontext.h"
#include "argon2-gpu-common/argon2params.h"
namespace argon2 {
namespace opencl {
/* Owns the OpenCL resources needed to run the Argon2 kernels for one
 * fixed (params, batchSize) configuration: the command queue, the compiled
 * kernel object, the main memory buffer (kept host-mapped between runs)
 * and, optionally, the precomputed-refs buffer. */
class KernelRunner
{
private:
    const ProgramContext *programContext;
    const Argon2Params *params;
    std::uint32_t batchSize;
    bool bySegment;       // true: one kernel launch per (pass, slice) segment
    bool precompute;      // true: use *_precompute kernels with a refs buffer
    cl::CommandQueue queue;
    cl::Kernel kernel;
    cl::Buffer memoryBuffer, refsBuffer;
    cl::Event start, end; // profiling events bracketing one run()
    void *memory;         // host pointer to the mapped memoryBuffer
    // NOTE(review): memorySize = getMemorySize() * batchSize could exceed
    // 32 bits for large memory costs — consider std::size_t; confirm against
    // the CUDA backend's type.
    std::uint32_t memorySize;

    void precomputeRefs();

public:
    /* Oneshot kernels require the local work-group to cover all lanes;
     * segment kernels can subdivide. */
    std::uint32_t getMinLanesPerBlock() const
    {
        return bySegment ? 1 : params->getLanes();
    }
    std::uint32_t getMaxLanesPerBlock() const { return params->getLanes(); }

    std::uint32_t getMinJobsPerBlock() const { return 1; }
    std::uint32_t getMaxJobsPerBlock() const { return batchSize; }

    std::uint32_t getBatchSize() const { return batchSize; }

    /* Valid only while the buffer is mapped, i.e. outside run()/finish(). */
    void *getMemory() const { return memory; }

    KernelRunner(const ProgramContext *programContext,
                 const Argon2Params *params, const Device *device,
                 std::uint32_t batchSize, bool bySegment, bool precompute);

    /* Enqueues the kernel(s); asynchronous. Call finish() to wait. */
    void run(std::uint32_t lanesPerBlock, std::uint32_t jobsPerBlock);
    /* Waits for completion; returns elapsed time in milliseconds. */
    float finish();
};
} // namespace opencl
} // namespace argon2
#endif // ARGON2_OPENCL_KERNELRUNNER_H
......@@ -3,8 +3,7 @@
#include <memory>
#include "programcontext.h"
#include "argon2-gpu-common/argon2params.h"
#include "kernelrunner.h"
namespace argon2 {
namespace opencl {
......@@ -16,19 +15,9 @@ private:
const Argon2Params *params;
const Device *device;
std::size_t batchSize;
std::size_t memorySize;
bool bySegment;
cl::CommandQueue cmdQueue;
cl::Buffer memoryBuffer;
cl::Buffer debugBuffer;
void *mappedMemoryBuffer;
cl::Kernel kernel;
cl::Event event;
KernelRunner runner;
std::uint32_t bestLanesPerBlock;
std::uint32_t bestJobsPerBlock;
public:
class PasswordWriter
......@@ -64,7 +53,7 @@ public:
const void *getHash() const;
};
std::size_t getBatchSize() const { return batchSize; }
std::size_t getBatchSize() const { return runner.getBatchSize(); }
ProcessingUnit(
const ProgramContext *programContext, const Argon2Params *params,
......
#include "kernelrunner.h"
#include <stdexcept>
#ifndef NDEBUG
#include <iostream>
#endif
#define THREADS_PER_LANE 32
namespace argon2 {
namespace opencl {
enum {
    /* How many precomputed refs fit in one Argon2 block; each ref is stored
     * as two cl_uint values (presumably ref lane + ref index — matches the
     * sizeof(cl_uint) * 2 sizing of refsBuffer below). */
    ARGON2_REFS_PER_BLOCK = ARGON2_BLOCK_SIZE / (2 * sizeof(cl_uint)),
};
/* Sets up the queue, buffers and kernel for a fixed configuration.
 * The memory buffer is immediately mapped for writing so the caller can
 * fill the first blocks via getMemory() before the first run(). */
KernelRunner::KernelRunner(const ProgramContext *programContext,
        const Argon2Params *params, const Device *device,
        std::uint32_t batchSize, bool bySegment, bool precompute)
    : programContext(programContext), params(params), batchSize(batchSize),
      bySegment(bySegment), precompute(precompute),
      memorySize(params->getMemorySize() * batchSize)
{
    auto context = programContext->getContext();
    std::uint32_t passes = params->getTimeCost();
    std::uint32_t lanes = params->getLanes();
    std::uint32_t segmentBlocks = params->getSegmentBlocks();

    /* Profiling is enabled so finish() can report elapsed time. */
    queue = cl::CommandQueue(context, device->getCLDevice(),
                             CL_QUEUE_PROFILING_ENABLE);

    memoryBuffer = cl::Buffer(context, CL_MEM_READ_WRITE, memorySize);
    /* Blocking map: memory is valid as soon as the constructor returns. */
    memory = queue.enqueueMapBuffer(memoryBuffer, true, CL_MAP_WRITE, 0,
                                    memorySize);

    Type type = programContext->getArgon2Type();
    if ((type == ARGON2_I || type == ARGON2_ID) && precompute) {
        /* Argon2id is data-independent only for the first half of the first
         * pass, so it needs refs for half the sync points of one pass;
         * Argon2i needs them for every segment of every pass. */
        uint32_t segments =
                type == ARGON2_ID
                ? lanes * (ARGON2_SYNC_POINTS / 2)
                : passes * lanes * ARGON2_SYNC_POINTS;

        /* two cl_uints per (segment block) ref */
        std::uint32_t refsSize = segments * segmentBlocks * sizeof(cl_uint) * 2;

#ifndef NDEBUG
        std::cerr << "[INFO] Allocating " << refsSize << " bytes for refs..."
                  << std::endl;
#endif

        refsBuffer = cl::Buffer(context, CL_MEM_READ_ONLY, refsSize);

        precomputeRefs();
    }

    /* Indexed as [precompute][bySegment]. */
    static const char *KERNEL_NAMES[2][2] = {
        {
            "argon2_kernel_oneshot",
            "argon2_kernel_segment",
        },
        {
            "argon2_kernel_oneshot_precompute",
            "argon2_kernel_segment_precompute",
        }
    };

    kernel = cl::Kernel(programContext->getProgram(),
                        KERNEL_NAMES[precompute][bySegment]);
    /* Arg 0 is the local-memory scratch area, set per-launch in run()
     * (its size depends on the chosen work-group geometry). */
    kernel.setArg<cl::Buffer>(1, memoryBuffer);
    if (precompute) {
        kernel.setArg<cl::Buffer>(2, refsBuffer);
        kernel.setArg<cl_uint>(3, passes);
        kernel.setArg<cl_uint>(4, lanes);
        kernel.setArg<cl_uint>(5, segmentBlocks);
    } else {
        kernel.setArg<cl_uint>(2, passes);
        kernel.setArg<cl_uint>(3, lanes);
        kernel.setArg<cl_uint>(4, segmentBlocks);
    }
}
void KernelRunner::precomputeRefs()
{
std::uint32_t passes = params->getTimeCost();
std::uint32_t lanes = params->getLanes();
std::uint32_t segmentBlocks = params->getSegmentBlocks();
std::uint32_t segmentAddrBlocks =
(segmentBlocks + ARGON2_REFS_PER_BLOCK - 1)
/ ARGON2_REFS_PER_BLOCK;
std::uint32_t segments = programContext->getArgon2Type() == ARGON2_ID
? lanes * (ARGON2_SYNC_POINTS / 2)
: passes * lanes * ARGON2_SYNC_POINTS;
std::uint32_t shmemSize = THREADS_PER_LANE * sizeof(cl_uint) * 2;
cl::Kernel kernel = cl::Kernel(programContext->getProgram(),
"argon2_precompute_kernel");
kernel.setArg<cl::LocalSpaceArg>(0, { shmemSize });
kernel.setArg<cl::Buffer>(1, refsBuffer);
kernel.setArg<cl_uint>(2, passes);
kernel.setArg<cl_uint>(3, lanes);
kernel.setArg<cl_uint>(4, segmentBlocks);
cl::NDRange globalRange { THREADS_PER_LANE * segments * segmentAddrBlocks };
cl::NDRange localRange { THREADS_PER_LANE };
queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalRange, localRange);
queue.finish();
}
void KernelRunner::run(std::uint32_t lanesPerBlock, std::uint32_t jobsPerBlock)
{
std::uint32_t lanes = params->getLanes();
std::uint32_t passes = params->getTimeCost();
if (bySegment) {
if (lanesPerBlock > lanes || lanes % lanesPerBlock != 0) {
throw std::logic_error("Invalid lanesPerBlock!");
}
} else {
if (lanesPerBlock != lanes) {
throw std::logic_error("Invalid lanesPerBlock!");
}
}
if (jobsPerBlock > batchSize || batchSize % jobsPerBlock != 0) {
throw std::logic_error("Invalid jobsPerBlock!");
}
cl::NDRange globalRange { THREADS_PER_LANE * lanes, batchSize };
cl::NDRange localRange { THREADS_PER_LANE * lanesPerBlock, jobsPerBlock };
// FIXME: map only necessary parts of memory buffer
queue.enqueueUnmapMemObject(memoryBuffer, memory, nullptr, &start);
try {
std::uint32_t shmemSize =
THREADS_PER_LANE * lanesPerBlock * jobsPerBlock *
sizeof(cl_uint) * 2;
kernel.setArg<cl::LocalSpaceArg>(0, { shmemSize });
if (bySegment) {
for (std::uint32_t pass = 0; pass < passes; pass++) {
for (std::uint32_t slice = 0; slice < ARGON2_SYNC_POINTS;
slice++) {
kernel.setArg<cl_uint>(precompute ? 6 : 5, pass);
kernel.setArg<cl_uint>(precompute ? 7 : 6, slice);
queue.enqueueNDRangeKernel(kernel, cl::NullRange,
globalRange, localRange);
}
}
} else {
queue.enqueueNDRangeKernel(kernel, cl::NullRange,
globalRange, localRange);
}
} catch (const cl::Error &err) {
memory = queue.enqueueMapBuffer(
memoryBuffer, true, CL_MAP_READ | CL_MAP_WRITE,
0, memorySize);
throw err;
}
memory = queue.enqueueMapBuffer(
memoryBuffer, false, CL_MAP_READ | CL_MAP_WRITE,
0, memorySize, nullptr, &end);
}
float KernelRunner::finish()
{
end.wait();
cl_ulong nsStart = start.getProfilingInfo<CL_PROFILING_COMMAND_START>();
cl_ulong nsEnd = end.getProfilingInfo<CL_PROFILING_COMMAND_END>();
return (nsEnd - nsStart) / (1000.0F * 1000.0F);
}
} // namespace opencl
} // namespace argon2
#include "processingunit.h"
#define THREADS_PER_LANE 32
#define DEBUG_BUFFER_SIZE 4
#include <limits>
#ifndef NDEBUG
#include <iostream>
#endif
namespace argon2 {
namespace opencl {
......@@ -10,44 +12,96 @@ ProcessingUnit::ProcessingUnit(
const ProgramContext *programContext, const Argon2Params *params,
const Device *device, std::size_t batchSize,
bool bySegment, bool precomputeRefs)
: programContext(programContext), params(params),
device(device), batchSize(batchSize), bySegment(bySegment)
: programContext(programContext), params(params), device(device),
runner(programContext, params, device, batchSize, bySegment,
precomputeRefs),
bestLanesPerBlock(runner.getMinLanesPerBlock()),
bestJobsPerBlock(runner.getMinJobsPerBlock())
{
// TODO: implement precomputeRefs
// FIXME: check memSize out of bounds
auto &clContext = programContext->getContext();
auto lanes = params->getLanes();
cmdQueue = cl::CommandQueue(clContext, device->getCLDevice());
memorySize = params->getMemorySize() * batchSize;
memoryBuffer = cl::Buffer(clContext, CL_MEM_READ_WRITE, memorySize);
debugBuffer = cl::Buffer(clContext, CL_MEM_WRITE_ONLY, DEBUG_BUFFER_SIZE);
mappedMemoryBuffer = cmdQueue.enqueueMapBuffer(
memoryBuffer, true, CL_MAP_WRITE, 0, memorySize);
if (bySegment) {
kernel = cl::Kernel(programContext->getProgram(),
"argon2_kernel_segment");
kernel.setArg<cl::Buffer>(0, memoryBuffer);
kernel.setArg<cl_uint>(1, params->getTimeCost());
kernel.setArg<cl_uint>(2, lanes);
kernel.setArg<cl_uint>(3, params->getSegmentBlocks());
} else {
auto localMemSize = (std::size_t)lanes * ARGON2_BLOCK_SIZE;
if (programContext->getArgon2Type() != ARGON2_D) {
localMemSize *= 3;
} else {
localMemSize *= 2;
auto memory = static_cast<std::uint8_t *>(runner.getMemory());
/* pre-fill first blocks with pseudo-random data: */
for (std::size_t i = 0; i < batchSize; i++) {
params->fillFirstBlocks(memory, NULL, 0,
programContext->getArgon2Type(),
programContext->getArgon2Version());
memory += params->getMemorySize();
}
if (runner.getMaxLanesPerBlock() > runner.getMinLanesPerBlock()) {
#ifndef NDEBUG
std::cerr << "[INFO] Tuning lanes per block..." << std::endl;
#endif
float bestTime = std::numeric_limits<float>::infinity();
for (std::uint32_t lpb = 1; lpb <= runner.getMaxLanesPerBlock();
lpb *= 2)
{
float time;
try {
runner.run(lpb, bestJobsPerBlock);
time = runner.finish();
} catch(cl::Error &ex) {
#ifndef NDEBUG
std::cerr << "[WARN] OpenCL error on " << lpb
<< " lanes per block: " << ex.what() << std::endl;
#endif
break;
}
#ifndef NDEBUG
std::cerr << "[INFO] " << lpb << " lanes per block: "
<< time << " ms" << std::endl;
#endif
if (time < bestTime) {
bestTime = time;
bestLanesPerBlock = lpb;
}
}
#ifndef NDEBUG
std::cerr << "[INFO] Picked " << bestLanesPerBlock
<< " lanes per block." << std::endl;
#endif
}
/* Only tune jobs per block if we hit maximum lanes per block: */
if (bestLanesPerBlock == runner.getMaxLanesPerBlock()
&& runner.getMaxJobsPerBlock() > runner.getMinJobsPerBlock()) {
#ifndef NDEBUG
std::cerr << "[INFO] Tuning jobs per block..." << std::endl;
#endif
float bestTime = std::numeric_limits<float>::infinity();
for (std::uint32_t jpb = 1; jpb <= runner.getMaxJobsPerBlock();
jpb *= 2)
{
float time;
try {
runner.run(bestLanesPerBlock, jpb);
time = runner.finish();
} catch(cl::Error &ex) {
#ifndef NDEBUG
std::cerr << "[WARN] OpenCL error on " << jpb
<< " jobs per block: " << ex.what() << std::endl;
#endif
break;
}
#ifndef NDEBUG
std::cerr << "[INFO] " << jpb << " jobs per block: "
<< time << " ms" << std::endl;
#endif
kernel = cl::Kernel(programContext->getProgram(),
"argon2_kernel_oneshot");
kernel.setArg<cl::Buffer>(0, memoryBuffer);
kernel.setArg<cl::LocalSpaceArg>(1, { localMemSize });
kernel.setArg<cl_uint>(2, params->getTimeCost());
kernel.setArg<cl_uint>(3, lanes);
kernel.setArg<cl_uint>(4, params->getSegmentBlocks());
if (time < bestTime) {
bestTime = time;
bestJobsPerBlock = jpb;
}
}
#ifndef NDEBUG
std::cerr << "[INFO] Picked " << bestJobsPerBlock
<< " jobs per block." << std::endl;
#endif
}
}
......@@ -56,7 +110,7 @@ ProcessingUnit::PasswordWriter::PasswordWriter(
: params(parent.params),
type(parent.programContext->getArgon2Type()),
version(parent.programContext->getArgon2Version()),
dest(static_cast<std::uint8_t *>(parent.mappedMemoryBuffer))
dest(static_cast<std::uint8_t *>(parent.runner.getMemory()))
{
dest += index * params->getMemorySize();
}
......@@ -80,7 +134,7 @@ void ProcessingUnit::PasswordWriter::setPassword(
ProcessingUnit::HashReader::HashReader(
ProcessingUnit &parent, std::size_t index)
: params(parent.params),
src(static_cast<const std::uint8_t *>(parent.mappedMemoryBuffer)),
src(static_cast<const std::uint8_t *>(parent.runner.getMemory())),
buffer(new std::uint8_t[params->getOutputLength()])
{
src += index * params->getMemorySize();
......@@ -104,37 +158,12 @@ const void *ProcessingUnit::HashReader::getHash() const
void ProcessingUnit::beginProcessing()
{
cmdQueue.enqueueUnmapMemObject(memoryBuffer, mappedMemoryBuffer);
if (bySegment) {
for (cl_uint pass = 0; pass < params->getTimeCost(); pass++) {
kernel.setArg<cl_uint>(4, pass);
for (cl_uint slice = 0; slice < ARGON2_SYNC_POINTS; slice++) {
kernel.setArg<cl_uint>(5, slice);
cmdQueue.enqueueNDRangeKernel(
kernel, cl::NullRange,
cl::NDRange(THREADS_PER_LANE, params->getLanes(),
batchSize),
cl::NDRange(THREADS_PER_LANE, 1, 1));
}
}
} else {
cmdQueue.enqueueNDRangeKernel(
kernel, cl::NullRange,
cl::NDRange(THREADS_PER_LANE, params->getLanes(),
batchSize),
cl::NDRange(THREADS_PER_LANE, params->getLanes(), 1));
}
mappedMemoryBuffer = cmdQueue.enqueueMapBuffer(
memoryBuffer, false, CL_MAP_READ | CL_MAP_WRITE,
0, memorySize, nullptr, &event);
runner.run(bestLanesPerBlock, bestJobsPerBlock);
}
void ProcessingUnit::endProcessing()
{
event.wait();
event = cl::Event();
runner.finish();
}
} // namespace opencl
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment