Commit ebed6977 authored by Ondrej Mosnáček's avatar Ondrej Mosnáček

Refactor internal interface for CUDA code

This commit also adds run-time benchmarking to determine the best block size.
parent 5e47f4f4
......@@ -27,6 +27,14 @@ target_include_directories(argon2-gpu-common PRIVATE
lib/argon2-gpu-common
)
# HACK because CMake is a piece of crap (remove this when we can depend on v3.7 and above):
cuda_include_directories(
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include>
${CMAKE_CURRENT_SOURCE_DIR}/include/argon2-cuda
${CMAKE_CURRENT_SOURCE_DIR}/lib/argon2-cuda
)
cuda_add_library(argon2-cuda SHARED
lib/argon2-cuda/device.cpp
lib/argon2-cuda/globalcontext.cpp
......
#ifndef ARGON2_CUDA_KERNELS_H
#define ARGON2_CUDA_KERNELS_H
#include <cuda_runtime.h>
#include <cstdint>
/* workaround weird CMake/CUDA bug: */
#ifdef argon2
#undef argon2
#endif
namespace argon2 {
namespace cuda {
/* Launches the Argon2 CUDA kernels for one batch of jobs and times them.
 * Owns the CUDA stream, the managed memory buffer holding every job's
 * Argon2 block array, and the start/end events used for benchmarking.
 * Typical use: run(blockSize) to enqueue work asynchronously, then
 * finish() to wait and obtain the elapsed GPU time. */
class Argon2KernelRunner
{
private:
/* Argon2 type and version selector constants (see kernels.cu defines): */
std::uint32_t type, version;
/* passes = time cost; segmentBlocks = memory blocks per lane segment: */
std::uint32_t passes, lanes, segmentBlocks;
/* number of independent hashing jobs processed per run: */
std::uint32_t batchSize;
/* true: one kernel launch per pass/slice; false: single oneshot launch: */
bool bySegment;
/* events recorded before/after the launches; used by finish() for timing: */
cudaEvent_t start, end;
cudaStream_t stream;
/* managed (cudaMallocManaged) buffer holding batchSize block arrays: */
void *memory;
/* enqueue one segment kernel for the given pass and slice: */
void runKernelSegment(std::uint32_t blockSize,
std::uint32_t pass, std::uint32_t slice);
/* enqueue the single kernel that computes all passes at once: */
void runKernelOneshot(std::uint32_t blockSize);
/* returns v when v is a power of two, otherwise 1 (note: also returns 0
 * for v == 0 — callers are expected to pass a nonzero value): */
static uint32_t checkPowerOf2(uint32_t v)
{
return (v & (v - 1)) == 0 ? v : 1;
}
public:
/* largest blockSize usable with run(): the lane count (segment mode) or
 * batch size (oneshot mode) when that is a power of two, else 1: */
std::uint32_t getMaxBlockSize() const
{
return checkPowerOf2(bySegment ? lanes : batchSize);
}
std::uint32_t getBatchSize() const { return batchSize; }
/* raw managed memory buffer (passwords in, hashes out): */
void *getMemory() const { return memory; }
Argon2KernelRunner(std::uint32_t type, std::uint32_t version,
std::uint32_t passes, std::uint32_t lanes,
std::uint32_t segmentBlocks, std::uint32_t batchSize,
bool bySegment);
~Argon2KernelRunner();
/* asynchronously enqueue the kernel launches; returns immediately: */
void run(std::uint32_t blockSize);
/* wait for completion; returns elapsed GPU time in milliseconds: */
float finish();
};
} // cuda
} // argon2
#endif // ARGON2_CUDA_KERNELS_H
......@@ -4,6 +4,7 @@
#include <memory>
#include "programcontext.h"
#include "kernels.h"
#include "argon2-gpu-common/argon2params.h"
namespace argon2 {
......@@ -16,13 +17,8 @@ private:
const Argon2Params *params;
const Device *device;
std::size_t batchSize;
std::size_t memorySize;
bool bySegment;
cudaStream_t stream;
void *memoryBuffer;
Argon2KernelRunner runner;
uint32_t bestBlockSize;
public:
class PasswordWriter
......@@ -58,13 +54,12 @@ public:
const void *getHash() const;
};
std::size_t getBatchSize() const { return batchSize; }
std::size_t getBatchSize() const { return runner.getBatchSize(); }
ProcessingUnit(
const ProgramContext *programContext, const Argon2Params *params,
const Device *device, std::size_t batchSize,
bool bySegment = true);
~ProcessingUnit();
void beginProcessing();
void endProcessing();
......
......@@ -4,6 +4,9 @@
#endif
#include "kernels.h"
#include "cudaexception.h"
#include <stdexcept>
#define ARGON2_D 0
#define ARGON2_I 1
......@@ -288,21 +291,28 @@ __global__ void argon2_kernel_segment(
struct block_g *memory, uint32_t passes, uint32_t lanes,
uint32_t segment_blocks, uint32_t pass, uint32_t slice)
{
extern __shared__ struct block_l shared_mem[];
struct block_l *shared = shared_mem;
uint32_t job_id = blockIdx.z;
uint32_t lane = blockIdx.y;
uint32_t lane = blockIdx.y * blockDim.y + threadIdx.y;
uint32_t thread = threadIdx.x;
uint32_t lane_blocks = ARGON2_SYNC_POINTS * segment_blocks;
/* select job's memory region: */
memory += job_id * lanes * lane_blocks;
/* select warp's shared memory buffer: */
shared += threadIdx.y * (type == ARGON2_I ? 3 : 2);
uint32_t thread_input;
__shared__ struct block_l local_curr, local_prev, local_addr;
struct block_l *curr = &local_curr;
struct block_l *prev = &local_prev;
struct block_l *curr = &shared[0];
struct block_l *prev = &shared[1];
struct block_l *addr;
if (type == ARGON2_I) {
addr = &shared[2];
switch (thread) {
case 0:
thread_input = pass;
......@@ -331,7 +341,7 @@ __global__ void argon2_kernel_segment(
if (thread == 6) {
++thread_input;
}
next_addresses(thread, &local_addr, curr, thread_input);
next_addresses(thread, addr, curr, thread_input);
}
}
......@@ -362,7 +372,7 @@ __global__ void argon2_kernel_segment(
for (uint32_t offset = start_offset; offset < segment_blocks; ++offset) {
argon2_core<type, version>(
memory, mem_curr, curr, prev, &local_addr,
memory, mem_curr, curr, prev, addr,
lanes, segment_blocks, lane_blocks,
thread, &thread_input,
lane, pass, slice, offset);
......@@ -379,7 +389,7 @@ __global__ void argon2_kernel_oneshot(
extern __shared__ struct block_l shared_mem[];
struct block_l *shared = shared_mem;
uint32_t job_id = blockIdx.z;
uint32_t job_id = blockIdx.z * blockDim.z + threadIdx.z;
uint32_t lane = threadIdx.y;
uint32_t thread = threadIdx.x;
......@@ -475,68 +485,143 @@ __global__ void argon2_kernel_oneshot(
}
}
void argon2_run_kernel_segment(
uint32_t type, uint32_t version, uint32_t batchSize,
cudaStream_t stream, void *memory, uint32_t passes, uint32_t lanes,
uint32_t segment_blocks, uint32_t pass, uint32_t slice)
/* Allocates the managed memory for all jobs' Argon2 blocks plus the timing
 * events and the stream used to enqueue and benchmark the kernels.
 * Throws CudaException on any CUDA API failure; partially acquired
 * resources are released before rethrowing (the destructor does not run
 * when the constructor throws). */
Argon2KernelRunner::Argon2KernelRunner(
        uint32_t type, uint32_t version, uint32_t passes, uint32_t lanes,
        uint32_t segmentBlocks, uint32_t batchSize, bool bySegment)
    : type(type), version(version), passes(passes), lanes(lanes),
      segmentBlocks(segmentBlocks), batchSize(batchSize),
      bySegment(bySegment), start(nullptr), end(nullptr), stream(nullptr),
      memory(nullptr)
{
    /* Compute the allocation size in std::size_t so that large memory
     * costs cannot overflow 32-bit arithmetic (resolves the old FIXME):
     * lanes * segmentBlocks * SYNC_POINTS blocks of ARGON2_BLOCK_SIZE
     * bytes, for each of batchSize jobs. */
    std::size_t memorySize = static_cast<std::size_t>(lanes) * segmentBlocks
            * ARGON2_SYNC_POINTS * ARGON2_BLOCK_SIZE * batchSize;

    CudaException::check(cudaMallocManaged(&memory, memorySize,
                                           cudaMemAttachHost));
    try {
        CudaException::check(cudaEventCreate(&start));
        CudaException::check(cudaEventCreate(&end));

        CudaException::check(cudaStreamCreate(&stream));
        /* restrict the managed buffer to this stream so host access is
         * legal while the stream is idle: */
        CudaException::check(cudaStreamAttachMemAsync(stream, memory));
        CudaException::check(cudaStreamSynchronize(stream));
    } catch (...) {
        if (stream != nullptr) {
            cudaStreamDestroy(stream);
        }
        if (end != nullptr) {
            cudaEventDestroy(end);
        }
        if (start != nullptr) {
            cudaEventDestroy(start);
        }
        cudaFree(memory);
        throw;
    }
}
/* Releases the timing events, the stream and the managed memory buffer.
 * Each resource is destroyed only if it was actually created; CUDA return
 * codes are deliberately ignored here (destructors must not throw). */
Argon2KernelRunner::~Argon2KernelRunner()
{
    if (start != nullptr)
        cudaEventDestroy(start);

    if (end != nullptr)
        cudaEventDestroy(end);

    if (stream != nullptr)
        cudaStreamDestroy(stream);

    if (memory != nullptr)
        cudaFree(memory);
}
/* Enqueues one Argon2 segment kernel invocation (a single pass/slice step)
 * on the runner's stream. blockSize lanes are packed into each thread
 * block along the y dimension, so blockSize must evenly divide the lane
 * count. Throws std::logic_error on an invalid blockSize. */
void Argon2KernelRunner::runKernelSegment(uint32_t blockSize,
                                          uint32_t pass, uint32_t slice)
{
    if (blockSize > lanes || lanes % blockSize != 0) {
        throw std::logic_error("Invalid blockSize!");
    }

    struct block_g *memory_blocks = (struct block_g *)memory;
    dim3 blocks = dim3(1, lanes / blockSize, batchSize);
    dim3 threads = dim3(THREADS_PER_LANE, blockSize);

    /* Each lane in the thread block needs shared block_l buffers: two
     * (curr + prev) for Argon2d, plus a third address block for Argon2i: */
    uint32_t shared_size = blockSize * ARGON2_BLOCK_SIZE
            * (type == ARGON2_I ? 3 : 2);

    /* type/version are runtime values but template parameters, hence the
     * explicit four-way dispatch: */
    if (type == ARGON2_I) {
        if (version == ARGON2_VERSION_10) {
            argon2_kernel_segment<ARGON2_I, ARGON2_VERSION_10>
                    <<<blocks, threads, shared_size, stream>>>(
                        memory_blocks, passes, lanes, segmentBlocks,
                        pass, slice);
        } else {
            argon2_kernel_segment<ARGON2_I, ARGON2_VERSION_13>
                    <<<blocks, threads, shared_size, stream>>>(
                        memory_blocks, passes, lanes, segmentBlocks,
                        pass, slice);
        }
    } else {
        if (version == ARGON2_VERSION_10) {
            argon2_kernel_segment<ARGON2_D, ARGON2_VERSION_10>
                    <<<blocks, threads, shared_size, stream>>>(
                        memory_blocks, passes, lanes, segmentBlocks,
                        pass, slice);
        } else {
            argon2_kernel_segment<ARGON2_D, ARGON2_VERSION_13>
                    <<<blocks, threads, shared_size, stream>>>(
                        memory_blocks, passes, lanes, segmentBlocks,
                        pass, slice);
        }
    }
}
void argon2_run_kernel_oneshot(
uint32_t type, uint32_t version, uint32_t batchSize,
cudaStream_t stream, void *memory, uint32_t passes, uint32_t lanes,
uint32_t segment_blocks)
/* Enqueues the oneshot Argon2 kernel (all passes and slices in a single
 * launch) on the runner's stream. blockSize whole jobs are packed into
 * each thread block along the z dimension, so blockSize must evenly
 * divide the batch size. Throws std::logic_error on an invalid blockSize. */
void Argon2KernelRunner::runKernelOneshot(uint32_t blockSize)
{
    if (blockSize > batchSize || batchSize % blockSize != 0) {
        throw std::logic_error("Invalid blockSize!");
    }

    struct block_g *memory_blocks = (struct block_g *)memory;
    dim3 blocks = dim3(1, 1, batchSize / blockSize);
    dim3 threads = dim3(THREADS_PER_LANE, lanes, blockSize);

    /* Every lane of EVERY job packed into a thread block needs its own
     * shared block_l buffers (3 per lane for Argon2i, 2 otherwise) — the
     * kernel derives job_id from threadIdx.z, so blockDim.z = blockSize
     * jobs share the block. The previous size (lanes * ARGON2_BLOCK_SIZE
     * * {2,3}) omitted the blockSize factor and would make any launch
     * with blockSize > 1 index past the allocated shared memory. */
    uint32_t shared_size = blockSize * lanes * ARGON2_BLOCK_SIZE
            * (type == ARGON2_I ? 3 : 2);

    /* type/version are runtime values but template parameters, hence the
     * explicit four-way dispatch: */
    if (type == ARGON2_I) {
        if (version == ARGON2_VERSION_10) {
            argon2_kernel_oneshot<ARGON2_I, ARGON2_VERSION_10>
                    <<<blocks, threads, shared_size, stream>>>(
                        memory_blocks, passes, lanes, segmentBlocks);
        } else {
            argon2_kernel_oneshot<ARGON2_I, ARGON2_VERSION_13>
                    <<<blocks, threads, shared_size, stream>>>(
                        memory_blocks, passes, lanes, segmentBlocks);
        }
    } else {
        if (version == ARGON2_VERSION_10) {
            argon2_kernel_oneshot<ARGON2_D, ARGON2_VERSION_10>
                    <<<blocks, threads, shared_size, stream>>>(
                        memory_blocks, passes, lanes, segmentBlocks);
        } else {
            argon2_kernel_oneshot<ARGON2_D, ARGON2_VERSION_13>
                    <<<blocks, threads, shared_size, stream>>>(
                        memory_blocks, passes, lanes, segmentBlocks);
        }
    }
}
/* Asynchronously enqueues the whole computation on the stream, bracketed
 * by the start/end events so finish() can report the elapsed GPU time.
 * Returns as soon as the work is queued. */
void Argon2KernelRunner::run(uint32_t blockSize)
{
    CudaException::check(cudaEventRecord(start, stream));

    if (!bySegment) {
        /* single launch covering every pass and slice: */
        runKernelOneshot(blockSize);
    } else {
        /* one launch per pass/slice pair; launch order provides the
         * required synchronization between slices: */
        for (uint32_t pass = 0; pass < passes; ++pass) {
            for (uint32_t slice = 0; slice < ARGON2_SYNC_POINTS; ++slice) {
                runKernelSegment(blockSize, pass, slice);
            }
        }
    }

    /* surface launch-configuration errors (bad grid/shared size) now: */
    CudaException::check(cudaGetLastError());
    CudaException::check(cudaEventRecord(end, stream));
}
/* Blocks until all work enqueued by run() has completed and returns the
 * GPU time in milliseconds measured between the start and end events. */
float Argon2KernelRunner::finish()
{
    CudaException::check(cudaStreamSynchronize(stream));

    float elapsed = 0.0f;
    CudaException::check(cudaEventElapsedTime(&elapsed, start, end));
    return elapsed;
}
} // cuda
......
#ifndef ARGON2_CUDA_KERNELS_H
#define ARGON2_CUDA_KERNELS_H
#include <cuda_runtime.h>
#include <cstdint>
/* workaround weird CMake/CUDA bug: */
#ifdef argon2
#undef argon2
#endif
namespace argon2 {
namespace cuda {
/* Enqueue one Argon2 segment kernel (a single pass/slice step) for
 * batchSize jobs on the given stream; 'memory' points to the jobs'
 * Argon2 block arrays: */
void argon2_run_kernel_segment(
std::uint32_t type, std::uint32_t version, std::uint32_t batchSize,
cudaStream_t stream, void *memory, std::uint32_t passes,
std::uint32_t lanes, std::uint32_t segment_blocks, std::uint32_t pass,
std::uint32_t slice);
/* Enqueue the single Argon2 kernel computing all passes for batchSize
 * jobs on the given stream: */
void argon2_run_kernel_oneshot(
std::uint32_t type, std::uint32_t version, std::uint32_t batchSize,
cudaStream_t stream, void *memory, std::uint32_t passes,
std::uint32_t lanes, std::uint32_t segment_blocks);
} // cuda
} // argon2
#endif // ARGON2_CUDA_KERNELS_H
......@@ -3,36 +3,60 @@
#include "cudaexception.h"
#include "kernels.h"
#include <limits>
#ifndef NDEBUG
#include <iostream>
#endif
namespace argon2 {
namespace cuda {
ProcessingUnit::ProcessingUnit(
const ProgramContext *programContext, const Argon2Params *params,
const Device *device, std::size_t batchSize,
bool bySegment)
: programContext(programContext), params(params),
device(device), batchSize(batchSize), bySegment(bySegment),
stream(nullptr), memoryBuffer(nullptr)
const Device *device, std::size_t batchSize, bool bySegment)
: programContext(programContext), params(params), device(device),
runner(programContext->getArgon2Type(),
programContext->getArgon2Version(), params->getTimeCost(),
params->getLanes(), params->getSegmentBlocks(), batchSize,
bySegment),
bestBlockSize(1)
{
// FIXME: check memSize out of bounds
CudaException::check(cudaStreamCreate(&stream));
memorySize = params->getMemorySize() * batchSize;
CudaException::check(cudaMallocManaged(&memoryBuffer, memorySize,
cudaMemAttachHost));
CudaException::check(cudaSetDevice(device->getDeviceIndex()));
if (runner.getMaxBlockSize() > 1) {
#ifndef NDEBUG
std::cerr << "[INFO] Benchmarking block size..." << std::endl;
#endif
float bestTime = std::numeric_limits<float>::infinity();
for (std::uint32_t blockSize = 1; blockSize <= runner.getMaxBlockSize();
blockSize *= 2)
{
float time;
try {
runner.run(blockSize);
time = runner.finish();
} catch(CudaException &ex) {
#ifndef NDEBUG
std::cerr << "[WARN] Exception on block size " << blockSize
<< ": " << ex.what() << std::endl;
#endif
break;
}
CudaException::check(cudaStreamAttachMemAsync(stream, memoryBuffer));
CudaException::check(cudaStreamSynchronize(stream));
}
#ifndef NDEBUG
std::cerr << "[INFO] Block size " << blockSize << ": "
<< time << " ms" << std::endl;
#endif
ProcessingUnit::~ProcessingUnit()
{
if (stream != nullptr) {
cudaStreamDestroy(stream);
if (time < bestTime) {
bestTime = time;
bestBlockSize = blockSize;
}
}
if (memoryBuffer != nullptr) {
cudaFree(memoryBuffer);
#ifndef NDEBUG
std::cerr << "[INFO] Picked block size: " << bestBlockSize << std::endl;
#endif
}
}
......@@ -41,7 +65,7 @@ ProcessingUnit::PasswordWriter::PasswordWriter(
: params(parent.params),
type(parent.programContext->getArgon2Type()),
version(parent.programContext->getArgon2Version()),
dest(static_cast<std::uint8_t *>(parent.memoryBuffer))
dest(static_cast<std::uint8_t *>(parent.runner.getMemory()))
{
dest += index * params->getMemorySize();
}
......@@ -65,7 +89,7 @@ void ProcessingUnit::PasswordWriter::setPassword(
ProcessingUnit::HashReader::HashReader(
ProcessingUnit &parent, std::size_t index)
: params(parent.params),
src(static_cast<const std::uint8_t *>(parent.memoryBuffer)),
src(static_cast<const std::uint8_t *>(parent.runner.getMemory())),
buffer(new std::uint8_t[params->getOutputLength()])
{
src += index * params->getMemorySize();
......@@ -89,33 +113,13 @@ const void *ProcessingUnit::HashReader::getHash() const
void ProcessingUnit::beginProcessing()
{
if (bySegment) {
for (unsigned int pass = 0; pass < params->getTimeCost(); pass++) {
for (unsigned int slice = 0; slice < ARGON2_SYNC_POINTS; slice++) {
argon2_run_kernel_segment(
programContext->getArgon2Type(),
programContext->getArgon2Version(),
batchSize, stream, (unsigned long *)memoryBuffer,
params->getTimeCost(),
params->getLanes(),
params->getSegmentBlocks(),
pass, slice);
}
}
} else {
argon2_run_kernel_oneshot(
programContext->getArgon2Type(),
programContext->getArgon2Version(),
batchSize, stream, (unsigned long *)memoryBuffer,
params->getTimeCost(),
params->getLanes(),
params->getSegmentBlocks());
}
CudaException::check(cudaSetDevice(device->getDeviceIndex()));
runner.run(bestBlockSize);
}
void ProcessingUnit::endProcessing()
{
CudaException::check(cudaStreamSynchronize(stream));
runner.finish();
}
} // namespace cuda
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment