Commit ebed6977 authored by Ondrej Mosnáček's avatar Ondrej Mosnáček

Refactor internal interface for CUDA code

This commit also adds run-time benchmarking to determine the best block size.
parent 5e47f4f4
...@@ -27,6 +27,14 @@ target_include_directories(argon2-gpu-common PRIVATE ...@@ -27,6 +27,14 @@ target_include_directories(argon2-gpu-common PRIVATE
lib/argon2-gpu-common lib/argon2-gpu-common
) )
# HACK: work around old-CMake CUDA include handling (remove once we can depend on CMake >= 3.7):
cuda_include_directories(
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include>
${CMAKE_CURRENT_SOURCE_DIR}/include/argon2-cuda
${CMAKE_CURRENT_SOURCE_DIR}/lib/argon2-cuda
)
cuda_add_library(argon2-cuda SHARED cuda_add_library(argon2-cuda SHARED
lib/argon2-cuda/device.cpp lib/argon2-cuda/device.cpp
lib/argon2-cuda/globalcontext.cpp lib/argon2-cuda/globalcontext.cpp
......
#ifndef ARGON2_CUDA_KERNELS_H
#define ARGON2_CUDA_KERNELS_H
#include <cuda_runtime.h>
#include <cstdint>
/* workaround weird CMake/CUDA bug: */
#ifdef argon2
#undef argon2
#endif
namespace argon2 {
namespace cuda {
/*
 * Owns the CUDA resources (stream, timing events, managed memory buffer)
 * needed to run the Argon2 kernels for a batch of hashes, and launches
 * them either one segment at a time (bySegment) or in a single oneshot
 * kernel. Timing via the start/end events lets callers benchmark
 * different block sizes at run time.
 */
class Argon2KernelRunner
{
private:
// Argon2 variant (e.g. Argon2d/Argon2i) and version constants as uint32.
std::uint32_t type, version;
// Algorithm parameters: pass count, lane count, and blocks per segment.
std::uint32_t passes, lanes, segmentBlocks;
// Number of independent hash jobs processed per run.
std::uint32_t batchSize;
// If true, run() launches one kernel per (pass, slice) segment;
// otherwise it launches a single oneshot kernel for the whole computation.
bool bySegment;
// Events recorded around the kernel launches so finish() can report
// the elapsed GPU time in milliseconds.
cudaEvent_t start, end;
// Stream on which all kernels and memory operations are enqueued.
cudaStream_t stream;
// Managed device memory holding the Argon2 working blocks for the batch.
void *memory;
// Launch the segment kernel for one (pass, slice) with the given block size.
void runKernelSegment(std::uint32_t blockSize,
std::uint32_t pass, std::uint32_t slice);
// Launch the oneshot kernel covering all passes with the given block size.
void runKernelOneshot(std::uint32_t blockSize);
// Returns v if v is a power of two, otherwise 1 (a safe fallback block
// size). NOTE(review): the bitmask test also accepts v == 0; callers
// presumably pass nonzero lanes/batchSize — confirm.
static uint32_t checkPowerOf2(uint32_t v)
{
return (v & (v - 1)) == 0 ? v : 1;
}
public:
// Upper bound for the blockSize argument of run(): the lane count in
// segment mode, the batch size in oneshot mode (or 1 if not a power of 2).
std::uint32_t getMaxBlockSize() const
{
return checkPowerOf2(bySegment ? lanes : batchSize);
}
std::uint32_t getBatchSize() const { return batchSize; }
// Raw pointer to the managed memory buffer (host-accessible when the
// stream is synchronized).
void *getMemory() const { return memory; }
Argon2KernelRunner(std::uint32_t type, std::uint32_t version,
std::uint32_t passes, std::uint32_t lanes,
std::uint32_t segmentBlocks, std::uint32_t batchSize,
bool bySegment);
~Argon2KernelRunner();
// Asynchronously enqueue the kernel launches (and the start/end timing
// events) on the internal stream; does not wait for completion.
void run(std::uint32_t blockSize);
// Block until the stream is idle and return the elapsed time in
// milliseconds between the start and end events of the last run().
float finish();
};
} // cuda
} // argon2
#endif // ARGON2_CUDA_KERNELS_H
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
#include <memory> #include <memory>
#include "programcontext.h" #include "programcontext.h"
#include "kernels.h"
#include "argon2-gpu-common/argon2params.h" #include "argon2-gpu-common/argon2params.h"
namespace argon2 { namespace argon2 {
...@@ -16,13 +17,8 @@ private: ...@@ -16,13 +17,8 @@ private:
const Argon2Params *params; const Argon2Params *params;
const Device *device; const Device *device;
std::size_t batchSize; Argon2KernelRunner runner;
std::size_t memorySize; uint32_t bestBlockSize;
bool bySegment;
cudaStream_t stream;
void *memoryBuffer;
public: public:
class PasswordWriter class PasswordWriter
...@@ -58,13 +54,12 @@ public: ...@@ -58,13 +54,12 @@ public:
const void *getHash() const; const void *getHash() const;
}; };
std::size_t getBatchSize() const { return batchSize; } std::size_t getBatchSize() const { return runner.getBatchSize(); }
ProcessingUnit( ProcessingUnit(
const ProgramContext *programContext, const Argon2Params *params, const ProgramContext *programContext, const Argon2Params *params,
const Device *device, std::size_t batchSize, const Device *device, std::size_t batchSize,
bool bySegment = true); bool bySegment = true);
~ProcessingUnit();
void beginProcessing(); void beginProcessing();
void endProcessing(); void endProcessing();
......
...@@ -4,6 +4,9 @@ ...@@ -4,6 +4,9 @@
#endif #endif
#include "kernels.h" #include "kernels.h"
#include "cudaexception.h"
#include <stdexcept>
#define ARGON2_D 0 #define ARGON2_D 0
#define ARGON2_I 1 #define ARGON2_I 1
...@@ -288,21 +291,28 @@ __global__ void argon2_kernel_segment( ...@@ -288,21 +291,28 @@ __global__ void argon2_kernel_segment(
struct block_g *memory, uint32_t passes, uint32_t lanes, struct block_g *memory, uint32_t passes, uint32_t lanes,
uint32_t segment_blocks, uint32_t pass, uint32_t slice) uint32_t segment_blocks, uint32_t pass, uint32_t slice)
{ {
extern __shared__ struct block_l shared_mem[];
struct block_l *shared = shared_mem;
uint32_t job_id = blockIdx.z; uint32_t job_id = blockIdx.z;
uint32_t lane = blockIdx.y; uint32_t lane = blockIdx.y * blockDim.y + threadIdx.y;
uint32_t thread = threadIdx.x; uint32_t thread = threadIdx.x;
uint32_t lane_blocks = ARGON2_SYNC_POINTS * segment_blocks; uint32_t lane_blocks = ARGON2_SYNC_POINTS * segment_blocks;
/* select job's memory region: */ /* select job's memory region: */
memory += job_id * lanes * lane_blocks; memory += job_id * lanes * lane_blocks;
/* select warp's shared memory buffer: */
shared += threadIdx.y * (type == ARGON2_I ? 3 : 2);
uint32_t thread_input; uint32_t thread_input;
__shared__ struct block_l local_curr, local_prev, local_addr; struct block_l *curr = &shared[0];
struct block_l *curr = &local_curr; struct block_l *prev = &shared[1];
struct block_l *prev = &local_prev; struct block_l *addr;
if (type == ARGON2_I) { if (type == ARGON2_I) {
addr = &shared[2];
switch (thread) { switch (thread) {
case 0: case 0:
thread_input = pass; thread_input = pass;
...@@ -331,7 +341,7 @@ __global__ void argon2_kernel_segment( ...@@ -331,7 +341,7 @@ __global__ void argon2_kernel_segment(
if (thread == 6) { if (thread == 6) {
++thread_input; ++thread_input;
} }
next_addresses(thread, &local_addr, curr, thread_input); next_addresses(thread, addr, curr, thread_input);
} }
} }
...@@ -362,7 +372,7 @@ __global__ void argon2_kernel_segment( ...@@ -362,7 +372,7 @@ __global__ void argon2_kernel_segment(
for (uint32_t offset = start_offset; offset < segment_blocks; ++offset) { for (uint32_t offset = start_offset; offset < segment_blocks; ++offset) {
argon2_core<type, version>( argon2_core<type, version>(
memory, mem_curr, curr, prev, &local_addr, memory, mem_curr, curr, prev, addr,
lanes, segment_blocks, lane_blocks, lanes, segment_blocks, lane_blocks,
thread, &thread_input, thread, &thread_input,
lane, pass, slice, offset); lane, pass, slice, offset);
...@@ -379,7 +389,7 @@ __global__ void argon2_kernel_oneshot( ...@@ -379,7 +389,7 @@ __global__ void argon2_kernel_oneshot(
extern __shared__ struct block_l shared_mem[]; extern __shared__ struct block_l shared_mem[];
struct block_l *shared = shared_mem; struct block_l *shared = shared_mem;
uint32_t job_id = blockIdx.z; uint32_t job_id = blockIdx.z * blockDim.z + threadIdx.z;
uint32_t lane = threadIdx.y; uint32_t lane = threadIdx.y;
uint32_t thread = threadIdx.x; uint32_t thread = threadIdx.x;
...@@ -475,68 +485,143 @@ __global__ void argon2_kernel_oneshot( ...@@ -475,68 +485,143 @@ __global__ void argon2_kernel_oneshot(
} }
} }
void argon2_run_kernel_segment( Argon2KernelRunner::Argon2KernelRunner(
uint32_t type, uint32_t version, uint32_t batchSize, uint32_t type, uint32_t version, uint32_t passes, uint32_t lanes,
cudaStream_t stream, void *memory, uint32_t passes, uint32_t lanes, uint32_t segmentBlocks, uint32_t batchSize, bool bySegment)
uint32_t segment_blocks, uint32_t pass, uint32_t slice) : type(type), version(version), passes(passes), lanes(lanes),
segmentBlocks(segmentBlocks), batchSize(batchSize), bySegment(bySegment),
stream(nullptr), memory(nullptr), start(nullptr), end(nullptr)
{
// FIXME: check overflow:
uint32_t memorySize = lanes * segmentBlocks * ARGON2_SYNC_POINTS
* ARGON2_BLOCK_SIZE * batchSize;
CudaException::check(cudaMallocManaged(&memory, memorySize,
cudaMemAttachHost));
CudaException::check(cudaEventCreate(&start));
CudaException::check(cudaEventCreate(&end));
CudaException::check(cudaStreamCreate(&stream));
CudaException::check(cudaStreamAttachMemAsync(stream, memory));
CudaException::check(cudaStreamSynchronize(stream));
}
Argon2KernelRunner::~Argon2KernelRunner()
{
if (start != nullptr) {
cudaEventDestroy(start);
}
if (end != nullptr) {
cudaEventDestroy(end);
}
if (stream != nullptr) {
cudaStreamDestroy(stream);
}
if (memory != nullptr) {
cudaFree(memory);
}
}
void Argon2KernelRunner::runKernelSegment(uint32_t blockSize,
uint32_t pass, uint32_t slice)
{ {
if (blockSize > lanes || lanes % blockSize != 0) {
throw std::logic_error("Invalid blockSize!");
}
struct block_g *memory_blocks = (struct block_g *)memory; struct block_g *memory_blocks = (struct block_g *)memory;
dim3 blocks = dim3(1, lanes, batchSize); dim3 blocks = dim3(1, lanes / blockSize, batchSize);
dim3 threads = dim3(THREADS_PER_LANE); dim3 threads = dim3(THREADS_PER_LANE, blockSize);
if (type == ARGON2_I) { if (type == ARGON2_I) {
uint32_t shared_size = blockSize * ARGON2_BLOCK_SIZE * 3;
if (version == ARGON2_VERSION_10) { if (version == ARGON2_VERSION_10) {
argon2_kernel_segment<ARGON2_I, ARGON2_VERSION_10> argon2_kernel_segment<ARGON2_I, ARGON2_VERSION_10>
<<<blocks, threads, 0, stream>>>(memory_blocks, passes, lanes, <<<blocks, threads, shared_size, stream>>>(
segment_blocks, pass, slice); memory_blocks, passes, lanes, segmentBlocks,
pass, slice);
} else { } else {
argon2_kernel_segment<ARGON2_I, ARGON2_VERSION_13> argon2_kernel_segment<ARGON2_I, ARGON2_VERSION_13>
<<<blocks, threads, 0, stream>>>(memory_blocks, passes, lanes, <<<blocks, threads, shared_size, stream>>>(
segment_blocks, pass, slice); memory_blocks, passes, lanes, segmentBlocks,
pass, slice);
} }
} else { } else {
uint32_t shared_size = blockSize * ARGON2_BLOCK_SIZE * 2;
if (version == ARGON2_VERSION_10) { if (version == ARGON2_VERSION_10) {
argon2_kernel_segment<ARGON2_D, ARGON2_VERSION_10> argon2_kernel_segment<ARGON2_D, ARGON2_VERSION_10>
<<<blocks, threads, 0, stream>>>(memory_blocks, passes, lanes, <<<blocks, threads, shared_size, stream>>>(
segment_blocks, pass, slice); memory_blocks, passes, lanes, segmentBlocks,
pass, slice);
} else { } else {
argon2_kernel_segment<ARGON2_D, ARGON2_VERSION_13> argon2_kernel_segment<ARGON2_D, ARGON2_VERSION_13>
<<<blocks, threads, 0, stream>>>(memory_blocks, passes, lanes, <<<blocks, threads, shared_size, stream>>>(
segment_blocks, pass, slice); memory_blocks, passes, lanes, segmentBlocks,
pass, slice);
} }
} }
} }
void argon2_run_kernel_oneshot( void Argon2KernelRunner::runKernelOneshot(uint32_t blockSize)
uint32_t type, uint32_t version, uint32_t batchSize,
cudaStream_t stream, void *memory, uint32_t passes, uint32_t lanes,
uint32_t segment_blocks)
{ {
if (blockSize > batchSize || batchSize % blockSize != 0) {
throw std::logic_error("Invalid blockSize!");
}
struct block_g *memory_blocks = (struct block_g *)memory; struct block_g *memory_blocks = (struct block_g *)memory;
dim3 blocks = dim3(1, 1, batchSize); dim3 blocks = dim3(1, 1, batchSize / blockSize);
dim3 threads = dim3(THREADS_PER_LANE, lanes); dim3 threads = dim3(THREADS_PER_LANE, lanes, blockSize);
if (type == ARGON2_I) { if (type == ARGON2_I) {
uint32_t shared_size = lanes * ARGON2_BLOCK_SIZE * 3; uint32_t shared_size = lanes * ARGON2_BLOCK_SIZE * 3;
if (version == ARGON2_VERSION_10) { if (version == ARGON2_VERSION_10) {
argon2_kernel_oneshot<ARGON2_I, ARGON2_VERSION_10> argon2_kernel_oneshot<ARGON2_I, ARGON2_VERSION_10>
<<<blocks, threads, shared_size, stream>>>( <<<blocks, threads, shared_size, stream>>>(
memory_blocks, passes, lanes, segment_blocks); memory_blocks, passes, lanes, segmentBlocks);
} else { } else {
argon2_kernel_oneshot<ARGON2_I, ARGON2_VERSION_13> argon2_kernel_oneshot<ARGON2_I, ARGON2_VERSION_13>
<<<blocks, threads, shared_size, stream>>>( <<<blocks, threads, shared_size, stream>>>(
memory_blocks, passes, lanes, segment_blocks); memory_blocks, passes, lanes, segmentBlocks);
} }
} else { } else {
uint32_t shared_size = lanes * ARGON2_BLOCK_SIZE * 2; uint32_t shared_size = lanes * ARGON2_BLOCK_SIZE * 2;
if (version == ARGON2_VERSION_10) { if (version == ARGON2_VERSION_10) {
argon2_kernel_oneshot<ARGON2_D, ARGON2_VERSION_10> argon2_kernel_oneshot<ARGON2_D, ARGON2_VERSION_10>
<<<blocks, threads, shared_size, stream>>>( <<<blocks, threads, shared_size, stream>>>(
memory_blocks, passes, lanes, segment_blocks); memory_blocks, passes, lanes, segmentBlocks);
} else { } else {
argon2_kernel_oneshot<ARGON2_D, ARGON2_VERSION_13> argon2_kernel_oneshot<ARGON2_D, ARGON2_VERSION_13>
<<<blocks, threads, shared_size, stream>>>( <<<blocks, threads, shared_size, stream>>>(
memory_blocks, passes, lanes, segment_blocks); memory_blocks, passes, lanes, segmentBlocks);
}
}
}
void Argon2KernelRunner::run(uint32_t blockSize)
{
CudaException::check(cudaEventRecord(start, stream));
if (bySegment) {
for (uint32_t pass = 0; pass < passes; pass++) {
for (uint32_t slice = 0; slice < ARGON2_SYNC_POINTS; slice++) {
runKernelSegment(blockSize, pass, slice);
}
} }
} else {
runKernelOneshot(blockSize);
} }
CudaException::check(cudaGetLastError());
CudaException::check(cudaEventRecord(end, stream));
}
float Argon2KernelRunner::finish()
{
CudaException::check(cudaStreamSynchronize(stream));
float time = 0.0;
CudaException::check(cudaEventElapsedTime(&time, start, end));
return time;
} }
} // cuda } // cuda
......
#ifndef ARGON2_CUDA_KERNELS_H
#define ARGON2_CUDA_KERNELS_H
#include <cuda_runtime.h>
#include <cstdint>
/* workaround weird CMake/CUDA bug: */
#ifdef argon2
#undef argon2
#endif
namespace argon2 {
namespace cuda {
/*
 * Launch the Argon2 segment kernel for a single (pass, slice) pair on the
 * given stream. `memory` points to the device working-block buffer for the
 * whole batch; `type`/`version` select the Argon2 variant/version template
 * instantiation. Asynchronous: the caller must synchronize the stream.
 */
void argon2_run_kernel_segment(
std::uint32_t type, std::uint32_t version, std::uint32_t batchSize,
cudaStream_t stream, void *memory, std::uint32_t passes,
std::uint32_t lanes, std::uint32_t segment_blocks, std::uint32_t pass,
std::uint32_t slice);
/*
 * Launch the oneshot Argon2 kernel that performs all passes and slices in
 * a single launch (all lanes of a job share one thread block).
 * Asynchronous: the caller must synchronize the stream.
 */
void argon2_run_kernel_oneshot(
std::uint32_t type, std::uint32_t version, std::uint32_t batchSize,
cudaStream_t stream, void *memory, std::uint32_t passes,
std::uint32_t lanes, std::uint32_t segment_blocks);
} // cuda
} // argon2
#endif // ARGON2_CUDA_KERNELS_H
...@@ -3,36 +3,60 @@ ...@@ -3,36 +3,60 @@
#include "cudaexception.h" #include "cudaexception.h"
#include "kernels.h" #include "kernels.h"
#include <limits>
#ifndef NDEBUG
#include <iostream>
#endif
namespace argon2 { namespace argon2 {
namespace cuda { namespace cuda {
ProcessingUnit::ProcessingUnit( ProcessingUnit::ProcessingUnit(
const ProgramContext *programContext, const Argon2Params *params, const ProgramContext *programContext, const Argon2Params *params,
const Device *device, std::size_t batchSize, const Device *device, std::size_t batchSize, bool bySegment)
bool bySegment) : programContext(programContext), params(params), device(device),
: programContext(programContext), params(params), runner(programContext->getArgon2Type(),
device(device), batchSize(batchSize), bySegment(bySegment), programContext->getArgon2Version(), params->getTimeCost(),
stream(nullptr), memoryBuffer(nullptr) params->getLanes(), params->getSegmentBlocks(), batchSize,
bySegment),
bestBlockSize(1)
{ {
// FIXME: check memSize out of bounds CudaException::check(cudaSetDevice(device->getDeviceIndex()));
CudaException::check(cudaStreamCreate(&stream));
if (runner.getMaxBlockSize() > 1) {
memorySize = params->getMemorySize() * batchSize; #ifndef NDEBUG
std::cerr << "[INFO] Benchmarking block size..." << std::endl;
CudaException::check(cudaMallocManaged(&memoryBuffer, memorySize, #endif
cudaMemAttachHost));
float bestTime = std::numeric_limits<float>::infinity();
for (std::uint32_t blockSize = 1; blockSize <= runner.getMaxBlockSize();
blockSize *= 2)
{
float time;
try {
runner.run(blockSize);
time = runner.finish();
} catch(CudaException &ex) {
#ifndef NDEBUG
std::cerr << "[WARN] Exception on block size " << blockSize
<< ": " << ex.what() << std::endl;
#endif
break;
}
CudaException::check(cudaStreamAttachMemAsync(stream, memoryBuffer)); #ifndef NDEBUG
CudaException::check(cudaStreamSynchronize(stream)); std::cerr << "[INFO] Block size " << blockSize << ": "
} << time << " ms" << std::endl;
#endif
ProcessingUnit::~ProcessingUnit() if (time < bestTime) {
{ bestTime = time;
if (stream != nullptr) { bestBlockSize = blockSize;
cudaStreamDestroy(stream); }
} }
if (memoryBuffer != nullptr) { #ifndef NDEBUG
cudaFree(memoryBuffer); std::cerr << "[INFO] Picked block size: " << bestBlockSize << std::endl;
#endif
} }
} }
...@@ -41,7 +65,7 @@ ProcessingUnit::PasswordWriter::PasswordWriter( ...@@ -41,7 +65,7 @@ ProcessingUnit::PasswordWriter::PasswordWriter(
: params(parent.params), : params(parent.params),
type(parent.programContext->getArgon2Type()), type(parent.programContext->getArgon2Type()),
version(parent.programContext->getArgon2Version()), version(parent.programContext->getArgon2Version()),
dest(static_cast<std::uint8_t *>(parent.memoryBuffer)) dest(static_cast<std::uint8_t *>(parent.runner.getMemory()))
{ {
dest += index * params->getMemorySize(); dest += index * params->getMemorySize();
} }
...@@ -65,7 +89,7 @@ void ProcessingUnit::PasswordWriter::setPassword( ...@@ -65,7 +89,7 @@ void ProcessingUnit::PasswordWriter::setPassword(
ProcessingUnit::HashReader::HashReader( ProcessingUnit::HashReader::HashReader(
ProcessingUnit &parent, std::size_t index) ProcessingUnit &parent, std::size_t index)
: params(parent.params), : params(parent.params),
src(static_cast<const std::uint8_t *>(parent.memoryBuffer)), src(static_cast<const std::uint8_t *>(parent.runner.getMemory())),
buffer(new std::uint8_t[params->getOutputLength()]) buffer(new std::uint8_t[params->getOutputLength()])
{ {
src += index * params->getMemorySize(); src += index * params->getMemorySize();
...@@ -89,33 +113,13 @@ const void *ProcessingUnit::HashReader::getHash() const ...@@ -89,33 +113,13 @@ const void *ProcessingUnit::HashReader::getHash() const
void ProcessingUnit::beginProcessing() void ProcessingUnit::beginProcessing()
{ {
if (bySegment) { CudaException::check(cudaSetDevice(device->getDeviceIndex()));
for (unsigned int pass = 0; pass < params->getTimeCost(); pass++) { runner.run(bestBlockSize);
for (unsigned int slice = 0; slice < ARGON2_SYNC_POINTS; slice++) {
argon2_run_kernel_segment(
programContext->getArgon2Type(),
programContext->getArgon2Version(),
batchSize, stream, (unsigned long *)memoryBuffer,
params->getTimeCost(),
params->getLanes(),
params->getSegmentBlocks(),
pass, slice);
}
}
} else {
argon2_run_kernel_oneshot(
programContext->getArgon2Type(),
programContext->getArgon2Version(),
batchSize, stream, (unsigned long *)memoryBuffer,
params->getTimeCost(),
params->getLanes(),
params->getSegmentBlocks());
}
} }
void ProcessingUnit::endProcessing() void ProcessingUnit::endProcessing()
{ {
CudaException::check(cudaStreamSynchronize(stream)); runner.finish();
} }
} // namespace cuda } // namespace cuda
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment