Commit dde1302f authored by Ondrej Mosnáček

Use more efficient OpenCL memory mapping

This commit changes the memory layout to be lane-interleaved, so that
we can map only the necessary parts of GPU memory with OpenCL.
Previously we mapped the whole memory buffer, which was very slow.
parent 03e96e0d
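In a nutshell, the change swaps the block-indexing formula. A minimal sketch using the diff's identifiers (illustration only, not code from the repository):

#include <cstddef>

// Old, lane-major layout: each lane's blocks are contiguous.
std::size_t index_old(std::size_t lane, std::size_t block,
                      std::size_t lane_blocks)
{
    return lane * lane_blocks + block;
}

// New, lane-interleaved layout: block i of every lane is contiguous, so
// the blocks the host writes (the first two of each lane) and reads back
// (the last of each lane) form small regions that can be mapped alone.
std::size_t index_new(std::size_t lane, std::size_t block, std::size_t lanes)
{
    return block * lanes + lane;
}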
@@ -346,11 +346,11 @@ void compute_ref_pos(uint lanes, uint segment_blocks,
void argon2_core(
__global struct block_g *memory, __global struct block_g *mem_curr,
struct block_th *prev, struct block_th *tmp,
-__local struct u64_shuffle_buf *shuffle_buf, uint lane_blocks,
+__local struct u64_shuffle_buf *shuffle_buf, uint lanes,
uint thread, uint pass, uint ref_index, uint ref_lane)
{
__global struct block_g *mem_ref;
-mem_ref = memory + ref_lane * lane_blocks + ref_index;
+mem_ref = memory + ref_index * lanes + ref_lane;
#if ARGON2_VERSION == ARGON2_VERSION_10
load_block_xor(prev, mem_ref, thread);
@@ -486,9 +486,8 @@ void argon2_step_precompute(
struct block_th *prev, struct block_th *tmp,
__local struct u64_shuffle_buf *shuffle_buf,
__global const struct ref **refs,
-uint lanes, uint segment_blocks, uint lane_blocks,
-uint thread, uint lane, uint pass, uint slice,
-uint offset)
+uint lanes, uint segment_blocks, uint thread,
+uint lane, uint pass, uint slice, uint offset)
{
uint ref_index, ref_lane;
bool data_independent;
@@ -512,8 +511,8 @@ void argon2_step_precompute(
&ref_lane, &ref_index);
}
-argon2_core(memory, mem_curr, prev, tmp, shuffle_buf, lane_blocks,
-thread, pass, ref_index, ref_lane);
+argon2_core(memory, mem_curr, prev, tmp, shuffle_buf, lanes, thread, pass,
+ref_index, ref_lane);
}
__kernel void argon2_kernel_segment_precompute(
@@ -537,20 +536,20 @@ __kernel void argon2_kernel_segment_precompute(
struct block_th prev, tmp;
__global struct block_g *mem_segment =
-memory + lane * lane_blocks + slice * segment_blocks;
+memory + slice * segment_blocks * lanes + lane;
__global struct block_g *mem_prev, *mem_curr;
uint start_offset = 0;
if (pass == 0) {
if (slice == 0) {
-mem_prev = mem_segment + 1;
-mem_curr = mem_segment + 2;
+mem_prev = mem_segment + 1 * lanes;
+mem_curr = mem_segment + 2 * lanes;
start_offset = 2;
} else {
-mem_prev = mem_segment - 1;
+mem_prev = mem_segment - lanes;
mem_curr = mem_segment;
}
} else {
-mem_prev = mem_segment + (slice == 0 ? lane_blocks : 0) - 1;
+mem_prev = mem_segment + (slice == 0 ? lane_blocks * lanes : 0) - lanes;
mem_curr = mem_segment;
}
@@ -569,10 +568,9 @@ __kernel void argon2_kernel_segment_precompute(
for (uint offset = start_offset; offset < segment_blocks; ++offset) {
argon2_step_precompute(
memory, mem_curr, &prev, &tmp, shuffle_buf, &refs, lanes,
-segment_blocks, lane_blocks, thread,
-lane, pass, slice, offset);
+segment_blocks, thread, lane, pass, slice, offset);
-++mem_curr;
+mem_curr += lanes;
}
}
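Note the loop now advances with mem_curr += lanes: in the interleaved layout, consecutive blocks of one lane are `lanes` structs apart. A sketch of the new address arithmetic (hypothetical helpers, same identifiers as the kernels):

#include <cstddef>

// First block of segment `slice` in `lane` under the new layout:
std::size_t segment_base(std::size_t slice, std::size_t segment_blocks,
                         std::size_t lanes, std::size_t lane)
{
    return slice * segment_blocks * lanes + lane;
}

// Step from one block of a lane to the next (was +1 in the old layout):
std::size_t next_block(std::size_t index, std::size_t lanes)
{
    return index + lanes;
}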
@@ -595,9 +593,9 @@ __kernel void argon2_kernel_oneshot_precompute(
struct block_th prev, tmp;
-__global struct block_g *mem_lane = memory + lane * lane_blocks;
-__global struct block_g *mem_prev = mem_lane + 1;
-__global struct block_g *mem_curr = mem_lane + 2;
+__global struct block_g *mem_lane = memory + lane;
+__global struct block_g *mem_prev = mem_lane + 1 * lanes;
+__global struct block_g *mem_curr = mem_lane + 2 * lanes;
load_block(&prev, mem_prev, thread);
@@ -618,10 +616,10 @@ __kernel void argon2_kernel_oneshot_precompute(
argon2_step_precompute(
memory, mem_curr, &prev, &tmp, shuffle_buf, &refs,
-lanes, segment_blocks, lane_blocks, thread, lane,
-pass, slice, offset);
+lanes, segment_blocks, thread,
+lane, pass, slice, offset);
-++mem_curr;
+mem_curr += lanes;
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -636,8 +634,7 @@ void argon2_step(
__global struct block_g *memory, __global struct block_g *mem_curr,
struct block_th *prev, struct block_th *tmp, struct block_th *addr,
__local struct u64_shuffle_buf *shuffle_buf,
-uint lanes, uint segment_blocks, uint lane_blocks,
-uint thread, uint *thread_input,
+uint lanes, uint segment_blocks, uint thread, uint *thread_input,
uint lane, uint pass, uint slice, uint offset)
{
uint ref_index, ref_lane;
@@ -674,8 +671,8 @@ void argon2_step(
compute_ref_pos(lanes, segment_blocks, pass, lane, slice, offset,
&ref_lane, &ref_index);
-argon2_core(memory, mem_curr, prev, tmp, shuffle_buf, lane_blocks,
-thread, pass, ref_index, ref_lane);
+argon2_core(memory, mem_curr, prev, tmp, shuffle_buf, lanes, thread, pass,
+ref_index, ref_lane);
}
__kernel void argon2_kernel_segment(
@@ -732,20 +729,20 @@ __kernel void argon2_kernel_segment(
#endif
__global struct block_g *mem_segment =
-memory + lane * lane_blocks + slice * segment_blocks;
+memory + slice * segment_blocks * lanes + lane;
__global struct block_g *mem_prev, *mem_curr;
uint start_offset = 0;
if (pass == 0) {
if (slice == 0) {
-mem_prev = mem_segment + 1;
-mem_curr = mem_segment + 2;
+mem_prev = mem_segment + 1 * lanes;
+mem_curr = mem_segment + 2 * lanes;
start_offset = 2;
} else {
-mem_prev = mem_segment - 1;
+mem_prev = mem_segment - lanes;
mem_curr = mem_segment;
}
} else {
-mem_prev = mem_segment + (slice == 0 ? lane_blocks : 0) - 1;
+mem_prev = mem_segment + (slice == 0 ? lane_blocks * lanes : 0) - lanes;
mem_curr = mem_segment;
}
@@ -753,11 +750,10 @@ __kernel void argon2_kernel_segment(
for (uint offset = start_offset; offset < segment_blocks; ++offset) {
argon2_step(memory, mem_curr, &prev, &tmp, &addr, shuffle_buf,
-lanes, segment_blocks, lane_blocks,
-thread, &thread_input,
+lanes, segment_blocks, thread, &thread_input,
lane, pass, slice, offset);
-++mem_curr;
+mem_curr += lanes;
}
}
@@ -808,9 +804,9 @@ __kernel void argon2_kernel_oneshot(
}
#endif
-__global struct block_g *mem_lane = memory + lane * lane_blocks;
-__global struct block_g *mem_prev = mem_lane + 1;
-__global struct block_g *mem_curr = mem_lane + 2;
+__global struct block_g *mem_lane = memory + lane;
+__global struct block_g *mem_prev = mem_lane + 1 * lanes;
+__global struct block_g *mem_curr = mem_lane + 2 * lanes;
load_block(&prev, mem_prev, thread);
@@ -824,11 +820,10 @@ __kernel void argon2_kernel_oneshot(
}
argon2_step(memory, mem_curr, &prev, &tmp, &addr, shuffle_buf,
-lanes, segment_blocks, lane_blocks,
-thread, &thread_input,
+lanes, segment_blocks, thread, &thread_input,
lane, pass, slice, offset);
-++mem_curr;
+mem_curr += lanes;
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -22,39 +22,6 @@ private:
uint32_t bestJobsPerBlock;
public:
-class PasswordWriter
-{
-private:
-const Argon2Params *params;
-Type type;
-Version version;
-std::uint8_t *dest;
-public:
-PasswordWriter(ProcessingUnit &parent, std::size_t index = 0);
-void moveForward(std::size_t offset);
-void moveBackwards(std::size_t offset);
-void setPassword(const void *pw, std::size_t pwSize) const;
-};
-class HashReader
-{
-private:
-const Argon2Params *params;
-const std::uint8_t *src;
-std::unique_ptr<uint8_t[]> buffer;
-public:
-HashReader(ProcessingUnit &parent, std::size_t index = 0);
-void moveForward(std::size_t offset);
-void moveBackwards(std::size_t offset);
-const void *getHash() const;
-};
std::size_t getBatchSize() const { return runner.getBatchSize(); }
ProcessingUnit(
@@ -62,6 +29,9 @@ public:
const Device *device, std::size_t batchSize,
bool bySegment = true, bool precomputeRefs = false);
+void setPassword(std::size_t index, const void *pw, std::size_t pwSize);
+void getHash(std::size_t index, void *hash);
void beginProcessing();
void endProcessing();
};
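The cursor-style PasswordWriter/HashReader classes give way to direct index-based access. A hypothetical usage sketch (templated to avoid assuming a namespace; not code from the repository):

#include <cstddef>
#include <cstdint>
#include <vector>

// Hash a batch of passwords with the new index-based API.
template <typename ProcessingUnit>
void hashBatch(ProcessingUnit &unit,
               const std::vector<std::vector<std::uint8_t>> &passwords,
               std::size_t hashLen, std::vector<std::uint8_t> &hashes)
{
    for (std::size_t i = 0; i < unit.getBatchSize(); i++)
        unit.setPassword(i, passwords[i].data(), passwords[i].size());

    unit.beginProcessing();
    unit.endProcessing();

    hashes.resize(unit.getBatchSize() * hashLen);
    for (std::size_t i = 0; i < unit.getBatchSize(); i++)
        unit.getHash(i, hashes.data() + i * hashLen);
}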
@@ -22,7 +22,6 @@ private:
cl::Buffer memoryBuffer, refsBuffer;
cl::Event start, end;
void *memory;
-std::uint32_t memorySize;
void precomputeRefs();
@@ -38,12 +37,17 @@ public:
std::uint32_t getMaxJobsPerBlock() const { return batchSize; }
std::uint32_t getBatchSize() const { return batchSize; }
void *getMemory() const { return memory; }
KernelRunner(const ProgramContext *programContext,
const Argon2Params *params, const Device *device,
std::uint32_t batchSize, bool bySegment, bool precompute);
+void *mapInputMemory(std::uint32_t jobId);
+void unmapInputMemory(void *memory);
+void *mapOutputMemory(std::uint32_t jobId);
+void unmapOutputMemory(void *memory);
void run(std::uint32_t lanesPerBlock, std::uint32_t jobsPerBlock);
float finish();
};
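The new map*/unmap* pairs exist because, with the interleaved layout, the host only touches two small windows per job. Rough sizes for comparison (ARGON2_BLOCK_SIZE is 1024 bytes; the m_cost figure below is just an example):

#include <cstddef>

constexpr std::size_t ARGON2_BLOCK_SIZE = 1024; // bytes

// First two blocks of every lane (host-written input):
constexpr std::size_t inputMapSize(std::size_t lanes)
{
    return lanes * 2 * ARGON2_BLOCK_SIZE;
}

// Last block of every lane (host-read output):
constexpr std::size_t outputMapSize(std::size_t lanes)
{
    return lanes * ARGON2_BLOCK_SIZE;
}

// e.g. lanes = 4, m_cost = 65536 blocks: the job's region is 64 MiB,
// but inputMapSize(4) = 8 KiB and outputMapSize(4) = 4 KiB.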
@@ -20,39 +20,6 @@ private:
std::uint32_t bestJobsPerBlock;
public:
-class PasswordWriter
-{
-private:
-const Argon2Params *params;
-Type type;
-Version version;
-std::uint8_t *dest;
-public:
-PasswordWriter(ProcessingUnit &parent, std::size_t index = 0);
-void moveForward(std::size_t offset);
-void moveBackwards(std::size_t offset);
-void setPassword(const void *pw, std::size_t pwSize) const;
-};
-class HashReader
-{
-private:
-const Argon2Params *params;
-const std::uint8_t *src;
-std::unique_ptr<uint8_t[]> buffer;
-public:
-HashReader(ProcessingUnit &parent, std::size_t index = 0);
-void moveForward(std::size_t offset);
-void moveBackwards(std::size_t offset);
-const void *getHash() const;
-};
std::size_t getBatchSize() const { return runner.getBatchSize(); }
ProcessingUnit(
@@ -60,6 +27,9 @@ public:
const Device *device, std::size_t batchSize,
bool bySegment = true, bool precomputeRefs = false);
+void setPassword(std::size_t index, const void *pw, std::size_t pwSize);
+void getHash(std::size_t index, void *hash);
void beginProcessing();
void endProcessing();
};
@@ -423,10 +423,10 @@ template<uint32_t version>
__device__ void argon2_core(
struct block_g *memory, struct block_g *mem_curr,
struct block_th *prev, struct block_th *tmp,
-struct u64_shuffle_buf *shuffle_buf, uint32_t lane_blocks,
+struct u64_shuffle_buf *shuffle_buf, uint32_t lanes,
uint32_t thread, uint32_t pass, uint32_t ref_index, uint32_t ref_lane)
{
-struct block_g *mem_ref = memory + ref_lane * lane_blocks + ref_index;
+struct block_g *mem_ref = memory + ref_index * lanes + ref_lane;
if (version != ARGON2_VERSION_10 && pass != 0) {
load_block(tmp, mem_curr, thread);
@@ -449,9 +449,8 @@ __device__ void argon2_step_precompute(
struct block_g *memory, struct block_g *mem_curr,
struct block_th *prev, struct block_th *tmp,
struct u64_shuffle_buf *shuffle_buf, const struct ref **refs,
-uint32_t lanes, uint32_t segment_blocks, uint32_t lane_blocks,
-uint32_t thread, uint32_t lane, uint32_t pass, uint32_t slice,
-uint32_t offset)
+uint32_t lanes, uint32_t segment_blocks, uint32_t thread,
+uint32_t lane, uint32_t pass, uint32_t slice, uint32_t offset)
{
uint32_t ref_index, ref_lane;
if (type == ARGON2_I || (type == ARGON2_ID && pass == 0 &&
@@ -468,7 +467,7 @@ __device__ void argon2_step_precompute(
&ref_lane, &ref_index);
}
-argon2_core<version>(memory, mem_curr, prev, tmp, shuffle_buf, lane_blocks,
+argon2_core<version>(memory, mem_curr, prev, tmp, shuffle_buf, lanes,
thread, pass, ref_index, ref_lane);
}
@@ -493,20 +492,20 @@ __global__ void argon2_kernel_segment_precompute(
struct block_th prev, tmp;
struct block_g *mem_segment =
-memory + lane * lane_blocks + slice * segment_blocks;
+memory + slice * segment_blocks * lanes + lane;
struct block_g *mem_prev, *mem_curr;
uint32_t start_offset = 0;
if (pass == 0) {
if (slice == 0) {
-mem_prev = mem_segment + 1;
-mem_curr = mem_segment + 2;
+mem_prev = mem_segment + 1 * lanes;
+mem_curr = mem_segment + 2 * lanes;
start_offset = 2;
} else {
-mem_prev = mem_segment - 1;
+mem_prev = mem_segment - lanes;
mem_curr = mem_segment;
}
} else {
-mem_prev = mem_segment + (slice == 0 ? lane_blocks : 0) - 1;
+mem_prev = mem_segment + (slice == 0 ? lane_blocks * lanes : 0) - lanes;
mem_curr = mem_segment;
}
@@ -525,10 +524,9 @@ __global__ void argon2_kernel_segment_precompute(
for (uint32_t offset = start_offset; offset < segment_blocks; ++offset) {
argon2_step_precompute<type, version>(
memory, mem_curr, &prev, &tmp, shuffle_buf, &refs, lanes,
-segment_blocks, lane_blocks, thread,
-lane, pass, slice, offset);
+segment_blocks, thread, lane, pass, slice, offset);
-++mem_curr;
+mem_curr += lanes;
}
}
@@ -551,9 +549,9 @@ __global__ void argon2_kernel_oneshot_precompute(
struct block_th prev, tmp;
-struct block_g *mem_lane = memory + lane * lane_blocks;
-struct block_g *mem_prev = mem_lane + 1;
-struct block_g *mem_curr = mem_lane + 2;
+struct block_g *mem_lane = memory + lane;
+struct block_g *mem_prev = mem_lane + 1 * lanes;
+struct block_g *mem_curr = mem_lane + 2 * lanes;
load_block(&prev, mem_prev, thread);
@@ -574,10 +572,10 @@ __global__ void argon2_kernel_oneshot_precompute(
argon2_step_precompute<type, version>(
memory, mem_curr, &prev, &tmp, shuffle_buf, &refs,
-lanes, segment_blocks, lane_blocks, thread, lane,
-pass, slice, offset);
+lanes, segment_blocks, thread,
+lane, pass, slice, offset);
-++mem_curr;
+mem_curr += lanes;
}
__syncthreads();
@@ -591,9 +589,8 @@ template<uint32_t type, uint32_t version>
__device__ void argon2_step(
struct block_g *memory, struct block_g *mem_curr,
struct block_th *prev, struct block_th *tmp, struct block_th *addr,
-struct u64_shuffle_buf *shuffle_buf,
-uint32_t lanes, uint32_t segment_blocks, uint32_t lane_blocks,
-uint32_t thread, uint32_t *thread_input,
+struct u64_shuffle_buf *shuffle_buf, uint32_t lanes,
+uint32_t segment_blocks, uint32_t thread, uint32_t *thread_input,
uint32_t lane, uint32_t pass, uint32_t slice, uint32_t offset)
{
uint32_t ref_index, ref_lane;
@@ -624,7 +621,7 @@ __device__ void argon2_step(
compute_ref_pos(lanes, segment_blocks, pass, lane, slice, offset,
&ref_lane, &ref_index);
-argon2_core<version>(memory, mem_curr, prev, tmp, shuffle_buf, lane_blocks,
+argon2_core<version>(memory, mem_curr, prev, tmp, shuffle_buf, lanes,
thread, pass, ref_index, ref_lane);
}
@@ -682,20 +679,20 @@ __global__ void argon2_kernel_segment(
}
struct block_g *mem_segment =
-memory + lane * lane_blocks + slice * segment_blocks;
+memory + slice * segment_blocks * lanes + lane;
struct block_g *mem_prev, *mem_curr;
uint32_t start_offset = 0;
if (pass == 0) {
if (slice == 0) {
-mem_prev = mem_segment + 1;
-mem_curr = mem_segment + 2;
+mem_prev = mem_segment + 1 * lanes;
+mem_curr = mem_segment + 2 * lanes;
start_offset = 2;
} else {
-mem_prev = mem_segment - 1;
+mem_prev = mem_segment - lanes;
mem_curr = mem_segment;
}
} else {
-mem_prev = mem_segment + (slice == 0 ? lane_blocks : 0) - 1;
+mem_prev = mem_segment + (slice == 0 ? lane_blocks * lanes : 0) - lanes;
mem_curr = mem_segment;
}
@@ -704,11 +701,10 @@ __global__ void argon2_kernel_segment(
for (uint32_t offset = start_offset; offset < segment_blocks; ++offset) {
argon2_step<type, version>(
memory, mem_curr, &prev, &tmp, &addr, shuffle_buf,
-lanes, segment_blocks, lane_blocks,
-thread, &thread_input,
+lanes, segment_blocks, thread, &thread_input,
lane, pass, slice, offset);
-++mem_curr;
+mem_curr += lanes;
}
}
@@ -759,9 +755,9 @@ __global__ void argon2_kernel_oneshot(
}
}
-struct block_g *mem_lane = memory + lane * lane_blocks;
-struct block_g *mem_prev = mem_lane + 1;
-struct block_g *mem_curr = mem_lane + 2;
+struct block_g *mem_lane = memory + lane;
+struct block_g *mem_prev = mem_lane + 1 * lanes;
+struct block_g *mem_curr = mem_lane + 2 * lanes;
load_block(&prev, mem_prev, thread);
@@ -776,11 +772,10 @@ __global__ void argon2_kernel_oneshot(
argon2_step<type, version>(
memory, mem_curr, &prev, &tmp, &addr, shuffle_buf,
-lanes, segment_blocks, lane_blocks,
-thread, &thread_input,
+lanes, segment_blocks, thread, &thread_input,
lane, pass, slice, offset);
-++mem_curr;
+mem_curr += lanes;
}
__syncthreads();
@@ -31,14 +31,9 @@ ProcessingUnit::ProcessingUnit(
CudaException::check(cudaSetDevice(device->getDeviceIndex()));
}
-auto memory = static_cast<std::uint8_t *>(runner.getMemory());
-/* pre-fill first blocks with pseudo-random data: */
for (std::size_t i = 0; i < batchSize; i++) {
-params->fillFirstBlocks(memory, NULL, 0,
-programContext->getArgon2Type(),
-programContext->getArgon2Version());
-memory += params->getMemorySize();
+setPassword(i, NULL, 0);
}
if (runner.getMaxLanesPerBlock() > runner.getMinLanesPerBlock()) {
@@ -118,55 +113,22 @@ ProcessingUnit::ProcessingUnit(
}
}
-ProcessingUnit::PasswordWriter::PasswordWriter(
-ProcessingUnit &parent, std::size_t index)
-: params(parent.params),
-type(parent.programContext->getArgon2Type()),
-version(parent.programContext->getArgon2Version()),
-dest(static_cast<std::uint8_t *>(parent.runner.getMemory()))
-{
-dest += index * params->getMemorySize();
-}
-void ProcessingUnit::PasswordWriter::moveForward(std::size_t offset)
-{
-dest += offset * params->getMemorySize();
-}
-void ProcessingUnit::PasswordWriter::moveBackwards(std::size_t offset)
-{
-dest -= offset * params->getMemorySize();
-}
-void ProcessingUnit::PasswordWriter::setPassword(
-const void *pw, std::size_t pwSize) const
+void ProcessingUnit::setPassword(std::size_t index, const void *pw,
+std::size_t pwSize)
{
-params->fillFirstBlocks(dest, pw, pwSize, type, version);
-}
-ProcessingUnit::HashReader::HashReader(
-ProcessingUnit &parent, std::size_t index)
-: params(parent.params),
-src(static_cast<const std::uint8_t *>(parent.runner.getMemory())),
-buffer(new std::uint8_t[params->getOutputLength()])
-{
-src += index * params->getMemorySize();
-}
-void ProcessingUnit::HashReader::moveForward(std::size_t offset)
-{
-src += offset * params->getMemorySize();
-}
-void ProcessingUnit::HashReader::moveBackwards(std::size_t offset)
-{
-src -= offset * params->getMemorySize();
+auto memory = static_cast<std::uint8_t *>(runner.getMemory());
+memory += index * params->getMemorySize();
+params->fillFirstBlocks(memory, pw, pwSize,
+programContext->getArgon2Type(),
+programContext->getArgon2Version());
}
-const void *ProcessingUnit::HashReader::getHash() const
+void ProcessingUnit::getHash(std::size_t index, void *hash)
{
-params->finalize(buffer.get(), src);
-return buffer.get();
+auto memory = static_cast<std::uint8_t *>(runner.getMemory());
+memory += (index + 1) * params->getMemorySize();
+memory -= params->getLanes() * ARGON2_BLOCK_SIZE;
+params->finalize(hash, memory);
}
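getHash works because, in the interleaved layout, the final block of every lane occupies the last `lanes` slots of the job's region. The offset arithmetic, spelled out (mirrors the pointer math above; the sizes are example figures):

#include <cstddef>

constexpr std::size_t ARGON2_BLOCK_SIZE = 1024; // bytes

// Byte offset of the `lanes` final blocks of job `index`:
std::size_t lastBlocksOffset(std::size_t index, std::size_t memorySize,
                             std::size_t lanes)
{
    return (index + 1) * memorySize - lanes * ARGON2_BLOCK_SIZE;
}
// e.g. index = 0, memorySize = 64 MiB, lanes = 4:
// the final 4 KiB of job 0's region.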
void ProcessingUnit::beginProcessing()
@@ -119,43 +119,45 @@ void Argon2Params::fillFirstBlocks(
#endif
auto bmemory = static_cast<std::uint8_t *>(memory);
-for (std::uint32_t l = 0; l < lanes; l++) {
-auto block_start = bmemory;
+store32(initHash + ARGON2_PREHASH_DIGEST_LENGTH, 0);
+for (std::uint32_t l = 0; l < lanes; l++) {
store32(initHash + ARGON2_PREHASH_DIGEST_LENGTH + 4, l);
-digestLong(block_start, ARGON2_BLOCK_SIZE, initHash, sizeof(initHash));
+digestLong(bmemory, ARGON2_BLOCK_SIZE, initHash, sizeof(initHash));
#ifdef DEBUG
std::fprintf(stderr, "Initial block 0 for lane %u: {\n", (unsigned)l);
for (std::size_t i = 0; i < ARGON2_BLOCK_SIZE / 8; i++) {
std::fprintf(stderr, " 0x");
for (std::size_t k = 0; k < 8; k++) {
std::fprintf(stderr, "%02x", (unsigned)block_start[i * 8 + 7 - k]);
std::fprintf(stderr, "%02x", (unsigned)bmemory[i * 8 + 7 - k]);
}
std::fprintf(stderr, "UL,\n");
}
std::fprintf(stderr, "}\n");
#endif
-block_start += ARGON2_BLOCK_SIZE;
+bmemory += ARGON2_BLOCK_SIZE;
+}
store32(initHash + ARGON2_PREHASH_DIGEST_LENGTH, 1);
-digestLong(block_start, ARGON2_BLOCK_SIZE, initHash, sizeof(initHash));
+for (std::uint32_t l = 0; l < lanes; l++) {
+store32(initHash + ARGON2_PREHASH_DIGEST_LENGTH + 4, l);
+digestLong(bmemory, ARGON2_BLOCK_SIZE, initHash, sizeof(initHash));
#ifdef DEBUG
std::fprintf(stderr, "Initial block 1 for lane %u: {\n", (unsigned)l);
for (std::size_t i = 0; i < ARGON2_BLOCK_SIZE / 8; i++) {
std::fprintf(stderr, " 0x");
for (std::size_t k = 0; k < 8; k++) {
std::fprintf(stderr, "%02x", (unsigned)block_start[i * 8 + 7 - k]);
std::fprintf(stderr, "%02x", (unsigned)bmemory[i * 8 + 7 - k]);
}
std::fprintf(stderr, "UL,\n");
}
std::fprintf(stderr, "}\n");
#endif
-bmemory += ARGON2_BLOCK_SIZE * getLaneBlocks();
+bmemory += ARGON2_BLOCK_SIZE;
}
}
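fillFirstBlocks now emits block 0 of every lane, then block 1 of every lane, so the host-written prefix of each job's region is one contiguous run of 2 * lanes blocks. The resulting write order, sketched (digest calls elided; illustration only):

#include <cstdint>
#include <cstdio>

// Illustration of the new write order:
void fillOrderSketch(std::uint32_t lanes)
{
    for (std::uint32_t block = 0; block < 2; block++)   // block 0, then 1
        for (std::uint32_t l = 0; l < lanes; l++)       // across all lanes
            std::printf("B(lane=%u, block=%u)\n", (unsigned)l,
                        (unsigned)block);
}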
@@ -169,21 +171,18 @@ void Argon2Params::finalize(void *out, const void *memory) const
auto cursor = static_cast<const block *>(memory);
#ifdef DEBUG
-for (std::size_t i = 0; i < getMemoryBlocks(); i++) {
+for (std::size_t l = 0; l < getLanes(); l++) {
for (std::size_t k = 0; k < ARGON2_BLOCK_SIZE / 8; k++) {
std::fprintf(stderr, "Block %04u [%3u]: %016llx\n",
-(unsigned)i, (unsigned)k,
-(unsigned long long)cursor[i].v[k]);
+(unsigned)l, (unsigned)k,
+(unsigned long long)cursor[l].v[k]);
}
}
#endif
cursor = static_cast<const block *>(memory);
-cursor += getLaneBlocks() - 1;
block xored = *cursor;
for (std::uint32_t l = 1; l < lanes; l++) {
-cursor += getLaneBlocks();
+++cursor;
for (std::size_t i = 0; i < ARGON2_BLOCK_SIZE / 8; i++) {
xored.v[i] ^= cursor->v[i];
}
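finalize() now receives just the mapped output region, where the lanes' final blocks sit side by side, so the XOR across lanes walks consecutive blocks. A simplified standalone sketch (raw 64-bit words instead of the repository's block struct):

#include <cstddef>
#include <cstdint>

constexpr std::size_t QWORDS = 1024 / 8; // ARGON2_BLOCK_SIZE / 8

// XOR the final block of every lane; `last` points at `lanes`
// consecutive blocks (the mapped output region).
void xorLastBlocks(const std::uint64_t (*last)[QWORDS], std::size_t lanes,
                   std::uint64_t out[QWORDS])
{
    for (std::size_t i = 0; i < QWORDS; i++)
        out[i] = last[0][i];
    for (std::size_t l = 1; l < lanes; l++)
        for (std::size_t i = 0; i < QWORDS; i++)
            out[i] ^= last[l][i];
}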
@@ -31,9 +31,6 @@ KernelRunner::KernelRunner(const ProgramContext *programContext,
CL_QUEUE_PROFILING_ENABLE);
memoryBuffer = cl::Buffer(context, CL_MEM_READ_WRITE, memorySize);
-memory = queue.enqueueMapBuffer(memoryBuffer, true, CL_MAP_WRITE, 0,
-memorySize);
Type type = programContext->getArgon2Type();
if ((type == ARGON2_I || type == ARGON2_ID) && precompute) {
uint32_t segments =
@@ -107,6 +104,33 @@ void KernelRunner::precomputeRefs()
queue.finish();
}
+void *KernelRunner::mapInputMemory(std::uint32_t jobId)
+{
+std::size_t memorySize = params->getMemorySize();
+std::size_t mappedSize = params->getLanes() * 2 * ARGON2_BLOCK_SIZE;
+return queue.enqueueMapBuffer(memoryBuffer, true, CL_MAP_WRITE,
+memorySize * jobId, mappedSize);
+}
+void KernelRunner::unmapInputMemory(void *memory)
+{
+queue.enqueueUnmapMemObject(memoryBuffer, memory);
+}
+void *KernelRunner::mapOutputMemory(std::uint32_t jobId)
+{
+std::size_t memorySize = params->getMemorySize();
+std::size_t mappedSize = params->getLanes() * ARGON2_BLOCK_SIZE;
+std::size_t mappedOffset = memorySize * (jobId + 1) - mappedSize;
+return queue.enqueueMapBuffer(memoryBuffer, true, CL_MAP_WRITE,
+mappedOffset, mappedSize);
+}
+void KernelRunner::unmapOutputMemory(void *memory)
+{
+queue.enqueueUnmapMemObject(memoryBuffer, memory);
+}
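Host-side flow with the new helpers, sketched (assumes the argon2-gpu types from the headers above; batching and error handling omitted; not code from this commit):

#include <cstddef>
#include <cstdint>

// Write one job's first blocks, then read back its last blocks.
void processJob(KernelRunner &runner, const Argon2Params *params,
                Type type, Version version, std::uint32_t jobId,
                const void *pw, std::size_t pwSize, void *hash)
{
    void *in = runner.mapInputMemory(jobId);   // first 2 * lanes blocks
    params->fillFirstBlocks(in, pw, pwSize, type, version);
    runner.unmapInputMemory(in);

    // runner.run(...) and runner.finish() execute the kernels in between.

    void *out = runner.mapOutputMemory(jobId); // last `lanes` blocks
    params->finalize(hash, out);
    runner.unmapOutputMemory(out);
}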
void KernelRunner::run(std::uint32_t lanesPerBlock, std::uint32_t jobsPerBlock)
{
std::uint32_t lanes = params->getLanes();
@@ -129,18 +153,15 @@ void KernelRunner::run(std::uint32_t lanesPerBlock, std::uint32_t jobsPerBlock)
cl::NDRange globalRange { THREADS_PER_LANE * lanes, batchSize };
cl::NDRange localRange { THREADS_PER_LANE * lanesPerBlock, jobsPerBlock };
-// FIXME: map only necessary parts of memory buffer
-queue.enqueueUnmapMemObject(memoryBuffer, memory, nullptr, &start);
+queue.enqueueMarker(&start);
-try {
std::uint32_t shmemSize =
THREADS_PER_LANE * lanesPerBlock * jobsPerBlock *
sizeof(cl_uint) * 2;
kernel.setArg<cl::LocalSpaceArg>(0, { shmemSize });
if (bySegment) {
for (std::uint32_t pass = 0; pass < passes; pass++) {
-for (std::uint32_t slice = 0; slice < ARGON2_SYNC_POINTS;
-slice++) {
+for (std::uint32_t slice = 0; slice < ARGON2_SYNC_POINTS; slice++) {
kernel.setArg<cl_uint>(precompute ? 6 : 5, pass);
kernel.setArg<cl_uint>(precompute ? 7 : 6, slice);
queue.enqueueNDRangeKernel(kernel, cl::NullRange,
@@ -151,24 +172,16 @@ void KernelRunner::run(std::uint32_t lanesPerBlock, std::uint32_t jobsPerBlock)
queue.enqueueNDRangeKernel(kernel, cl::NullRange,
globalRange, localRange);
}
-} catch (const cl::Error &err) {
-memory = queue.enqueueMapBuffer(
-memoryBuffer, true, CL_MAP_READ | CL_MAP_WRITE,
-0, memorySize);
-throw err;
-}
-memory = queue.enqueueMapBuffer(
-memoryBuffer, false, CL_MAP_READ | CL_MAP_WRITE,