Commit dde1302f authored by Ondrej Mosnáček

Use more efficient OpenCL memory mapping

This commit changes the memory layout to be lane-interleaved, so that
we can map only the necessary parts of GPU memory with OpenCL.
Previously we mapped the whole memory buffer, which was slow as hell...
parent 03e96e0d
...@@ -346,11 +346,11 @@ void compute_ref_pos(uint lanes, uint segment_blocks, ...@@ -346,11 +346,11 @@ void compute_ref_pos(uint lanes, uint segment_blocks,
void argon2_core( void argon2_core(
__global struct block_g *memory, __global struct block_g *mem_curr, __global struct block_g *memory, __global struct block_g *mem_curr,
struct block_th *prev, struct block_th *tmp, struct block_th *prev, struct block_th *tmp,
__local struct u64_shuffle_buf *shuffle_buf, uint lane_blocks, __local struct u64_shuffle_buf *shuffle_buf, uint lanes,
uint thread, uint pass, uint ref_index, uint ref_lane) uint thread, uint pass, uint ref_index, uint ref_lane)
{ {
__global struct block_g *mem_ref; __global struct block_g *mem_ref;
mem_ref = memory + ref_lane * lane_blocks + ref_index; mem_ref = memory + ref_index * lanes + ref_lane;
#if ARGON2_VERSION == ARGON2_VERSION_10 #if ARGON2_VERSION == ARGON2_VERSION_10
load_block_xor(prev, mem_ref, thread); load_block_xor(prev, mem_ref, thread);
...@@ -486,9 +486,8 @@ void argon2_step_precompute( ...@@ -486,9 +486,8 @@ void argon2_step_precompute(
struct block_th *prev, struct block_th *tmp, struct block_th *prev, struct block_th *tmp,
__local struct u64_shuffle_buf *shuffle_buf, __local struct u64_shuffle_buf *shuffle_buf,
__global const struct ref **refs, __global const struct ref **refs,
uint lanes, uint segment_blocks, uint lane_blocks, uint lanes, uint segment_blocks, uint thread,
uint thread, uint lane, uint pass, uint slice, uint lane, uint pass, uint slice, uint offset)
uint offset)
{ {
uint ref_index, ref_lane; uint ref_index, ref_lane;
bool data_independent; bool data_independent;
...@@ -512,8 +511,8 @@ void argon2_step_precompute( ...@@ -512,8 +511,8 @@ void argon2_step_precompute(
&ref_lane, &ref_index); &ref_lane, &ref_index);
} }
argon2_core(memory, mem_curr, prev, tmp, shuffle_buf, lane_blocks, argon2_core(memory, mem_curr, prev, tmp, shuffle_buf, lanes, thread, pass,
thread, pass, ref_index, ref_lane); ref_index, ref_lane);
} }
__kernel void argon2_kernel_segment_precompute( __kernel void argon2_kernel_segment_precompute(
...@@ -537,20 +536,20 @@ __kernel void argon2_kernel_segment_precompute( ...@@ -537,20 +536,20 @@ __kernel void argon2_kernel_segment_precompute(
struct block_th prev, tmp; struct block_th prev, tmp;
__global struct block_g *mem_segment = __global struct block_g *mem_segment =
memory + lane * lane_blocks + slice * segment_blocks; memory + slice * segment_blocks * lanes + lane;
__global struct block_g *mem_prev, *mem_curr; __global struct block_g *mem_prev, *mem_curr;
uint start_offset = 0; uint start_offset = 0;
if (pass == 0) { if (pass == 0) {
if (slice == 0) { if (slice == 0) {
mem_prev = mem_segment + 1; mem_prev = mem_segment + 1 * lanes;
mem_curr = mem_segment + 2; mem_curr = mem_segment + 2 * lanes;
start_offset = 2; start_offset = 2;
} else { } else {
mem_prev = mem_segment - 1; mem_prev = mem_segment - lanes;
mem_curr = mem_segment; mem_curr = mem_segment;
} }
} else { } else {
mem_prev = mem_segment + (slice == 0 ? lane_blocks : 0) - 1; mem_prev = mem_segment + (slice == 0 ? lane_blocks * lanes : 0) - lanes;
mem_curr = mem_segment; mem_curr = mem_segment;
} }
...@@ -569,10 +568,9 @@ __kernel void argon2_kernel_segment_precompute( ...@@ -569,10 +568,9 @@ __kernel void argon2_kernel_segment_precompute(
for (uint offset = start_offset; offset < segment_blocks; ++offset) { for (uint offset = start_offset; offset < segment_blocks; ++offset) {
argon2_step_precompute( argon2_step_precompute(
memory, mem_curr, &prev, &tmp, shuffle_buf, &refs, lanes, memory, mem_curr, &prev, &tmp, shuffle_buf, &refs, lanes,
segment_blocks, lane_blocks, thread, segment_blocks, thread, lane, pass, slice, offset);
lane, pass, slice, offset);
++mem_curr; mem_curr += lanes;
} }
} }
...@@ -595,9 +593,9 @@ __kernel void argon2_kernel_oneshot_precompute( ...@@ -595,9 +593,9 @@ __kernel void argon2_kernel_oneshot_precompute(
struct block_th prev, tmp; struct block_th prev, tmp;
__global struct block_g *mem_lane = memory + lane * lane_blocks; __global struct block_g *mem_lane = memory + lane;
__global struct block_g *mem_prev = mem_lane + 1; __global struct block_g *mem_prev = mem_lane + 1 * lanes;
__global struct block_g *mem_curr = mem_lane + 2; __global struct block_g *mem_curr = mem_lane + 2 * lanes;
load_block(&prev, mem_prev, thread); load_block(&prev, mem_prev, thread);
...@@ -618,10 +616,10 @@ __kernel void argon2_kernel_oneshot_precompute( ...@@ -618,10 +616,10 @@ __kernel void argon2_kernel_oneshot_precompute(
argon2_step_precompute( argon2_step_precompute(
memory, mem_curr, &prev, &tmp, shuffle_buf, &refs, memory, mem_curr, &prev, &tmp, shuffle_buf, &refs,
lanes, segment_blocks, lane_blocks, thread, lane, lanes, segment_blocks, thread,
pass, slice, offset); lane, pass, slice, offset);
++mem_curr; mem_curr += lanes;
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
...@@ -636,8 +634,7 @@ void argon2_step( ...@@ -636,8 +634,7 @@ void argon2_step(
__global struct block_g *memory, __global struct block_g *mem_curr, __global struct block_g *memory, __global struct block_g *mem_curr,
struct block_th *prev, struct block_th *tmp, struct block_th *addr, struct block_th *prev, struct block_th *tmp, struct block_th *addr,
__local struct u64_shuffle_buf *shuffle_buf, __local struct u64_shuffle_buf *shuffle_buf,
uint lanes, uint segment_blocks, uint lane_blocks, uint lanes, uint segment_blocks, uint thread, uint *thread_input,
uint thread, uint *thread_input,
uint lane, uint pass, uint slice, uint offset) uint lane, uint pass, uint slice, uint offset)
{ {
uint ref_index, ref_lane; uint ref_index, ref_lane;
...@@ -674,8 +671,8 @@ void argon2_step( ...@@ -674,8 +671,8 @@ void argon2_step(
compute_ref_pos(lanes, segment_blocks, pass, lane, slice, offset, compute_ref_pos(lanes, segment_blocks, pass, lane, slice, offset,
&ref_lane, &ref_index); &ref_lane, &ref_index);
argon2_core(memory, mem_curr, prev, tmp, shuffle_buf, lane_blocks, argon2_core(memory, mem_curr, prev, tmp, shuffle_buf, lanes, thread, pass,
thread, pass, ref_index, ref_lane); ref_index, ref_lane);
} }
__kernel void argon2_kernel_segment( __kernel void argon2_kernel_segment(
...@@ -732,20 +729,20 @@ __kernel void argon2_kernel_segment( ...@@ -732,20 +729,20 @@ __kernel void argon2_kernel_segment(
#endif #endif
__global struct block_g *mem_segment = __global struct block_g *mem_segment =
memory + lane * lane_blocks + slice * segment_blocks; memory + slice * segment_blocks * lanes + lane;
__global struct block_g *mem_prev, *mem_curr; __global struct block_g *mem_prev, *mem_curr;
uint start_offset = 0; uint start_offset = 0;
if (pass == 0) { if (pass == 0) {
if (slice == 0) { if (slice == 0) {
mem_prev = mem_segment + 1; mem_prev = mem_segment + 1 * lanes;
mem_curr = mem_segment + 2; mem_curr = mem_segment + 2 * lanes;
start_offset = 2; start_offset = 2;
} else { } else {
mem_prev = mem_segment - 1; mem_prev = mem_segment - lanes;
mem_curr = mem_segment; mem_curr = mem_segment;
} }
} else { } else {
mem_prev = mem_segment + (slice == 0 ? lane_blocks : 0) - 1; mem_prev = mem_segment + (slice == 0 ? lane_blocks * lanes : 0) - lanes;
mem_curr = mem_segment; mem_curr = mem_segment;
} }
...@@ -753,11 +750,10 @@ __kernel void argon2_kernel_segment( ...@@ -753,11 +750,10 @@ __kernel void argon2_kernel_segment(
for (uint offset = start_offset; offset < segment_blocks; ++offset) { for (uint offset = start_offset; offset < segment_blocks; ++offset) {
argon2_step(memory, mem_curr, &prev, &tmp, &addr, shuffle_buf, argon2_step(memory, mem_curr, &prev, &tmp, &addr, shuffle_buf,
lanes, segment_blocks, lane_blocks, lanes, segment_blocks, thread, &thread_input,
thread, &thread_input,
lane, pass, slice, offset); lane, pass, slice, offset);
++mem_curr; mem_curr += lanes;
} }
} }
...@@ -808,9 +804,9 @@ __kernel void argon2_kernel_oneshot( ...@@ -808,9 +804,9 @@ __kernel void argon2_kernel_oneshot(
} }
#endif #endif
__global struct block_g *mem_lane = memory + lane * lane_blocks; __global struct block_g *mem_lane = memory + lane;
__global struct block_g *mem_prev = mem_lane + 1; __global struct block_g *mem_prev = mem_lane + 1 * lanes;
__global struct block_g *mem_curr = mem_lane + 2; __global struct block_g *mem_curr = mem_lane + 2 * lanes;
load_block(&prev, mem_prev, thread); load_block(&prev, mem_prev, thread);
...@@ -824,11 +820,10 @@ __kernel void argon2_kernel_oneshot( ...@@ -824,11 +820,10 @@ __kernel void argon2_kernel_oneshot(
} }
argon2_step(memory, mem_curr, &prev, &tmp, &addr, shuffle_buf, argon2_step(memory, mem_curr, &prev, &tmp, &addr, shuffle_buf,
lanes, segment_blocks, lane_blocks, lanes, segment_blocks, thread, &thread_input,
thread, &thread_input,
lane, pass, slice, offset); lane, pass, slice, offset);
++mem_curr; mem_curr += lanes;
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
......
...@@ -22,39 +22,6 @@ private: ...@@ -22,39 +22,6 @@ private:
uint32_t bestJobsPerBlock; uint32_t bestJobsPerBlock;
public: public:
/*
 * Cursor-like helper for writing password input for individual jobs of the
 * batch handled by the parent ProcessingUnit.
 *
 * NOTE(review): behavioural notes below are hedged where the out-of-line
 * definitions are not part of this header -- confirm against the
 * implementation file of this backend.
 */
class PasswordWriter
{
private:
/* Argon2 parameters taken from the parent unit. */
const Argon2Params *params;
/* Argon2 variant and version; presumably forwarded when the first blocks
 * are filled -- confirm against the definition of setPassword(). */
Type type;
Version version;
/* Current write position (a byte pointer). */
std::uint8_t *dest;
public:
/* Creates a writer attached to `parent`, initially positioned at job
 * `index` (defaults to the first job). */
PasswordWriter(ProcessingUnit &parent, std::size_t index = 0);
/* Moves the write position `offset` jobs forward. No bounds checking is
 * visible in this declaration -- presumably the caller must stay within
 * the batch. */
void moveForward(std::size_t offset);
/* Moves the write position `offset` jobs backwards; counterpart of
 * moveForward(). */
void moveBackwards(std::size_t offset);
/* Sets the password (`pwSize` bytes at `pw`) for the job at the current
 * position. Declared const: the cursor itself is not moved. */
void setPassword(const void *pw, std::size_t pwSize) const;
};
/*
 * Cursor-like helper for reading the resulting hash of individual jobs of
 * the batch handled by the parent ProcessingUnit.
 *
 * The class is move-only because it holds a std::unique_ptr member.
 *
 * NOTE(review): behavioural notes below are hedged where the out-of-line
 * definitions are not part of this header -- confirm against the
 * implementation file of this backend.
 */
class HashReader
{
private:
    /* Argon2 parameters taken from the parent unit. */
    const Argon2Params *params;
    /* Current read position (a byte pointer). */
    const std::uint8_t *src;
    /* Scratch storage owned by the reader; presumably holds the finalized
     * hash returned by getHash() -- TODO confirm. The element type is
     * spelled std::uint8_t to match `src`; only the std:: name is
     * guaranteed by <cstdint>. */
    std::unique_ptr<std::uint8_t[]> buffer;

public:
    /* Creates a reader attached to `parent`, initially positioned at job
     * `index` (defaults to the first job). */
    HashReader(ProcessingUnit &parent, std::size_t index = 0);

    /* Moves the read position `offset` jobs forward. No bounds checking is
     * visible in this declaration -- presumably the caller must stay
     * within the batch. */
    void moveForward(std::size_t offset);
    /* Moves the read position `offset` jobs backwards; counterpart of
     * moveForward(). */
    void moveBackwards(std::size_t offset);

    /* Returns a pointer to the hash of the job at the current position.
     * NOTE(review): the pointer presumably refers to `buffer` and is
     * therefore only valid while this reader is alive and until the next
     * getHash() call -- confirm against the definition. */
    const void *getHash() const;
};
std::size_t getBatchSize() const { return runner.getBatchSize(); } std::size_t getBatchSize() const { return runner.getBatchSize(); }
ProcessingUnit( ProcessingUnit(
...@@ -62,6 +29,9 @@ public: ...@@ -62,6 +29,9 @@ public:
const Device *device, std::size_t batchSize, const Device *device, std::size_t batchSize,
bool bySegment = true, bool precomputeRefs = false); bool bySegment = true, bool precomputeRefs = false);
void setPassword(std::size_t index, const void *pw, std::size_t pwSize);
void getHash(std::size_t index, void *hash);
void beginProcessing(); void beginProcessing();
void endProcessing(); void endProcessing();
}; };
......
...@@ -22,7 +22,6 @@ private: ...@@ -22,7 +22,6 @@ private:
cl::Buffer memoryBuffer, refsBuffer; cl::Buffer memoryBuffer, refsBuffer;
cl::Event start, end; cl::Event start, end;
void *memory;
std::uint32_t memorySize; std::uint32_t memorySize;
void precomputeRefs(); void precomputeRefs();
...@@ -38,12 +37,17 @@ public: ...@@ -38,12 +37,17 @@ public:
std::uint32_t getMaxJobsPerBlock() const { return batchSize; } std::uint32_t getMaxJobsPerBlock() const { return batchSize; }
std::uint32_t getBatchSize() const { return batchSize; } std::uint32_t getBatchSize() const { return batchSize; }
void *getMemory() const { return memory; }
KernelRunner(const ProgramContext *programContext, KernelRunner(const ProgramContext *programContext,
const Argon2Params *params, const Device *device, const Argon2Params *params, const Device *device,
std::uint32_t batchSize, bool bySegment, bool precompute); std::uint32_t batchSize, bool bySegment, bool precompute);
void *mapInputMemory(std::uint32_t jobId);
void unmapInputMemory(void *memory);
void *mapOutputMemory(std::uint32_t jobId);
void unmapOutputMemory(void *memory);
void run(std::uint32_t lanesPerBlock, std::uint32_t jobsPerBlock); void run(std::uint32_t lanesPerBlock, std::uint32_t jobsPerBlock);
float finish(); float finish();
}; };
......
...@@ -20,39 +20,6 @@ private: ...@@ -20,39 +20,6 @@ private:
std::uint32_t bestJobsPerBlock; std::uint32_t bestJobsPerBlock;
public: public:
/*
 * Cursor-like helper for writing password input for individual jobs of the
 * batch handled by the parent ProcessingUnit.
 *
 * NOTE(review): behavioural notes below are hedged where the out-of-line
 * definitions are not part of this header -- confirm against the
 * implementation file of this backend.
 */
class PasswordWriter
{
private:
/* Argon2 parameters taken from the parent unit. */
const Argon2Params *params;
/* Argon2 variant and version; presumably forwarded when the first blocks
 * are filled -- confirm against the definition of setPassword(). */
Type type;
Version version;
/* Current write position (a byte pointer). */
std::uint8_t *dest;
public:
/* Creates a writer attached to `parent`, initially positioned at job
 * `index` (defaults to the first job). */
PasswordWriter(ProcessingUnit &parent, std::size_t index = 0);
/* Moves the write position `offset` jobs forward. No bounds checking is
 * visible in this declaration -- presumably the caller must stay within
 * the batch. */
void moveForward(std::size_t offset);
/* Moves the write position `offset` jobs backwards; counterpart of
 * moveForward(). */
void moveBackwards(std::size_t offset);
/* Sets the password (`pwSize` bytes at `pw`) for the job at the current
 * position. Declared const: the cursor itself is not moved. */
void setPassword(const void *pw, std::size_t pwSize) const;
};
/*
 * Cursor-like helper for reading the resulting hash of individual jobs of
 * the batch handled by the parent ProcessingUnit.
 *
 * The class is move-only because it holds a std::unique_ptr member.
 *
 * NOTE(review): behavioural notes below are hedged where the out-of-line
 * definitions are not part of this header -- confirm against the
 * implementation file of this backend.
 */
class HashReader
{
private:
    /* Argon2 parameters taken from the parent unit. */
    const Argon2Params *params;
    /* Current read position (a byte pointer). */
    const std::uint8_t *src;
    /* Scratch storage owned by the reader; presumably holds the finalized
     * hash returned by getHash() -- TODO confirm. The element type is
     * spelled std::uint8_t to match `src`; only the std:: name is
     * guaranteed by <cstdint>. */
    std::unique_ptr<std::uint8_t[]> buffer;

public:
    /* Creates a reader attached to `parent`, initially positioned at job
     * `index` (defaults to the first job). */
    HashReader(ProcessingUnit &parent, std::size_t index = 0);

    /* Moves the read position `offset` jobs forward. No bounds checking is
     * visible in this declaration -- presumably the caller must stay
     * within the batch. */
    void moveForward(std::size_t offset);
    /* Moves the read position `offset` jobs backwards; counterpart of
     * moveForward(). */
    void moveBackwards(std::size_t offset);

    /* Returns a pointer to the hash of the job at the current position.
     * NOTE(review): the pointer presumably refers to `buffer` and is
     * therefore only valid while this reader is alive and until the next
     * getHash() call -- confirm against the definition. */
    const void *getHash() const;
};
std::size_t getBatchSize() const { return runner.getBatchSize(); } std::size_t getBatchSize() const { return runner.getBatchSize(); }
ProcessingUnit( ProcessingUnit(
...@@ -60,6 +27,9 @@ public: ...@@ -60,6 +27,9 @@ public:
const Device *device, std::size_t batchSize, const Device *device, std::size_t batchSize,
bool bySegment = true, bool precomputeRefs = false); bool bySegment = true, bool precomputeRefs = false);
void setPassword(std::size_t index, const void *pw, std::size_t pwSize);
void getHash(std::size_t index, void *hash);
void beginProcessing(); void beginProcessing();
void endProcessing(); void endProcessing();
}; };
......
...@@ -423,10 +423,10 @@ template<uint32_t version> ...@@ -423,10 +423,10 @@ template<uint32_t version>
__device__ void argon2_core( __device__ void argon2_core(
struct block_g *memory, struct block_g *mem_curr, struct block_g *memory, struct block_g *mem_curr,
struct block_th *prev, struct block_th *tmp, struct block_th *prev, struct block_th *tmp,
struct u64_shuffle_buf *shuffle_buf, uint32_t lane_blocks, struct u64_shuffle_buf *shuffle_buf, uint32_t lanes,
uint32_t thread, uint32_t pass, uint32_t ref_index, uint32_t ref_lane) uint32_t thread, uint32_t pass, uint32_t ref_index, uint32_t ref_lane)
{ {
struct block_g *mem_ref = memory + ref_lane * lane_blocks + ref_index; struct block_g *mem_ref = memory + ref_index * lanes + ref_lane;
if (version != ARGON2_VERSION_10 && pass != 0) { if (version != ARGON2_VERSION_10 && pass != 0) {
load_block(tmp, mem_curr, thread); load_block(tmp, mem_curr, thread);
...@@ -449,9 +449,8 @@ __device__ void argon2_step_precompute( ...@@ -449,9 +449,8 @@ __device__ void argon2_step_precompute(
struct block_g *memory, struct block_g *mem_curr, struct block_g *memory, struct block_g *mem_curr,
struct block_th *prev, struct block_th *tmp, struct block_th *prev, struct block_th *tmp,
struct u64_shuffle_buf *shuffle_buf, const struct ref **refs, struct u64_shuffle_buf *shuffle_buf, const struct ref **refs,
uint32_t lanes, uint32_t segment_blocks, uint32_t lane_blocks, uint32_t lanes, uint32_t segment_blocks, uint32_t thread,
uint32_t thread, uint32_t lane, uint32_t pass, uint32_t slice, uint32_t lane, uint32_t pass, uint32_t slice, uint32_t offset)
uint32_t offset)
{ {
uint32_t ref_index, ref_lane; uint32_t ref_index, ref_lane;
if (type == ARGON2_I || (type == ARGON2_ID && pass == 0 && if (type == ARGON2_I || (type == ARGON2_ID && pass == 0 &&
...@@ -468,7 +467,7 @@ __device__ void argon2_step_precompute( ...@@ -468,7 +467,7 @@ __device__ void argon2_step_precompute(
&ref_lane, &ref_index); &ref_lane, &ref_index);
} }
argon2_core<version>(memory, mem_curr, prev, tmp, shuffle_buf, lane_blocks, argon2_core<version>(memory, mem_curr, prev, tmp, shuffle_buf, lanes,
thread, pass, ref_index, ref_lane); thread, pass, ref_index, ref_lane);
} }
...@@ -493,20 +492,20 @@ __global__ void argon2_kernel_segment_precompute( ...@@ -493,20 +492,20 @@ __global__ void argon2_kernel_segment_precompute(
struct block_th prev, tmp; struct block_th prev, tmp;
struct block_g *mem_segment = struct block_g *mem_segment =
memory + lane * lane_blocks + slice * segment_blocks; memory + slice * segment_blocks * lanes + lane;
struct block_g *mem_prev, *mem_curr; struct block_g *mem_prev, *mem_curr;
uint32_t start_offset = 0; uint32_t start_offset = 0;
if (pass == 0) { if (pass == 0) {
if (slice == 0) { if (slice == 0) {
mem_prev = mem_segment + 1; mem_prev = mem_segment + 1 * lanes;
mem_curr = mem_segment + 2; mem_curr = mem_segment + 2 * lanes;
start_offset = 2; start_offset = 2;
} else { } else {
mem_prev = mem_segment - 1; mem_prev = mem_segment - lanes;
mem_curr = mem_segment; mem_curr = mem_segment;
} }
} else { } else {
mem_prev = mem_segment + (slice == 0 ? lane_blocks : 0) - 1; mem_prev = mem_segment + (slice == 0 ? lane_blocks * lanes : 0) - lanes;
mem_curr = mem_segment; mem_curr = mem_segment;
} }
...@@ -525,10 +524,9 @@ __global__ void argon2_kernel_segment_precompute( ...@@ -525,10 +524,9 @@ __global__ void argon2_kernel_segment_precompute(
for (uint32_t offset = start_offset; offset < segment_blocks; ++offset) { for (uint32_t offset = start_offset; offset < segment_blocks; ++offset) {
argon2_step_precompute<type, version>( argon2_step_precompute<type, version>(
memory, mem_curr, &prev, &tmp, shuffle_buf, &refs, lanes, memory, mem_curr, &prev, &tmp, shuffle_buf, &refs, lanes,
segment_blocks, lane_blocks, thread, segment_blocks, thread, lane, pass, slice, offset);
lane, pass, slice, offset);
++mem_curr; mem_curr += lanes;
} }
} }
...@@ -551,9 +549,9 @@ __global__ void argon2_kernel_oneshot_precompute( ...@@ -551,9 +549,9 @@ __global__ void argon2_kernel_oneshot_precompute(
struct block_th prev, tmp; struct block_th prev, tmp;
struct block_g *mem_lane = memory + lane * lane_blocks; struct block_g *mem_lane = memory + lane;
struct block_g *mem_prev = mem_lane + 1; struct block_g *mem_prev = mem_lane + 1 * lanes;
struct block_g *mem_curr = mem_lane + 2; struct block_g *mem_curr = mem_lane + 2 * lanes;
load_block(&prev, mem_prev, thread); load_block(&prev, mem_prev, thread);
...@@ -574,10 +572,10 @@ __global__ void argon2_kernel_oneshot_precompute( ...@@ -574,10 +572,10 @@ __global__ void argon2_kernel_oneshot_precompute(
argon2_step_precompute<type, version>( argon2_step_precompute<type, version>(
memory, mem_curr, &prev, &tmp, shuffle_buf, &refs, memory, mem_curr, &prev, &tmp, shuffle_buf, &refs,
lanes, segment_blocks, lane_blocks, thread, lane, lanes, segment_blocks, thread,
pass, slice, offset); lane, pass, slice, offset);
++mem_curr; mem_curr += lanes;
} }
__syncthreads(); __syncthreads();
...@@ -591,9 +589,8 @@ template<uint32_t type, uint32_t version> ...@@ -591,9 +589,8 @@ template<uint32_t type, uint32_t version>
__device__ void argon2_step( __device__ void argon2_step(
struct block_g *memory, struct block_g *mem_curr, struct block_g *memory, struct block_g *mem_curr,
struct block_th *prev, struct block_th *tmp, struct block_th *addr, struct block_th *prev, struct block_th *tmp, struct block_th *addr,
struct u64_shuffle_buf *shuffle_buf, struct u64_shuffle_buf *shuffle_buf, uint32_t lanes,
uint32_t lanes, uint32_t segment_blocks, uint32_t lane_blocks, uint32_t segment_blocks, uint32_t thread, uint32_t *thread_input,
uint32_t thread, uint32_t *thread_input,
uint32_t lane, uint32_t pass, uint32_t slice, uint32_t offset) uint32_t lane, uint32_t pass, uint32_t slice, uint32_t offset)
{ {
uint32_t ref_index, ref_lane; uint32_t ref_index, ref_lane;
...@@ -624,7 +621,7 @@ __device__ void argon2_step( ...@@ -624,7 +621,7 @@ __device__ void argon2_step(
compute_ref_pos(lanes, segment_blocks, pass, lane, slice, offset, compute_ref_pos(lanes, segment_blocks, pass, lane, slice, offset,
&ref_lane, &ref_index); &ref_lane, &ref_index);
argon2_core<version>(memory, mem_curr, prev, tmp, shuffle_buf, lane_blocks, argon2_core<version>(memory, mem_curr, prev, tmp, shuffle_buf, lanes,
thread, pass, ref_index, ref_lane); thread, pass, ref_index, ref_lane);
} }
...@@ -682,20 +679,20 @@ __global__ void argon2_kernel_segment( ...@@ -682,20 +679,20 @@ __global__ void argon2_kernel_segment(
} }
struct block_g *mem_segment = struct block_g *mem_segment =
memory + lane * lane_blocks + slice * segment_blocks; memory + slice * segment_blocks * lanes + lane;
struct block_g *mem_prev, *mem_curr; struct block_g *mem_prev, *mem_curr;
uint32_t start_offset = 0; uint32_t start_offset = 0;
if (pass == 0) { if (pass == 0) {
if (slice == 0) { if (slice == 0) {
mem_prev = mem_segment + 1; mem_prev = mem_segment + 1 * lanes;
mem_curr = mem_segment + 2; mem_curr = mem_segment + 2 * lanes;
start_offset = 2; start_offset = 2;
} else { } else {
mem_prev = mem_segment - 1; mem_prev = mem_segment - lanes;
mem_curr = mem_segment; mem_curr = mem_segment;
} }
} else { } else {
mem_prev = mem_segment + (slice == 0 ? lane_blocks : 0) - 1; mem_prev = mem_segment + (slice == 0 ? lane_blocks * lanes : 0) - lanes;
mem_curr = mem_segment; mem_curr = mem_segment;
} }
...@@ -704,11 +701,10 @@ __global__ void argon2_kernel_segment( ...@@ -704,11 +701,10 @@ __global__ void argon2_kernel_segment(
for (uint32_t offset = start_offset; offset < segment_blocks; ++offset) { for (uint32_t offset = start_offset; offset < segment_blocks; ++offset) {
argon2_step<type, version>( argon2_step<type, version>(
memory, mem_curr, &prev, &tmp, &addr, shuffle_buf, memory, mem_curr, &prev, &tmp, &addr, shuffle_buf,
lanes, segment_blocks, lane_blocks, lanes, segment_blocks, thread, &thread_input,
thread, &thread_input,
lane, pass, slice, offset); lane, pass, slice, offset);
++mem_curr; mem_curr += lanes;
} }
} }
...@@ -759,9 +755,9 @@ __global__ void argon2_kernel_oneshot( ...@@ -759,9 +755,9 @@ __global__ void argon2_kernel_oneshot(
} }
} }
struct block_g *mem_lane = memory + lane * lane_blocks; struct block_g *mem_lane = memory + lane;
struct block_g *mem_prev = mem_lane + 1; struct block_g *mem_prev = mem_lane + 1 * lanes;
struct block_g *mem_curr = mem_lane + 2; struct block_g *mem_curr = mem_lane + 2 * lanes;
load_block(&prev, mem_prev, thread); load_block(&prev, mem_prev, thread);
...@@ -776,11 +772,10 @@ __global__ void argon2_kernel_oneshot( ...@@ -776,11 +772,10 @@ __global__ void argon2_kernel_oneshot(
argon2_step<type, version>( argon2_step<type, version>(
memory, mem_curr, &prev, &tmp, &addr, shuffle_buf, memory, mem_curr, &prev, &tmp, &addr, shuffle_buf,
lanes, segment_blocks, lane_blocks, lanes, segment_blocks, thread, &thread_input,
thread, &thread_input,
lane, pass, slice, offset); lane, pass, slice, offset);
++mem_curr; mem_curr += lanes;
} }
__syncthreads(); __syncthreads();
......
...@@ -31,14 +31,9 @@ ProcessingUnit::ProcessingUnit( ...@@ -31,14 +31,9 @@ ProcessingUnit::ProcessingUnit(
CudaException::check(cudaSetDevice(device->getDeviceIndex())); CudaException::check(cudaSetDevice(device->getDeviceIndex()));
} }
auto memory = static_cast<std::uint8_t *>(runner.getMemory());
/* pre-fill first blocks with pseudo-random data: */ /* pre-fill first blocks with pseudo-random data: */
for (std::size_t i = 0; i < batchSize; i++) { for (std::size_t i = 0; i < batchSize; i++) {
params->fillFirstBlocks(memory, NULL, 0, setPassword(i, NULL, 0);
programContext->getArgon2Type(),
programContext->getArgon2Version());
memory += params->getMemorySize();
} }
if (runner.getMaxLanesPerBlock() > runner.getMinLanesPerBlock()) { if (runner.getMaxLanesPerBlock() > runner.getMinLanesPerBlock()) {
...@@ -118,55 +113,22 @@ ProcessingUnit::ProcessingUnit( ...@@ -118,55 +113,22 @@ ProcessingUnit::ProcessingUnit(
} }
} }
ProcessingUnit::PasswordWriter::PasswordWriter( void ProcessingUnit::setPassword(std::size_t index, const void *pw,
ProcessingUnit &parent, std::size_t index) std::size_t pwSize)
: params(parent.params),
type(parent.programContext->getArgon2Type()),
version(parent.programContext->getArgon2Version()),
dest(static_cast<std::uint8_t *>(parent.runner.getMemory()))
{
dest += index * params->getMemorySize();
}
void ProcessingUnit::PasswordWriter::moveForward(std::size_t offset)
{
dest += offset * params->getMemorySize();
}
void ProcessingUnit::PasswordWriter::moveBackwards(std::size_t offset)
{
dest -= offset * params->getMemorySize();
}
void ProcessingUnit::PasswordWriter::setPassword(
const void *pw, std::size_t pwSize) const
{ {
params->fillFirstBlocks(dest, pw, pwSize, type, version); auto memory = static_cast<std::uint8_t *>(runner.getMemory());
} memory += index * params->getMemorySize();
params->fillFirstBlocks(memory, pw, pwSize,
ProcessingUnit::HashReader::HashReader( programContext->getArgon2Type(),
ProcessingUnit &parent, std::size_t index) programContext->getArgon2Version());
: params(parent.params),
src(static_cast<const std::uint8_t *>(parent.runner.getMemory())),
buffer(new std::uint8_t[params->getOutputLength()])
{
src += index * params->getMemorySize();
}
void ProcessingUnit::HashReader::moveForward(std::size_t offset)
{
src += offset * params->getMemorySize();
}