Commit 2a3e26bc authored by Ondrej Mosnáček's avatar Ondrej Mosnáček

[CUDA,OpenCL] Simplify the copying code

parent 98e5e635
Pipeline #52545114 failed with stages
in 7 minutes and 22 seconds
......@@ -24,7 +24,7 @@ private:
bool bySegment;
bool precompute;
cudaEvent_t start, end;
cudaEvent_t start, end, kernelStart, kernelEnd;
cudaStream_t stream;
void *memory;
void *refs;
......
......@@ -22,7 +22,7 @@ private:
cl::CommandQueue queue;
cl::Kernel kernel;
cl::Buffer memoryBuffer, refsBuffer;
cl::Event start, end;
cl::Event start, end, kernelStart, kernelEnd;
std::size_t memorySize;
......
......@@ -802,8 +802,8 @@ KernelRunner::KernelRunner(uint32_t type, uint32_t version, uint32_t passes,
size_t batchSize, bool bySegment, bool precompute)
: type(type), version(version), passes(passes), lanes(lanes),
segmentBlocks(segmentBlocks), batchSize(batchSize), bySegment(bySegment),
precompute(precompute), stream(nullptr), memory(nullptr),
refs(nullptr), start(nullptr), end(nullptr),
precompute(precompute), stream(), memory(), refs(),
start(), end(), kernelStart(), kernelEnd(),
blocksIn(new uint8_t[batchSize * lanes * 2 * ARGON2_BLOCK_SIZE]),
blocksOut(new uint8_t[batchSize * lanes * ARGON2_BLOCK_SIZE])
{
......@@ -820,6 +820,8 @@ KernelRunner::KernelRunner(uint32_t type, uint32_t version, uint32_t passes,
CudaException::check(cudaEventCreate(&start));
CudaException::check(cudaEventCreate(&end));
CudaException::check(cudaEventCreate(&kernelStart));
CudaException::check(cudaEventCreate(&kernelEnd));
CudaException::check(cudaStreamCreate(&stream));
......@@ -905,32 +907,11 @@ void KernelRunner::copyInputBlocks()
* ARGON2_SYNC_POINTS * ARGON2_BLOCK_SIZE;
size_t copySize = lanes * 2 * ARGON2_BLOCK_SIZE;
#ifndef NDEBUG
cudaEvent_t start, end;
CudaException::check(cudaEventCreate(&start));
CudaException::check(cudaEventCreate(&end));
cudaEventRecord(start, stream);
#endif
CudaException::check(cudaMemcpy2DAsync(
memory, jobSize,
blocksIn.get(), copySize,
copySize, batchSize, cudaMemcpyHostToDevice,
stream));
#ifndef NDEBUG
cudaEventRecord(end, stream);
#endif
CudaException::check(cudaStreamSynchronize(stream));
#ifndef NDEBUG
float time = 0.0;
CudaException::check(cudaEventElapsedTime(&time, start, end));
std::cerr << "[INFO] Copy to device took " << time << " ms." << std::endl;
#endif
}
void KernelRunner::copyOutputBlocks()
......@@ -940,32 +921,11 @@ void KernelRunner::copyOutputBlocks()
size_t copySize = lanes * ARGON2_BLOCK_SIZE;
uint8_t *mem = static_cast<uint8_t *>(memory);
#ifndef NDEBUG
cudaEvent_t start, end;
CudaException::check(cudaEventCreate(&start));
CudaException::check(cudaEventCreate(&end));
cudaEventRecord(start, stream);
#endif
CudaException::check(cudaMemcpy2DAsync(
blocksOut.get(), copySize,
mem + (jobSize - copySize), jobSize,
copySize, batchSize, cudaMemcpyDeviceToHost,
stream));
#ifndef NDEBUG
cudaEventRecord(end, stream);
#endif
CudaException::check(cudaStreamSynchronize(stream));
#ifndef NDEBUG
float time = 0.0;
CudaException::check(cudaEventElapsedTime(&time, start, end));
std::cerr << "[INFO] Copy from device took " << time << " ms." << std::endl;
#endif
}
void KernelRunner::runKernelSegment(uint32_t lanesPerBlock,
......@@ -1131,9 +1091,11 @@ void KernelRunner::runKernelOneshot(uint32_t lanesPerBlock,
void KernelRunner::run(uint32_t lanesPerBlock, uint32_t jobsPerBlock)
{
CudaException::check(cudaEventRecord(start, stream));
copyInputBlocks();
CudaException::check(cudaEventRecord(start, stream));
CudaException::check(cudaEventRecord(kernelStart, stream));
if (bySegment) {
for (uint32_t pass = 0; pass < passes; pass++) {
......@@ -1147,17 +1109,27 @@ void KernelRunner::run(uint32_t lanesPerBlock, uint32_t jobsPerBlock)
CudaException::check(cudaGetLastError());
CudaException::check(cudaEventRecord(kernelEnd, stream));
copyOutputBlocks();
CudaException::check(cudaEventRecord(end, stream));
}
float KernelRunner::finish()
{
float time = 0.0;
CudaException::check(cudaStreamSynchronize(stream));
copyOutputBlocks();
#ifndef NDEBUG
CudaException::check(cudaEventElapsedTime(&time, start, kernelStart));
std::cerr << "[INFO] Copy to device took " << time << " ms." << std::endl;
float time = 0.0;
CudaException::check(cudaEventElapsedTime(&time, start, end));
CudaException::check(cudaEventElapsedTime(&time, kernelEnd, end));
std::cerr << "[INFO] Copy from device took " << time << " ms." << std::endl;
#endif
CudaException::check(cudaEventElapsedTime(&time, kernelStart, kernelEnd));
return time;
}
......
......@@ -134,26 +134,10 @@ void KernelRunner::copyInputBlocks()
std::size_t jobSize = params->getMemorySize();
std::size_t copySize = params->getLanes() * 2 * ARGON2_BLOCK_SIZE;
#ifndef NDEBUG
cl::Event start, end;
queue.enqueueMarker(&start);
#endif
queue.enqueueWriteBufferRect(memoryBuffer, false,
makeSize3(0, 0, 0), makeSize3(0, 0, 0),
makeSize3(copySize, batchSize, 1),
jobSize, 0, copySize, 0, blocksIn.get());
#ifndef NDEBUG
queue.enqueueMarker(&end);
#endif
queue.finish();
#ifndef NDEBUG
std::cerr << "[INFO] Copy to device took " << getDurationInMs(start, end)
<< " ms." << std::endl;
#endif
}
void KernelRunner::copyOutputBlocks()
......@@ -161,27 +145,11 @@ void KernelRunner::copyOutputBlocks()
std::size_t jobSize = params->getMemorySize();
std::size_t copySize = params->getLanes() * ARGON2_BLOCK_SIZE;
#ifndef NDEBUG
cl::Event start, end;
queue.enqueueMarker(&start);
#endif
queue.enqueueReadBufferRect(memoryBuffer, false,
makeSize3(jobSize - copySize, 0, 0),
makeSize3(0, 0, 0),
makeSize3(copySize, batchSize, 1),
jobSize, 0, copySize, 0, blocksOut.get());
#ifndef NDEBUG
queue.enqueueMarker(&end);
#endif
queue.finish();
#ifndef NDEBUG
std::cerr << "[INFO] Copy from device took " << getDurationInMs(start, end)
<< " ms." << std::endl;
#endif
}
void KernelRunner::run(std::uint32_t lanesPerBlock, std::uint32_t jobsPerBlock)
......@@ -206,9 +174,11 @@ void KernelRunner::run(std::uint32_t lanesPerBlock, std::uint32_t jobsPerBlock)
cl::NDRange globalRange { THREADS_PER_LANE * lanes, batchSize };
cl::NDRange localRange { THREADS_PER_LANE * lanesPerBlock, jobsPerBlock };
queue.enqueueMarker(&start);
copyInputBlocks();
queue.enqueueMarker(&start);
queue.enqueueMarker(&kernelStart);
std::size_t shmemSize = THREADS_PER_LANE * lanesPerBlock * jobsPerBlock
* sizeof(cl_uint) * 2;
......@@ -227,6 +197,10 @@ void KernelRunner::run(std::uint32_t lanesPerBlock, std::uint32_t jobsPerBlock)
globalRange, localRange);
}
queue.enqueueMarker(&kernelEnd);
copyOutputBlocks();
queue.enqueueMarker(&end);
}
......@@ -234,7 +208,13 @@ float KernelRunner::finish()
{
end.wait();
copyOutputBlocks();
#ifndef NDEBUG
std::cerr << "[INFO] Copy to device took "
<< getDurationInMs(start, kernelStart) << " ms." << std::endl;
std::cerr << "[INFO] Copy from device took "
<< getDurationInMs(kernelEnd, end) << " ms." << std::endl;
#endif
return getDurationInMs(start, end);
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment