Commit 7eb59b0c authored by Ondrej Mosnáček's avatar Ondrej Mosnáček

[CUDA,OpenCL] Some more overflow sanitization

parent c39c909f
Pipeline #12281757 passed with stages
in 27 minutes and 46 seconds
......@@ -550,7 +550,7 @@ __kernel void argon2_kernel_segment_precompute(
uint lane_blocks = ARGON2_SYNC_POINTS * segment_blocks;
/* select job's memory region: */
memory += job_id * lanes * lane_blocks;
memory += (size_t)job_id * lanes * lane_blocks;
struct block_th prev, tmp;
......@@ -608,7 +608,7 @@ __kernel void argon2_kernel_oneshot_precompute(
uint lane_blocks = ARGON2_SYNC_POINTS * segment_blocks;
/* select job's memory region: */
memory += job_id * lanes * lane_blocks;
memory += (size_t)job_id * lanes * lane_blocks;
struct block_th prev, tmp;
......@@ -709,7 +709,7 @@ __kernel void argon2_kernel_segment(
uint lane_blocks = ARGON2_SYNC_POINTS * segment_blocks;
/* select job's memory region: */
memory += job_id * lanes * lane_blocks;
memory += (size_t)job_id * lanes * lane_blocks;
struct block_th prev, addr, tmp;
uint thread_input;
......@@ -791,7 +791,7 @@ __kernel void argon2_kernel_oneshot(
uint lane_blocks = ARGON2_SYNC_POINTS * segment_blocks;
/* select job's memory region: */
memory += job_id * lanes * lane_blocks;
memory += (size_t)job_id * lanes * lane_blocks;
struct block_th prev, addr, tmp;
uint thread_input;
......
......@@ -485,7 +485,7 @@ __global__ void argon2_kernel_segment_precompute(
uint32_t lane_blocks = ARGON2_SYNC_POINTS * segment_blocks;
/* select job's memory region: */
memory += job_id * lanes * lane_blocks;
memory += (size_t)job_id * lanes * lane_blocks;
struct block_th prev, tmp;
......@@ -543,7 +543,7 @@ __global__ void argon2_kernel_oneshot_precompute(
uint32_t lane_blocks = ARGON2_SYNC_POINTS * segment_blocks;
/* select job's memory region: */
memory += job_id * lanes * lane_blocks;
memory += (size_t)job_id * lanes * lane_blocks;
struct block_th prev, tmp;
......@@ -638,7 +638,7 @@ __global__ void argon2_kernel_segment(
uint32_t lane_blocks = ARGON2_SYNC_POINTS * segment_blocks;
/* select job's memory region: */
memory += job_id * lanes * lane_blocks;
memory += (size_t)job_id * lanes * lane_blocks;
struct block_th prev, addr, tmp;
uint32_t thread_input;
......@@ -721,7 +721,7 @@ __global__ void argon2_kernel_oneshot(
uint32_t lane_blocks = ARGON2_SYNC_POINTS * segment_blocks;
/* select job's memory region: */
memory += job_id * lanes * lane_blocks;
memory += (size_t)job_id * lanes * lane_blocks;
struct block_th prev, addr, tmp;
uint32_t thread_input;
......@@ -829,7 +829,7 @@ KernelRunner::KernelRunner(uint32_t type, uint32_t version, uint32_t passes,
? lanes * (ARGON2_SYNC_POINTS / 2)
: passes * lanes * ARGON2_SYNC_POINTS;
uint32_t refsSize = segments * segmentBlocks * sizeof(struct ref);
size_t refsSize = segments * segmentBlocks * sizeof(struct ref);
#ifndef NDEBUG
std::cerr << "[INFO] Allocating " << refsSize << " bytes for refs..."
......@@ -857,7 +857,7 @@ void KernelRunner::precomputeRefs()
dim3 blocks = dim3(1, segments * segmentAddrBlocks);
dim3 threads = dim3(THREADS_PER_LANE);
uint32_t shmemSize = sizeof(struct u64_shuffle_buf);
size_t shmemSize = sizeof(struct u64_shuffle_buf);
if (type == ARGON2_I) {
argon2_precompute_kernel<ARGON2_I>
<<<blocks, threads, shmemSize, stream>>>(
......@@ -890,10 +890,10 @@ KernelRunner::~KernelRunner()
void KernelRunner::writeInputMemory(uint32_t jobId, const void *buffer)
{
std::size_t memorySize = lanes * segmentBlocks * ARGON2_SYNC_POINTS
* ARGON2_BLOCK_SIZE;
std::size_t size = lanes * 2 * ARGON2_BLOCK_SIZE;
std::size_t offset = memorySize * static_cast<size_t>(jobId);
std::size_t memorySize = static_cast<size_t>(lanes) * segmentBlocks
* ARGON2_SYNC_POINTS * ARGON2_BLOCK_SIZE;
std::size_t size = static_cast<size_t>(lanes) * 2 * ARGON2_BLOCK_SIZE;
std::size_t offset = memorySize * jobId;
auto mem = static_cast<uint8_t *>(memory) + offset;
CudaException::check(cudaMemcpyAsync(mem, buffer, size,
cudaMemcpyHostToDevice, stream));
......@@ -902,10 +902,10 @@ void KernelRunner::writeInputMemory(uint32_t jobId, const void *buffer)
void KernelRunner::readOutputMemory(uint32_t jobId, void *buffer)
{
std::size_t memorySize = lanes * segmentBlocks * ARGON2_SYNC_POINTS
* ARGON2_BLOCK_SIZE;
std::size_t size = lanes * ARGON2_BLOCK_SIZE;
std::size_t offset = memorySize * static_cast<size_t>(jobId + 1) - size;
std::size_t memorySize = static_cast<size_t>(lanes) * segmentBlocks
* ARGON2_SYNC_POINTS * ARGON2_BLOCK_SIZE;
std::size_t size = static_cast<size_t>(lanes) * ARGON2_BLOCK_SIZE;
std::size_t offset = memorySize * (jobId + 1) - size;
auto mem = static_cast<uint8_t *>(memory) + offset;
CudaException::check(cudaMemcpyAsync(buffer, mem, size,
cudaMemcpyDeviceToHost, stream));
......
......@@ -44,7 +44,7 @@ KernelRunner::KernelRunner(const ProgramContext *programContext,
? lanes * (ARGON2_SYNC_POINTS / 2)
: passes * lanes * ARGON2_SYNC_POINTS;
std::uint32_t refsSize = segments * segmentBlocks * sizeof(cl_uint) * 2;
std::size_t refsSize = segments * segmentBlocks * sizeof(cl_uint) * 2;
#ifndef NDEBUG
std::cerr << "[INFO] Allocating " << refsSize << " bytes for refs..."
......@@ -94,7 +94,7 @@ void KernelRunner::precomputeRefs()
? lanes * (ARGON2_SYNC_POINTS / 2)
: passes * lanes * ARGON2_SYNC_POINTS;
std::uint32_t shmemSize = THREADS_PER_LANE * sizeof(cl_uint) * 2;
std::size_t shmemSize = THREADS_PER_LANE * sizeof(cl_uint) * 2;
cl::Kernel kernel = cl::Kernel(programContext->getProgram(),
"argon2_precompute_kernel");
......@@ -126,7 +126,8 @@ void KernelRunner::unmapInputMemory(void *memory)
void *KernelRunner::mapOutputMemory(std::uint32_t jobId)
{
std::size_t memorySize = params->getMemorySize();
std::size_t mappedSize = params->getLanes() * ARGON2_BLOCK_SIZE;
std::size_t mappedSize = static_cast<std::size_t>(params->getLanes())
* ARGON2_BLOCK_SIZE;
std::size_t mappedOffset = memorySize * (jobId + 1) - mappedSize;
return queue.enqueueMapBuffer(memoryBuffer, true, CL_MAP_READ,
mappedOffset, mappedSize);
......@@ -161,9 +162,8 @@ void KernelRunner::run(std::uint32_t lanesPerBlock, std::uint32_t jobsPerBlock)
queue.enqueueMarker(&start);
std::uint32_t shmemSize =
THREADS_PER_LANE * lanesPerBlock * jobsPerBlock *
sizeof(cl_uint) * 2;
std::size_t shmemSize = THREADS_PER_LANE * lanesPerBlock * jobsPerBlock
* sizeof(cl_uint) * 2;
kernel.setArg<cl::LocalSpaceArg>(0, { shmemSize });
if (bySegment) {
for (std::uint32_t pass = 0; pass < passes; pass++) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment