Commit 527dd6da authored by Ondrej Mosnáček's avatar Ondrej Mosnáček

[CUDA,OpenCL] Fix shared memory races

parent 34379bef
Pipeline #12636017 failed with stages
in 15 minutes and 43 seconds
......@@ -540,7 +540,8 @@ __kernel void argon2_kernel_segment_precompute(
{
uint job_id = get_global_id(1);
uint lane = get_global_id(0) / THREADS_PER_LANE;
uint warp = get_local_id(0) / THREADS_PER_LANE;
uint warp = (get_local_id(1) * get_local_size(0) + get_local_id(0))
/ THREADS_PER_LANE;
uint thread = get_local_id(0) % THREADS_PER_LANE;
__local struct u64_shuffle_buf *shuffle_buf = &shuffle_bufs[warp];
......@@ -598,7 +599,7 @@ __kernel void argon2_kernel_oneshot_precompute(
{
uint job_id = get_global_id(1);
uint lane = get_global_id(0) / THREADS_PER_LANE;
uint warp = get_local_id(0) / THREADS_PER_LANE;
uint warp = get_local_id(1) * lanes + get_local_id(0) / THREADS_PER_LANE;
uint thread = get_local_id(0) % THREADS_PER_LANE;
__local struct u64_shuffle_buf *shuffle_buf = &shuffle_bufs[warp];
......@@ -699,7 +700,8 @@ __kernel void argon2_kernel_segment(
{
uint job_id = get_global_id(1);
uint lane = get_global_id(0) / THREADS_PER_LANE;
uint warp = get_local_id(0) / THREADS_PER_LANE;
uint warp = (get_local_id(1) * get_local_size(0) + get_local_id(0))
/ THREADS_PER_LANE;
uint thread = get_local_id(0) % THREADS_PER_LANE;
__local struct u64_shuffle_buf *shuffle_buf = &shuffle_bufs[warp];
......@@ -781,7 +783,7 @@ __kernel void argon2_kernel_oneshot(
{
uint job_id = get_global_id(1);
uint lane = get_global_id(0) / THREADS_PER_LANE;
uint warp = get_local_id(0) / THREADS_PER_LANE;
uint warp = get_local_id(1) * lanes + get_local_id(0) / THREADS_PER_LANE;
uint thread = get_local_id(0) % THREADS_PER_LANE;
__local struct u64_shuffle_buf *shuffle_buf = &shuffle_bufs[warp];
......
......@@ -470,7 +470,8 @@ __global__ void argon2_kernel_segment_precompute(
uint32_t pass, uint32_t slice)
{
extern __shared__ struct u64_shuffle_buf shuffle_bufs[];
struct u64_shuffle_buf *shuffle_buf = &shuffle_bufs[threadIdx.y];
struct u64_shuffle_buf *shuffle_buf =
&shuffle_bufs[blockDim.y * threadIdx.z + threadIdx.y];
uint32_t job_id = blockIdx.z * blockDim.z + threadIdx.z;
uint32_t lane = blockIdx.y * blockDim.y + threadIdx.y;
......@@ -528,7 +529,8 @@ __global__ void argon2_kernel_oneshot_precompute(
uint32_t lanes, uint32_t segment_blocks)
{
extern __shared__ struct u64_shuffle_buf shuffle_bufs[];
struct u64_shuffle_buf *shuffle_buf = &shuffle_bufs[threadIdx.y];
struct u64_shuffle_buf *shuffle_buf =
&shuffle_bufs[lanes * threadIdx.z + threadIdx.y];
uint32_t job_id = blockIdx.z * blockDim.z + threadIdx.z;
uint32_t lane = threadIdx.y;
......@@ -623,7 +625,8 @@ __global__ void argon2_kernel_segment(
uint32_t segment_blocks, uint32_t pass, uint32_t slice)
{
extern __shared__ struct u64_shuffle_buf shuffle_bufs[];
struct u64_shuffle_buf *shuffle_buf = &shuffle_bufs[threadIdx.y];
struct u64_shuffle_buf *shuffle_buf =
&shuffle_bufs[blockDim.y * threadIdx.z + threadIdx.y];
uint32_t job_id = blockIdx.z * blockDim.z + threadIdx.z;
uint32_t lane = blockIdx.y * blockDim.y + threadIdx.y;
......@@ -706,7 +709,8 @@ __global__ void argon2_kernel_oneshot(
uint32_t segment_blocks)
{
extern __shared__ struct u64_shuffle_buf shuffle_bufs[];
struct u64_shuffle_buf *shuffle_buf = &shuffle_bufs[threadIdx.y];
struct u64_shuffle_buf *shuffle_buf =
&shuffle_bufs[lanes * threadIdx.z + threadIdx.y];
uint32_t job_id = blockIdx.z * blockDim.z + threadIdx.z;
uint32_t lane = threadIdx.y;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment