Commit 9b74184b authored by Ondrej Mosnáček

[OpenCL] Use amd_bitalign() for rotr64 if possible

This speeds up Argon2 by about 1-1.5% on AMD GPUs.
parent 88956724
Pipeline #12128899 passed with stages
in 40 minutes and 27 seconds
......@@ -159,10 +159,29 @@ void store_block(__global struct block_g *dst, const struct block_th *src,
dst->data[3 * THREADS_PER_LANE + thread] = src->d;
}
#ifdef cl_amd_media_ops
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
/*
 * Rotate the 64-bit value x right by n bits using amd_bitalign() on its two
 * 32-bit halves.
 *
 * Assumes u64_lo()/u64_hi() return the low/high 32-bit halves of x and that
 * u64_build(hi, lo) reassembles them (helpers defined elsewhere in the file).
 *
 * The rotation count is reduced modulo 64 up front. amd_bitalign() only uses
 * the low 5 bits of its shift operand, so without the reduction a count of 64
 * or more would fall into the "swap halves" branch and yield a result that
 * disagrees with the rotate()-based fallback implementation.
 */
ulong rotr64(ulong x, ulong n)
{
    uint lo = u64_lo(x);
    uint hi = u64_hi(x);
    uint shift = (uint)n & 63;
    uint r_lo, r_hi;

    if (shift < 32) {
        /* Each result half is a 32-bit window into the pair (hi:lo) or (lo:hi). */
        r_lo = amd_bitalign(hi, lo, shift);
        r_hi = amd_bitalign(lo, hi, shift);
    } else {
        /* A rotation by 32 or more swaps the halves; rotate the rest by shift - 32. */
        r_lo = amd_bitalign(lo, hi, shift - 32);
        r_hi = amd_bitalign(hi, lo, shift - 32);
    }
    return u64_build(r_hi, r_lo);
}
#else
/*
 * Generic fallback: rotate the 64-bit value x right by n bits.
 *
 * The OpenCL rotate() builtin rotates left, so the right rotation is
 * expressed as a left rotation by (64 - n).
 */
ulong rotr64(ulong x, ulong n)
{
    const ulong left_count = 64 - n;

    return rotate(x, left_count);
}
#endif
ulong f(ulong x, ulong y)
{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment