Commit bede9e8a authored by Ondrej Mosnáček

[CUDA,OpenCL] Further simplify the code

parent a2f6d435
Pipeline #12407168 passed with stages
in 21 minutes and 23 seconds
......@@ -292,12 +292,12 @@ void transpose(struct block_th *block, uint thread,
{
uint thread_group = (thread & 0x0C) >> 2;
for (uint i = 1; i < QWORDS_PER_THREAD; i++) {
uint src_group = thread_group ^ i;
uint thr = (src_group << 2) | (thread & 0x13);
uint thr = (i << 2) ^ thread;
uint idx = thread_group ^ i;
ulong v = block_th_get(block, src_group);
ulong v = block_th_get(block, idx);
v = u64_shuffle(v, thr, thread, buf);
block_th_set(block, src_group, v);
block_th_set(block, idx, v);
}
}
......
......@@ -192,12 +192,12 @@ __device__ void transpose(struct block_th *block, uint32_t thread,
{
uint32_t thread_group = (thread & 0x0C) >> 2;
for (uint32_t i = 1; i < QWORDS_PER_THREAD; i++) {
uint32_t src_group = thread_group ^ i;
uint32_t thr = (src_group << 2) | (thread & 0x13);
uint32_t thr = (i << 2) ^ thread;
uint32_t idx = thread_group ^ i;
uint64_t v = block_th_get(block, src_group);
uint64_t v = block_th_get(block, idx);
v = u64_shuffle(v, thr, thread, buf);
block_th_set(block, src_group, v);
block_th_set(block, idx, v);
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment