Commit b8993ec3 authored by Adam P. Goucher

GPU code generation

parent 9d190dde
/*
 * ADVANCE_TILE_64(a, b, tmp, dst)
 *
 * Block-cooperative update of a bit-packed tile: each thread supplies one
 * 64-bit word 'a' and receives one 64-bit word in 'b'.
 *
 * Parameters:
 *   a    - input word of the calling thread (read only; expanded several
 *          times and not parenthesised, so pass a plain variable).
 *   b    - lvalue that receives the result.
 *   tmp  - scratch array indexed by threadIdx.x, u and d.
 *   dst  - second scratch array, indexed the same way.
 *
 * Names captured from the expansion site (not macro parameters):
 *   u, d - indices into tmp/dst of two other threads' words; presumably
 *          the rows above and below this thread's row (TODO confirm
 *          against the caller).
 *
 * What the boolean network computes, per bit position:
 *   - dst holds the low bit and tmp the carry bit of (al + ar + a), where
 *     al/ar are 'a' shifted one position each way (assuming ROTL64 and
 *     ROTR64 are 64-bit rotates -- they are defined elsewhere).
 *   - The count of the eight surrounding bits is then
 *       pt8 + 2 * (tmp[u] + tmp[d] + sh + sv),
 *     and 'b' is set when that count is 3, or when it is 2 and the bit
 *     in 'a' is set.  If u/d really are the vertically adjacent rows this
 *     is the B3/S23 (Conway's Life) rule.
 *
 * Synchronisation:
 *   The macro contains two __syncthreads() calls, so every thread of the
 *   block must expand it under non-divergent control flow.  dst is only
 *   read before the second barrier and tmp only after it, which is why
 *   the two buffers must not alias.
 *
 * The body is a bare { ... } block that declares locals (al, ar, xor2,
 * uda, udx, sv, pt8, sh, pt4, xc3); arguments using those names would be
 * shadowed.
 */
#define ADVANCE_TILE_64(a, b, tmp, dst) { \
uint64_cu al = ROTL64(a, 1); \
uint64_cu ar = ROTR64(a, 1); \
uint64_cu xor2 = al ^ ar; /* low bit of al + ar */ \
(dst)[threadIdx.x] = xor2 ^ a; /* low bit of al + ar + a */ \
__syncthreads(); /* publish dst before reading other threads' entries */ \
uint64_cu uda = (dst)[u] & (dst)[d]; \
uint64_cu udx = (dst)[u] ^ (dst)[d]; \
uint64_cu sv = (xor2 & udx) | uda; /* carry of xor2 + dst[u] + dst[d] */ \
uint64_cu pt8 = xor2 ^ udx; /* low bit of the surrounding-bit count */ \
uint64_cu sh = al & ar; /* carry of al + ar */ \
(tmp)[threadIdx.x] = sh | (a & xor2); /* carry of al + ar + a */ \
__syncthreads(); /* publish tmp; all reads of dst are finished here */ \
uint64_cu pt4 = ((tmp)[u] ^ (tmp)[d]) ^ (sh ^ sv); /* odd number of weight-2 bits */ \
uint64_cu xc3 = ((tmp)[u] | (tmp)[d]) ^ (sh | sv); /* with pt4: exactly one, not three */ \
b = pt4 & xc3 & (pt8 | a); /* count == 3, or count == 2 and bit already set */ \
}
......@@ -9,23 +9,7 @@
* and stores the result in register 'b'. The pointers 'tmp' and 'dst'
* must refer to disjoint 512-byte blocks of shared memory.
*/
/*
 * Implementation notes for ADVANCE_TILE_64:
 *
 * - 'u' and 'd' are not parameters; they are taken from the scope in
 *   which the macro is expanded and index other threads' entries of
 *   tmp/dst (presumably the rows above and below -- TODO confirm).
 * - Per bit position, dst receives the low bit and tmp the carry bit of
 *   (al + ar + a).  The count of the eight surrounding bits is
 *   pt8 + 2 * (tmp[u] + tmp[d] + sh + sv); 'b' is set when the count is
 *   3, or when it is 2 and the corresponding bit of 'a' is set.
 * - Two __syncthreads() calls are issued, so all threads of the block
 *   must expand the macro under non-divergent control flow.  dst is read
 *   only before the second barrier and tmp only after it.
 * - 'a' is expanded several times without parentheses; pass a plain
 *   variable.  The locals declared in the body (al, ar, xor2, uda, udx,
 *   sv, pt8, sh, pt4, xc3) shadow same-named arguments.
 * - ROTL64 / ROTR64 are defined elsewhere; assumed to be 64-bit rotates.
 */
#define ADVANCE_TILE_64(a, b, tmp, dst) { \
uint64_cu al = ROTL64(a, 1); \
uint64_cu ar = ROTR64(a, 1); \
uint64_cu xor2 = al ^ ar; /* low bit of al + ar */ \
(dst)[threadIdx.x] = xor2 ^ a; /* low bit of al + ar + a */ \
__syncthreads(); /* publish dst before reading other threads' entries */ \
uint64_cu uda = (dst)[u] & (dst)[d]; \
uint64_cu udx = (dst)[u] ^ (dst)[d]; \
uint64_cu sv = (xor2 & udx) | uda; /* carry of xor2 + dst[u] + dst[d] */ \
uint64_cu pt8 = xor2 ^ udx; /* low bit of the surrounding-bit count */ \
uint64_cu sh = al & ar; /* carry of al + ar */ \
(tmp)[threadIdx.x] = sh | (a & xor2); /* carry of al + ar + a */ \
__syncthreads(); /* publish tmp; all reads of dst are finished here */ \
uint64_cu pt4 = ((tmp)[u] ^ (tmp)[d]) ^ (sh ^ sv); /* odd number of weight-2 bits */ \
uint64_cu xc3 = ((tmp)[u] | (tmp)[d]) ^ (sh | sv); /* with pt4: exactly one, not three */ \
b = pt4 & xc3 & (pt8 | a); /* count == 3, or count == 2 and bit already set */ \
}
#include "../avxlife/lifelogic/iterators_gpu.h"
#define ZERO_SIX_TILES multiverse[threadnum + 128] = 0; multiverse[threadnum + 192] = 0; \
multiverse[threadnum + 256] = 0; multiverse[threadnum + 320] = 0; \
......
......@@ -10,6 +10,24 @@ def mantissa(rulestring):
def create_rule(rulestring):
with open('iterators_gpu.h', 'w') as f:
f.write('''#define ADVANCE_TILE_64(a, b, tmp, dst) { \\
uint64_cu al = ROTL64(a, 1); \\
uint64_cu ar = ROTR64(a, 1); \\
uint64_cu xor2 = al ^ ar; \\
(dst)[threadIdx.x] = xor2 ^ a; \\
__syncthreads(); \\
uint64_cu uda = (dst)[u] & (dst)[d]; \\
uint64_cu udx = (dst)[u] ^ (dst)[d]; \\
uint64_cu sv = (xor2 & udx) | uda; \\
uint64_cu pt8 = xor2 ^ udx; \\
uint64_cu sh = al & ar; \\
(tmp)[threadIdx.x] = sh | (a & xor2); \\
__syncthreads(); \\
uint64_cu pt4 = ((tmp)[u] ^ (tmp)[d]) ^ (sh ^ sv); \\
uint64_cu xc3 = ((tmp)[u] | (tmp)[d]) ^ (sh | sv); \\
b = pt4 & xc3 & (pt8 | a); \\\n}\n\n''')
with open('iterators_%s.h' % rulestring, 'w') as f:
f.write('#pragma once\n')
f.write('#include <stdint.h>\n')
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment