Commit c2ed21df authored by Adam P. Goucher

Split huge file into smaller files

parent 4e0908cc
#pragma once
#include "iterators.h"
#include <iostream>
// ZERO_SIX_TILES: zeroes one 64-bit word in each of six consecutive 64-word
// tiles (offsets +128, +192, ..., +448 from 'threadnum') and then advances
// 'threadnum' by 384 (= 6 * 64) so that the next invocation continues with
// the following six tiles. Expects 'multiverse' (indexable, assignable) and
// 'threadnum' (integer lvalue) to be in scope at the point of use.
// Wrapped in do { ... } while (0) so that the whole group behaves as a single
// statement, e.g. when used as the body of an unbraced if/for.
#define ZERO_SIX_TILES do { \
    multiverse[threadnum + 128] = 0; multiverse[threadnum + 192] = 0; \
    multiverse[threadnum + 256] = 0; multiverse[threadnum + 320] = 0; \
    multiverse[threadnum + 384] = 0; multiverse[threadnum + 448] = 0; \
    threadnum += 384; \
} while (0)
/*
 * copyhashes_C1: (re)initialises one universe per thread block from a 256-bit
 * hash, without applying any mirror symmetry.
 *
 * Launch layout: one block per universe, 64 threads per block (the shared
 * buffers below hold 64 entries and neighbour indices are taken modulo 64).
 *
 * multiverse : universe storage; block i uses the 8192-word region starting
 *              at (univec[i] << 13).
 * univec     : per-block universe index.
 * hashes     : hash words, four 64-bit words per hash, indexed by hash number.
 * offset     : added to blockIdx.x to obtain the hash number.
 * initial    : when true, the stored size field is ignored and the whole
 *              127-tile region is zeroed.
 */
__global__ void copyhashes_C1(uint64_cu *multiverse, uint32_cu *univec, uint64_cu *hashes, uint32_cu offset, bool initial) {
// Initialise memory:
uint32_cu hashnum = blockIdx.x + offset;
uint32_cu uniidx = univec[blockIdx.x];
// New header word (only thread 0 keeps it, see below). 0x600000 sets bits
// 21-22, i.e. the value 3 in the field the memory map calls 'gencount / 6';
// 0x4000000000 sets bit 38, which lies above the 32-bit header described in
// the memory map -- NOTE(review): its meaning is not evident from this file.
uint64_cu b = 0x4000600000ull + hashnum;
/*
Memory map (byte offsets within one universe; 0x10000 bytes corresponds to
the 8192 words implied by 'uniidx << 13', assuming uint64_cu is 8 bytes):
0x0000 -- 0x0003: header (lowest 19 bits = index; middle 2 bits = usize; upper 11 bits = gencount / 6)
0x0004 -- 0x01ff: flags for each of 127 tiles
0x0200 -- 0xffff: 127 tiles
*/
// Read the size class (bits 19-20) from the header currently in memory.
int usize = (multiverse[uniidx << 13] >> 19) & 3;
if (initial) { usize = 3; }
// Every thread must have read the old header before thread 0 overwrites it.
__syncthreads();
uint32_cu threadnum = (uniidx << 13) + threadIdx.x;
// Thread 0 writes the header word; threads 1..63 write zero to the remaining
// words of the first 64-word (header + flags) region.
if (threadIdx.x) { b = 0; }
multiverse[threadnum] = b; b = 0;
// Copy SHA-256 hash:
// Threads 24..39 each take 16 bits of the hash (four threads per 64-bit word)
// and place them in bits 24..39 of their row; all other rows stay zero.
if ((threadIdx.x >= 24) && (threadIdx.x < 40)) {
b = hashes[(hashnum << 2) + (threadIdx.x >> 2) - 6];
b = (b >> (16 * (threadIdx.x & 3)));
b = ((b & 0x00ff) << 32) | ((b & 0xff00) << 16);
}
#ifdef SKIP_18_GENS
// Advance by 18 generations:
// ADVANCE_TILE_64 is defined outside this file; 'u' and 'd' are the indices
// of the next and previous thread modulo 64 and are presumably consumed
// implicitly by the macro -- TODO confirm against its definition.
__shared__ uint64_cu tmp1[64];
__shared__ uint64_cu tmp2[64];
uint32_cu u = (threadIdx.x + 1) & 63;
uint32_cu d = (threadIdx.x + 63) & 63;
ADVANCE_TILE_64(b, b, tmp1, tmp2)
ADVANCE_TILE_64(b, b, tmp1, tmp2)
ADVANCE_TILE_64(b, b, tmp1, tmp2)
ADVANCE_TILE_64(b, b, tmp1, tmp2)
ADVANCE_TILE_64(b, b, tmp1, tmp2)
ADVANCE_TILE_64(b, b, tmp1, tmp2)
ADVANCE_TILE_64(b, b, tmp1, tmp2)
ADVANCE_TILE_64(b, b, tmp1, tmp2)
ADVANCE_TILE_64(b, b, tmp1, tmp2)
ADVANCE_TILE_64(b, b, tmp1, tmp2)
ADVANCE_TILE_64(b, b, tmp1, tmp2)
ADVANCE_TILE_64(b, b, tmp1, tmp2)
ADVANCE_TILE_64(b, b, tmp1, tmp2)
ADVANCE_TILE_64(b, b, tmp1, tmp2)
ADVANCE_TILE_64(b, b, tmp1, tmp2)
ADVANCE_TILE_64(b, b, tmp1, tmp2)
ADVANCE_TILE_64(b, b, tmp1, tmp2)
ADVANCE_TILE_64(b, b, tmp1, tmp2)
#endif
// Save into multiverse:
// The first tile starts 64 words after the start of the universe.
multiverse[threadnum + 64] = b;
// Zero remaining array:
// Each ZERO_SIX_TILES clears this thread's word in six further tiles and
// advances threadnum by 384; how many tiles are cleared depends on usize.
ZERO_SIX_TILES; ZERO_SIX_TILES; ZERO_SIX_TILES;
if (usize == 0) { return; } // 19 tiles
ZERO_SIX_TILES; ZERO_SIX_TILES; ZERO_SIX_TILES;
if (usize == 1) { return; } // 37 tiles
ZERO_SIX_TILES; ZERO_SIX_TILES; ZERO_SIX_TILES;
ZERO_SIX_TILES; ZERO_SIX_TILES; ZERO_SIX_TILES;
ZERO_SIX_TILES; ZERO_SIX_TILES; ZERO_SIX_TILES;
if (usize == 2) { return; } // 91 tiles
// usize == 3: clear all 127 tiles.
ZERO_SIX_TILES; ZERO_SIX_TILES; ZERO_SIX_TILES;
ZERO_SIX_TILES; ZERO_SIX_TILES; ZERO_SIX_TILES;
}
/*
 * copyhashes_D2_p1: (re)initialises one universe per thread block from a
 * 256-bit hash, mirrored about row 32 (rows 17..31 repeat rows 47..33).
 *
 * Launch layout: one block per universe, 64 threads per block; thread t
 * handles word t of every 64-word tile.
 *
 * multiverse : universe storage; block i uses the 8192-word region starting
 *              at (univec[i] << 13).
 * univec     : per-block universe index.
 * hashes     : hash words, four 64-bit words per hash.
 * offset     : added to blockIdx.x to obtain the hash number.
 * initial    : when true, all 127 tiles are cleared regardless of the size
 *              class stored in the existing header.
 */
__global__ void copyhashes_D2_p1(uint64_cu *multiverse, uint32_cu *univec, uint64_cu *hashes, uint32_cu offset, bool initial) {
    const uint32_cu lane = threadIdx.x;
    const uint32_cu hash_index = blockIdx.x + offset;
    const uint32_cu slot = univec[blockIdx.x];

    // Size class lives in bits 19-20 of the header that is currently stored.
    const uint64_cu old_header = multiverse[slot << 13];
    const int size_class = initial ? 3 : (int) ((old_header >> 19) & 3);

    // All threads must have read the old header before it is overwritten.
    __syncthreads();

    const uint32_cu base = (slot << 13) + lane;

    // First 64 words: thread 0 stores the new header, everyone else zero.
    uint64_cu header_word = 0;
    if (lane == 0) { header_word = 0x4000600000ull + hash_index; }
    multiverse[base] = header_word;

    // Decide which hash row (if any) this thread's row is taken from.
    bool seeded = false;
    uint32_cu src_row = 0;
    if ((lane >= 32) && (lane < 48)) {
        seeded = true;
        src_row = lane;
    } else if ((lane > 16) && (lane < 32)) {
        // Mirror image of rows 33..47; row 16 has no counterpart.
        seeded = true;
        src_row = 64 - lane;
    }

    // Extract 16 bits of the hash and place them in bits 24..39 of the row.
    uint64_cu row_bits = 0;
    if (seeded) {
        uint64_cu word = hashes[(hash_index << 2) + (src_row >> 2) - 8];
        word = (word >> (16 * (src_row & 3)));
        row_bits = ((word & 0x00ff) << 32) | ((word & 0xff00) << 16);
    }

    // The first tile begins 64 words into the universe.
    multiverse[base + 64] = row_bits;

    // Clear the remaining tiles: 19, 37, 91 or 127 tiles in total.
    uint32_cu extra_tiles;
    switch (size_class) {
        case 0:  extra_tiles = 18;  break;
        case 1:  extra_tiles = 36;  break;
        case 2:  extra_tiles = 90;  break;
        default: extra_tiles = 126; break;
    }
    for (uint32_cu k = 0; k < extra_tiles; k++) {
        multiverse[base + 128 + (k << 6)] = 0;
    }
}
/*
 * copyhashes_D2_p2: (re)initialises one universe per thread block from a
 * 256-bit hash, mirrored between rows 31 and 32 (rows 16..31 repeat rows
 * 47..32).
 *
 * Launch layout: one block per universe, 64 threads per block; thread t
 * handles word t of every 64-word tile.
 *
 * multiverse : universe storage; block i uses the 8192-word region starting
 *              at (univec[i] << 13).
 * univec     : per-block universe index.
 * hashes     : hash words, four 64-bit words per hash.
 * offset     : added to blockIdx.x to obtain the hash number.
 * initial    : when true, all 127 tiles are cleared regardless of the size
 *              class stored in the existing header.
 */
__global__ void copyhashes_D2_p2(uint64_cu *multiverse, uint32_cu *univec, uint64_cu *hashes, uint32_cu offset, bool initial) {
    const uint32_cu lane = threadIdx.x;
    const uint32_cu hash_index = blockIdx.x + offset;
    const uint32_cu slot = univec[blockIdx.x];

    // Size class lives in bits 19-20 of the header that is currently stored.
    const uint64_cu old_header = multiverse[slot << 13];
    const int size_class = initial ? 3 : (int) ((old_header >> 19) & 3);

    // All threads must have read the old header before it is overwritten.
    __syncthreads();

    const uint32_cu base = (slot << 13) + lane;

    // First 64 words: thread 0 stores the new header, everyone else zero.
    uint64_cu header_word = 0;
    if (lane == 0) { header_word = 0x4000600000ull + hash_index; }
    multiverse[base] = header_word;

    // Decide which hash row (if any) this thread's row is taken from.
    bool seeded = false;
    uint32_cu src_row = 0;
    if ((lane >= 32) && (lane < 48)) {
        seeded = true;
        src_row = lane;
    } else if ((lane >= 16) && (lane < 32)) {
        // Mirror image of rows 32..47.
        seeded = true;
        src_row = 63 - lane;
    }

    // Extract 16 bits of the hash and place them in bits 24..39 of the row.
    uint64_cu row_bits = 0;
    if (seeded) {
        uint64_cu word = hashes[(hash_index << 2) + (src_row >> 2) - 8];
        word = (word >> (16 * (src_row & 3)));
        row_bits = ((word & 0x00ff) << 32) | ((word & 0xff00) << 16);
    }

    // The first tile begins 64 words into the universe.
    multiverse[base + 64] = row_bits;

    // Clear the remaining tiles: 19, 37, 91 or 127 tiles in total.
    uint32_cu extra_tiles;
    switch (size_class) {
        case 0:  extra_tiles = 18;  break;
        case 1:  extra_tiles = 36;  break;
        case 2:  extra_tiles = 90;  break;
        default: extra_tiles = 126; break;
    }
    for (uint32_cu k = 0; k < extra_tiles; k++) {
        multiverse[base + 128 + (k << 6)] = 0;
    }
}
/*
 * copyhashes: host-side dispatcher that launches the hash-copying kernel
 * matching the requested symmetry.
 *
 * full_symmetry  : symmetry name; a leading 'G' is treated as 'C' and a
 *                  leading 'H' as 'D' before dispatch. Recognised names after
 *                  that mapping are "C1", "D2_+1" and "D2_+2".
 * universes_left : number of blocks to launch (one per universe); nothing is
 *                  launched when this is <= 0.
 * multiverse, univec, hashes, offset, initial : forwarded unchanged to the
 *                  kernel.
 *
 * Terminates the process with exit(1) if the symmetry is not recognised or
 * if the CUDA runtime reports an error after the launch. The launch itself is
 * asynchronous; faults that occur while the kernel runs are not detected
 * here.
 */
void copyhashes(std::string full_symmetry, int universes_left, uint64_cu *multiverse,
uint32_cu *univec, uint64_cu *hashes, uint32_cu offset, bool initial) {
    if (universes_left <= 0) { return; }

    // Map the 'G'/'H' prefixes onto the corresponding 'C'/'D' symmetry names.
    std::string symmetry = full_symmetry;
    if (symmetry[0] == 'G') {
        symmetry = "C" + symmetry.substr(1);
    } else if (symmetry[0] == 'H') {
        symmetry = "D" + symmetry.substr(1);
    }

    // Every kernel uses one block per universe and 64 threads per block.
    if (symmetry == "C1") {
        copyhashes_C1<<<universes_left, 64>>>(multiverse, univec, hashes, offset, initial);
    } else if (symmetry == "D2_+1") {
        copyhashes_D2_p1<<<universes_left, 64>>>(multiverse, univec, hashes, offset, initial);
    } else if (symmetry == "D2_+2") {
        copyhashes_D2_p2<<<universes_left, 64>>>(multiverse, univec, hashes, offset, initial);
    } else {
        std::cerr << "Fatal: symmetry " << symmetry << " unrecognised!!!" << std::endl;
        exit(1);
    }

    // Kernel launches do not return a status; a rejected launch would
    // otherwise leave the universes silently uninitialised.
    cudaError_t launch_status = cudaGetLastError();
    if (launch_status != cudaSuccess) {
        std::cerr << "Fatal: copyhashes kernel launch failed: "
                  << cudaGetErrorString(launch_status) << std::endl;
        exit(1);
    }
}
#pragma once
#include "basics.h"
/*
 * exclusive_scan_uint32_256: per-block exclusive prefix sum over 256
 * consecutive 32-bit values.
 *
 * Launch layout: 256 threads per block; block k processes elements
 * [256k, 256k + 255] of 'ina'.
 *
 * ina   : input values.
 * outa  : receives, for each element, the sum of all earlier elements in the
 *         same block.
 * total : total[k] receives the sum of all 256 elements of block k
 *         (accumulated in 32 bits before being stored).
 */
__global__ void exclusive_scan_uint32_256(uint32_cu *ina, uint32_cu *outa, uint64_cu *total) {
    // Two alternating staging buffers, so that a step never overwrites values
    // that other threads may still be reading from the previous step.
    __shared__ uint32_cu stage[2][256];

    const uint32_cu lane = threadIdx.x;
    const uint32_cu gidx = (blockIdx.x << 8) + lane;

    const uint32_cu own = ina[gidx];
    uint32_cu inclusive = own;

    // Doubling-stride scan: after the step with stride s, 'inclusive' holds
    // the sum of the 2s elements ending at this lane (or fewer near lane 0).
    int phase = 0;
    for (uint32_cu stride = 1; stride < 256; stride <<= 1) {
        stage[phase][lane] = inclusive;
        __syncthreads();
        if (lane >= stride) { inclusive += stage[phase][lane - stride]; }
        phase ^= 1;
    }

    // Convert the inclusive sum to an exclusive one by removing own value.
    outa[gidx] = inclusive - own;
    if (lane == 255) { total[blockIdx.x] = inclusive; }
}
/*
 * exclusive_scan_uint32: per-block exclusive prefix sum over 128 consecutive
 * 32-bit values.
 *
 * Launch layout: 128 threads per block; block k processes elements
 * [128k, 128k + 127] of 'ina'.
 *
 * ina   : input values.
 * outa  : receives, for each element, the sum of all earlier elements in the
 *         same block.
 * total : total[k] receives the sum of all 128 elements of block k
 *         (accumulated in 32 bits before being stored).
 */
__global__ void exclusive_scan_uint32(uint32_cu *ina, uint32_cu *outa, uint64_cu *total) {
    // Two alternating staging buffers, so that a step never overwrites values
    // that other threads may still be reading from the previous step.
    __shared__ uint32_cu stage[2][128];

    const uint32_cu lane = threadIdx.x;
    const uint32_cu gidx = (blockIdx.x << 7) + lane;

    const uint32_cu own = ina[gidx];
    uint32_cu inclusive = own;

    // Doubling-stride scan over strides 1, 2, 4, ..., 64.
    int phase = 0;
    for (uint32_cu stride = 1; stride < 128; stride <<= 1) {
        stage[phase][lane] = inclusive;
        __syncthreads();
        if (lane >= stride) { inclusive += stage[phase][lane - stride]; }
        phase ^= 1;
    }

    // Convert the inclusive sum to an exclusive one by removing own value.
    outa[gidx] = inclusive - own;
    if (lane == 127) { total[blockIdx.x] = inclusive; }
}
/*
 * exclusive_scan_uint64_128: per-block exclusive prefix sum over 128
 * consecutive 64-bit values.
 *
 * Launch layout: 128 threads per block; block k processes elements
 * [128k, 128k + 127] of 'ina'.
 *
 * ina   : input values.
 * outa  : receives, for each element, the sum of all earlier elements in the
 *         same block.
 * total : total[k] receives the sum of all 128 elements of block k.
 */
__global__ void exclusive_scan_uint64_128(uint64_cu *ina, uint64_cu *outa, uint64_cu *total) {
    // Two alternating staging buffers, so that a step never overwrites values
    // that other threads may still be reading from the previous step.
    __shared__ uint64_cu stage[2][128];

    const uint32_cu lane = threadIdx.x;
    const uint32_cu gidx = (blockIdx.x << 7) + lane;

    const uint64_cu own = ina[gidx];
    uint64_cu inclusive = own;

    // Doubling-stride scan over strides 1, 2, 4, ..., 64.
    int phase = 0;
    for (uint32_cu stride = 1; stride < 128; stride <<= 1) {
        stage[phase][lane] = inclusive;
        __syncthreads();
        if (lane >= stride) { inclusive += stage[phase][lane - stride]; }
        phase ^= 1;
    }

    // Convert the inclusive sum to an exclusive one by removing own value.
    outa[gidx] = inclusive - own;
    if (lane == 127) { total[blockIdx.x] = inclusive; }
}
/*
 * exclusive_scan_uint64: per-block exclusive prefix sum over 64 consecutive
 * 64-bit values.
 *
 * Launch layout: 64 threads per block; block k processes elements
 * [64k, 64k + 63] of 'ina'.
 *
 * ina   : input values.
 * outa  : receives, for each element, the sum of all earlier elements in the
 *         same block.
 * total : total[k] receives the sum of all 64 elements of block k.
 */
__global__ void exclusive_scan_uint64(uint64_cu *ina, uint64_cu *outa, uint64_cu *total) {
    // Two alternating staging buffers, so that a step never overwrites values
    // that other threads may still be reading from the previous step.
    __shared__ uint64_cu stage[2][64];

    const uint32_cu lane = threadIdx.x;
    const uint32_cu gidx = (blockIdx.x << 6) + lane;

    const uint64_cu own = ina[gidx];
    uint64_cu inclusive = own;

    // Doubling-stride scan over strides 1, 2, 4, ..., 32.
    int phase = 0;
    for (uint32_cu stride = 1; stride < 64; stride <<= 1) {
        stage[phase][lane] = inclusive;
        __syncthreads();
        if (lane >= stride) { inclusive += stage[phase][lane - stride]; }
        phase ^= 1;
    }

    // Convert the inclusive sum to an exclusive one by removing own value.
    outa[gidx] = inclusive - own;
    if (lane == 63) { total[blockIdx.x] = inclusive; }
}
This diff is collapsed.
#pragma once
#include "basics.h"
/*
* Takes a 64x64 universe stored across register 'a' of a block of
* 64 CUDA threads, iterates it one generation (with toroidal wrap),
* and stores the result in register 'b'. The pointers 'tmp' and 'dst'
* must refer to disjoint 512-byte blocks of shared memory.
*/
#include "../avxlife/lifelogic/iterators_gpu.h"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment