Commit 02cb35a9 authored by Adam P. Goucher

Use of __restrict__ keyword to allow compiler optimisations

parent a3458a6e
Pipeline #49150908 passed with stages
in 7 minutes and 43 seconds
#ifndef LIFELIB_VERSION /* C/Python polyglot: keep exactly ONE version token between the markers
__version__=[x.replace('"', '') for x in '''
*/
#define LIFELIB_VERSION "ll2.1.20"
// '''.split() if ('ll' in x)][0][2:]
#endif
......@@ -20,6 +20,66 @@
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0};
// For rows K .. H+K-1, replaces the lowest K bits of d[i] with bits
// [32-2K, 32-K) of n_d[i] and keeps the remaining bits of d[i].
//
//   d   : rows of the tile being updated (read and written)
//   n_d : rows of the neighbouring tile (read only)
//
// The two pointers are declared __restrict__, so they must not alias.
// The shift distance is derived from K (32 - 2*K, i.e. 28 when K = 2) so
// that it stays consistent with the K-dependent masks.
template<int H, int K>
inline void _copyBoundary3(uint32_t * __restrict__ d, uint32_t * __restrict__ n_d) {
    static_assert((K > 0) && (K < 16), "K must leave a non-empty interior in a 32-bit row");
    constexpr uint32_t middle = ((1u << (32 - K)) - (1u << K)); // e.g. 0x3ffffffcu for K = 2
    constexpr uint32_t left = ~((1u << K) - 1u);                // e.g. 0xfffffffcu for K = 2
    constexpr int shift = 32 - 2*K;                             // e.g. 28 for K = 2
    for (int i = K; i < H + K; i++) {
        d[i] = ((n_d[i] & middle) >> shift) | (d[i] & left);
    }
}
// For rows K .. H+K-1, replaces the highest K bits of d[i] with bits
// [K, 2K) of n_d[i] and keeps the remaining bits of d[i].
//
//   d   : rows of the tile being updated (read and written)
//   n_d : rows of the neighbouring tile (read only)
//
// The two pointers are declared __restrict__, so they must not alias.
// The shift distance is derived from K (32 - 2*K, i.e. 28 when K = 2) so
// that it stays consistent with the K-dependent masks.
template<int H, int K>
inline void _copyBoundary0(uint32_t * __restrict__ d, uint32_t * __restrict__ n_d) {
    static_assert((K > 0) && (K < 16), "K must leave a non-empty interior in a 32-bit row");
    constexpr uint32_t middle = ((1u << (32 - K)) - (1u << K)); // e.g. 0x3ffffffcu for K = 2
    constexpr uint32_t right = ((1u << (32 - K)) - 1u);         // e.g. 0x3fffffffu for K = 2
    constexpr int shift = 32 - 2*K;                             // e.g. 28 for K = 2
    for (int i = K; i < H + K; i++) {
        d[i] = ((n_d[i] & middle) << shift) | (d[i] & right);
    }
}
// Overwrites rows 0 .. K-1 of d. The upper 16 bits of each row come from the
// matching row H+i of n1_d shifted left by 14; the lower 16 bits come from
// row H+i of n2_d shifted right by 14. The previous contents of d[i] are
// discarded. All three pointers are __restrict__ and must not alias.
// NOTE(review): 14 presumably equals 16 - K for K = 2 -- confirm before
// instantiating with another K.
template<int H, int K>
inline void _copyBoundary12(uint32_t * __restrict__ d, uint32_t * __restrict__ n1_d, uint32_t * __restrict__ n2_d) {
    constexpr uint32_t upperHalf = 0xffff0000u;
    constexpr uint32_t lowerHalf = 0x0000ffffu;
    for (int row = 0; row < K; ++row) {
        const uint32_t fromFirst = (n1_d[H + row] << 14) & upperHalf;
        const uint32_t fromSecond = (n2_d[H + row] >> 14) & lowerHalf;
        d[row] = fromFirst | fromSecond;
    }
}
// Refreshes the upper 16 bits of rows 0 .. K-1 of d from rows H .. H+K-1 of
// n_d (shifted left by 14); the lower 16 bits of each d row are preserved.
// Both pointers are __restrict__ and must not alias.
// NOTE(review): 14 presumably equals 16 - K for K = 2 -- confirm before
// instantiating with another K.
template<int H, int K>
inline void _copyBoundary1(uint32_t * __restrict__ d, uint32_t * __restrict__ n_d) {
    constexpr uint32_t upperHalf = 0xffff0000u;
    constexpr uint32_t lowerHalf = 0x0000ffffu;
    for (int row = 0; row < K; ++row) {
        const uint32_t incoming = (n_d[H + row] << 14) & upperHalf;
        const uint32_t retained = d[row] & lowerHalf;
        d[row] = incoming | retained;
    }
}
// Refreshes the lower 16 bits of rows 0 .. K-1 of d from rows H .. H+K-1 of
// n_d (shifted right by 14); the upper 16 bits of each d row are preserved.
// Both pointers are __restrict__ and must not alias.
// NOTE(review): 14 presumably equals 16 - K for K = 2 -- confirm before
// instantiating with another K.
template<int H, int K>
inline void _copyBoundary2(uint32_t * __restrict__ d, uint32_t * __restrict__ n_d) {
    constexpr uint32_t upperHalf = 0xffff0000u;
    constexpr uint32_t lowerHalf = 0x0000ffffu;
    for (int row = 0; row < K; ++row) {
        const uint32_t incoming = (n_d[H + row] >> 14) & lowerHalf;
        const uint32_t retained = d[row] & upperHalf;
        d[row] = incoming | retained;
    }
}
// Overwrites rows H+K .. H+2K-1 of d. The upper 16 bits of each row come from
// row K+j of n5_d shifted left by 14; the lower 16 bits come from row K+j of
// n4_d shifted right by 14. The previous contents of those d rows are
// discarded. All three pointers are __restrict__ and must not alias.
// NOTE(review): 14 presumably equals 16 - K for K = 2 -- confirm before
// instantiating with another K.
template<int H, int K>
inline void _copyBoundary45(uint32_t * __restrict__ d, uint32_t * __restrict__ n4_d, uint32_t * __restrict__ n5_d) {
    constexpr uint32_t upperHalf = 0xffff0000u;
    constexpr uint32_t lowerHalf = 0x0000ffffu;
    for (int j = 0; j < K; ++j) {
        const int src = K + j;
        const uint32_t fromFifth = (n5_d[src] << 14) & upperHalf;
        const uint32_t fromFourth = (n4_d[src] >> 14) & lowerHalf;
        d[H + src] = fromFifth | fromFourth;
    }
}
// Refreshes the lower 16 bits of rows H+K .. H+2K-1 of d from rows K .. 2K-1
// of n_d (shifted right by 14); the upper 16 bits of those d rows are
// preserved. Both pointers are __restrict__ and must not alias.
// NOTE(review): 14 presumably equals 16 - K for K = 2 -- confirm before
// instantiating with another K.
template<int H, int K>
inline void _copyBoundary4(uint32_t * __restrict__ d, uint32_t * __restrict__ n_d) {
    constexpr uint32_t upperHalf = 0xffff0000u;
    constexpr uint32_t lowerHalf = 0x0000ffffu;
    for (int j = 0; j < K; ++j) {
        const int src = K + j;
        const uint32_t incoming = (n_d[src] >> 14) & lowerHalf;
        const uint32_t retained = d[H + src] & upperHalf;
        d[H + src] = incoming | retained;
    }
}
// Refreshes the upper 16 bits of rows H+K .. H+2K-1 of d from rows K .. 2K-1
// of n_d (shifted left by 14); the lower 16 bits of those d rows are
// preserved. Both pointers are __restrict__ and must not alias.
// NOTE(review): 14 presumably equals 16 - K for K = 2 -- confirm before
// instantiating with another K.
template<int H, int K>
inline void _copyBoundary5(uint32_t * __restrict__ d, uint32_t * __restrict__ n_d) {
    constexpr uint32_t upperHalf = 0xffff0000u;
    constexpr uint32_t lowerHalf = 0x0000ffffu;
    for (int j = 0; j < K; ++j) {
        const int src = K + j;
        const uint32_t incoming = (n_d[src] << 14) & upperHalf;
        const uint32_t retained = d[H + src] & lowerHalf;
        d[H + src] = incoming | retained;
    }
}
template<int H, int K = 2>
struct VTile {
......@@ -49,46 +109,37 @@
}
// Rebuilds rows 0 .. K-1 of this tile from rows H .. H+K-1 of n1 and n2.
// The work is done solely by _copyBoundary12, whose __restrict__ parameters
// let the compiler assume the three row arrays do not overlap; the former
// inline loop repeated the same computation and has been removed.
void copyBoundary12(VTile<H,K> *n1, VTile<H,K> *n2) __attribute__((always_inline)) {
    _copyBoundary12<H, K>(d, n1->d, n2->d);
}
// Refreshes the upper 16 bits of rows 0 .. K-1 of this tile from rows
// H .. H+K-1 of n. The work is done solely by _copyBoundary1 (__restrict__
// parameters); the former inline loop repeated the same computation and has
// been removed.
void copyBoundary1(VTile<H,K> *n) __attribute__((always_inline)) {
    _copyBoundary1<H, K>(d, n->d);
}
// Refreshes the lower 16 bits of rows 0 .. K-1 of this tile from rows
// H .. H+K-1 of n. The work is done solely by _copyBoundary2 (__restrict__
// parameters); the former inline loop repeated the same computation and has
// been removed.
void copyBoundary2(VTile<H,K> *n) __attribute__((always_inline)) {
    _copyBoundary2<H, K>(d, n->d);
}
// Rebuilds rows H+K .. H+2K-1 of this tile from rows K .. 2K-1 of n4 and n5.
// The work is done solely by _copyBoundary45 (__restrict__ parameters); the
// former inline loop repeated the same computation and has been removed.
void copyBoundary45(VTile<H,K> *n4, VTile<H,K> *n5) __attribute__((always_inline)) {
    _copyBoundary45<H, K>(d, n4->d, n5->d);
}
// Refreshes the lower 16 bits of rows H+K .. H+2K-1 of this tile from rows
// K .. 2K-1 of n. The work is done solely by _copyBoundary4 (__restrict__
// parameters); the former inline loop repeated the same computation and has
// been removed.
void copyBoundary4(VTile<H,K> *n) __attribute__((always_inline)) {
    _copyBoundary4<H, K>(d, n->d);
}
// Refreshes the upper 16 bits of rows H+K .. H+2K-1 of this tile from rows
// K .. 2K-1 of n. The work is done solely by _copyBoundary5 (__restrict__
// parameters); the former inline loop repeated the same computation and has
// been removed.
void copyBoundary5(VTile<H,K> *n) __attribute__((always_inline)) {
    _copyBoundary5<H, K>(d, n->d);
}
// Generic definitions of the two per-row boundary copies, valid for any H
// and K: each forwards this tile's rows and the neighbour's rows to the
// matching free function template. They are defined once, here; a member
// must not be declared a second time inside the class body, so the former
// trailing declarations have been dropped. Explicit specialisations for a
// particular H (such as the AVX-512 versions for H = 28 and H = 44 later in
// this file) replace these generic bodies for that H.
void copyBoundary3(VTile<H,K> *n) {
    _copyBoundary3<H, K>(d, n->d);
}

void copyBoundary0(VTile<H,K> *n) {
    _copyBoundary0<H, K>(d, n->d);
}
inline void updateTile(upattern<VTile<H,K>, 32 - 2*K, H>* owner, int rule, int family, int mantissa) __attribute__((always_inline));
// ^^^ we really do need both the prefix 'inline' and the attribute 'always_inline' for this to work ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
......@@ -200,9 +251,6 @@
if ((v == 0) || (z >= 2)) { return; }
if ((dx < 8) || (dx > 16) || (dy < 8) || (dy > H + 2*K - 16)) {
for (int i = 0; i < 6; i++) { owner->updateNeighbour(this, i); }
}
if (!(currentflags & 1)) { owner->popchanged.push_back(this); }
currentflags = 3;
if (updateflags == 0) { owner->modified.push_back(this); }
......@@ -210,15 +258,18 @@
uint32_t* q = (z ? hist : d);
if (dy > H - 8) {
owner->getNeighbour(this, 4)->eu64(owner, z, dx + (16 - K), 0, v >> (8 * (H - dy)));
}
if (dx >= (32 - 2*K)) {
owner->getNeighbour(this, 0)->eu64(owner, z, dx - (32 - 2*K), dy, v);
} else if (dx > (24 - 2*K)) {
uint64_t bitmask = 0x0101010101010101ull;
bitmask = (bitmask << (dx - (24 - 2*K))) - bitmask;
owner->getNeighbour(this, 0)->eu64(owner, z, 0, dy, (v >> ((32 - 2*K) - dx)) & bitmask);
if ((dx < 8) || (dx > 16) || (dy < 8) || (dy > H + 2*K - 16)) {
for (int i = 0; i < 6; i++) { owner->updateNeighbour(this, i); }
if (dy > H - 8) {
owner->getNeighbour(this, 4)->eu64(owner, z, dx + (16 - K), 0, v >> (8 * (H - dy)));
}
if (dx >= (32 - 2*K)) {
owner->getNeighbour(this, 0)->eu64(owner, z, dx - (32 - 2*K), dy, v);
} else if (dx > (24 - 2*K)) {
uint64_t bitmask = 0x0101010101010101ull;
bitmask = (bitmask << (dx - (24 - 2*K))) - bitmask;
owner->getNeighbour(this, 0)->eu64(owner, z, 0, dy, (v >> ((32 - 2*K) - dx)) & bitmask);
}
}
if (dx < (32 - 2*K)) {
......
// Specialisation of copyBoundary3 for H = 28 (with the default K = 2).
// For rows 2 .. 29 it replaces the two lowest bits of d[i] with bits 28-29
// of n->d[i] and keeps bits 2-31 of d[i]; the scalar loop in the #else
// branch is the reference form of that operation.
template<>
void VTile<28>::copyBoundary3(VTile<28> *n) {
#ifdef __AVX512F__
// AVX-512 path. The 28 rows occupy 112 bytes starting at byte offset 8.
// They are processed as one 64-byte zmm vector (offset 8), followed by a
// 32-byte ymm load (offset 72) and a 16-byte xmm load (offset 104) that are
// packed into a single zmm register, so the tail reuses the same
// shift-and-blend sequence before being stored back in two pieces.
// Operand %2 is presumably sixteen copies of 0xfffffffc (going by the
// constant's name), used as the bitwise select mask -- TODO confirm against
// its definition.
asm (
"vmovdqu64 8(%0), %%zmm6 \n\t"
"vmovdqu64 8(%1), %%zmm8 \n\t"
"vmovdqu64 (%2), %%zmm14 \n\t"
"vpsrld $28, %%zmm8, %%zmm8 \n\t"
"vpternlogd $228, %%zmm14, %%zmm8, %%zmm6 \n\t"
"vmovdqu64 %%zmm6, 8(%0) \n\t"
"vmovdqu 72(%0), %%ymm6 \n\t"
"vmovdqu 104(%0), %%xmm13 \n\t"
"vshufi32x4 $68, %%zmm13, %%zmm6, %%zmm6 \n\t"
"vmovdqu 72(%1), %%ymm8 \n\t"
"vmovdqu 104(%1), %%xmm13 \n\t"
"vshufi32x4 $68, %%zmm13, %%zmm8, %%zmm8 \n\t"
"vpsrld $28, %%zmm8, %%zmm8 \n\t"
"vpternlogd $228, %%zmm14, %%zmm8, %%zmm6 \n\t"
"vshufi32x4 $78, %%zmm6, %%zmm6, %%zmm13 \n\t"
"vmovdqu %%ymm6, 72(%0) \n\t"
"vmovdqu %%xmm13, 104(%0) \n\t"
: /* no output operands */
: "r" (d), "r" (n->d), "r" (apg::__16xfffffffc)
: "xmm6", "xmm8", "xmm13", "xmm14", "memory");
// d is modified through the pointer in %0 rather than through an output
// operand, which is why "memory" appears in the clobber list.
#else
// Portable fallback: same result, one row at a time.
for (int i = 2; i < 30; i++) {
d[i] = ((n->d[i] & 0x3ffffffcu) >> 28) | (d[i] & 0xfffffffcu);
}
#endif
}
// Specialisation of copyBoundary0 for H = 28 (with the default K = 2).
// For rows 2 .. 29 it replaces the two highest bits of d[i] with bits 2-3
// of n->d[i] and keeps bits 0-29 of d[i]; the scalar loop in the #else
// branch is the reference form of that operation.
template<>
void VTile<28>::copyBoundary0(VTile<28> *n) {
#ifdef __AVX512F__
// AVX-512 path. The 28 rows occupy 112 bytes starting at byte offset 8.
// They are processed as one 64-byte zmm vector (offset 8), followed by a
// 32-byte ymm load (offset 72) and a 16-byte xmm load (offset 104) that are
// packed into a single zmm register, so the tail reuses the same
// shift-and-blend sequence before being stored back in two pieces.
// Operand %2 is presumably sixteen copies of 0x3fffffff (going by the
// constant's name), used as the bitwise select mask -- TODO confirm against
// its definition.
asm (
"vmovdqu64 8(%0), %%zmm6 \n\t"
"vmovdqu64 8(%1), %%zmm8 \n\t"
"vmovdqu64 (%2), %%zmm14 \n\t"
"vpslld $28, %%zmm8, %%zmm8 \n\t"
"vpternlogd $228, %%zmm14, %%zmm8, %%zmm6 \n\t"
"vmovdqu64 %%zmm6, 8(%0) \n\t"
"vmovdqu 72(%0), %%ymm6 \n\t"
"vmovdqu 104(%0), %%xmm13 \n\t"
"vshufi32x4 $68, %%zmm13, %%zmm6, %%zmm6 \n\t"
"vmovdqu 72(%1), %%ymm8 \n\t"
"vmovdqu 104(%1), %%xmm13 \n\t"
"vshufi32x4 $68, %%zmm13, %%zmm8, %%zmm8 \n\t"
"vpslld $28, %%zmm8, %%zmm8 \n\t"
"vpternlogd $228, %%zmm14, %%zmm8, %%zmm6 \n\t"
"vshufi32x4 $78, %%zmm6, %%zmm6, %%zmm13 \n\t"
"vmovdqu %%ymm6, 72(%0) \n\t"
"vmovdqu %%xmm13, 104(%0) \n\t"
: /* no output operands */
: "r" (d), "r" (n->d), "r" (apg::__16x3fffffff)
: "xmm6", "xmm8", "xmm13", "xmm14", "memory");
// d is modified through the pointer in %0 rather than through an output
// operand, which is why "memory" appears in the clobber list.
#else
// Portable fallback: same result, one row at a time.
for (int i = 2; i < 30; i++) {
d[i] = ((n->d[i] & 0x3ffffffcu) << 28) | (d[i] & 0x3fffffffu);
}
#endif
}
template<>
inline void VTile<28>::updateTile(upattern<VTile<28>, 28, 28>* owner, int rule, int family, int mantissa) {
......
// Specialisation of copyBoundary3 for H = 44 (with the default K = 2).
// For rows 2 .. 45 it replaces the two lowest bits of d[i] with bits 28-29
// of n->d[i] and keeps bits 2-31 of d[i]; the scalar loop in the #else
// branch is the reference form of that operation.
template<>
void VTile<44>::copyBoundary3(VTile<44> *n) {
#ifdef __AVX512F__
// AVX-512 path. The 44 rows occupy 176 bytes starting at byte offset 8.
// They are processed as two 64-byte zmm vectors (offsets 8 and 72), followed
// by a 32-byte ymm load (offset 136) and a 16-byte xmm load (offset 168)
// that are packed into a single zmm register, so the tail reuses the same
// shift-and-blend sequence before being stored back in two pieces.
// Operand %2 is presumably sixteen copies of 0xfffffffc (going by the
// constant's name), used as the bitwise select mask -- TODO confirm against
// its definition.
asm (
"vmovdqu64 8(%0), %%zmm6 \n\t"
"vmovdqu64 8(%1), %%zmm8 \n\t"
"vmovdqu64 (%2), %%zmm14 \n\t"
"vpsrld $28, %%zmm8, %%zmm8 \n\t"
"vpternlogd $228, %%zmm14, %%zmm8, %%zmm6 \n\t"
"vmovdqu64 %%zmm6, 8(%0) \n\t"
"vmovdqu64 72(%0), %%zmm6 \n\t"
"vmovdqu64 72(%1), %%zmm8 \n\t"
"vpsrld $28, %%zmm8, %%zmm8 \n\t"
"vpternlogd $228, %%zmm14, %%zmm8, %%zmm6 \n\t"
"vmovdqu64 %%zmm6, 72(%0) \n\t"
"vmovdqu 136(%0), %%ymm6 \n\t"
"vmovdqu 168(%0), %%xmm13 \n\t"
"vshufi32x4 $68, %%zmm13, %%zmm6, %%zmm6 \n\t"
"vmovdqu 136(%1), %%ymm8 \n\t"
"vmovdqu 168(%1), %%xmm13 \n\t"
"vshufi32x4 $68, %%zmm13, %%zmm8, %%zmm8 \n\t"
"vpsrld $28, %%zmm8, %%zmm8 \n\t"
"vpternlogd $228, %%zmm14, %%zmm8, %%zmm6 \n\t"
"vshufi32x4 $78, %%zmm6, %%zmm6, %%zmm13 \n\t"
"vmovdqu %%ymm6, 136(%0) \n\t"
"vmovdqu %%xmm13, 168(%0) \n\t"
: /* no output operands */
: "r" (d), "r" (n->d), "r" (apg::__16xfffffffc)
: "xmm6", "xmm8", "xmm13", "xmm14", "memory");
// d is modified through the pointer in %0 rather than through an output
// operand, which is why "memory" appears in the clobber list.
#else
// Portable fallback: same result, one row at a time.
for (int i = 2; i < 46; i++) {
d[i] = ((n->d[i] & 0x3ffffffcu) >> 28) | (d[i] & 0xfffffffcu);
}
#endif
}
// Specialisation of copyBoundary0 for H = 44 (with the default K = 2).
// For rows 2 .. 45 it replaces the two highest bits of d[i] with bits 2-3
// of n->d[i] and keeps bits 0-29 of d[i]; the scalar loop in the #else
// branch is the reference form of that operation.
template<>
void VTile<44>::copyBoundary0(VTile<44> *n) {
#ifdef __AVX512F__
// AVX-512 path. The 44 rows occupy 176 bytes starting at byte offset 8.
// They are processed as two 64-byte zmm vectors (offsets 8 and 72), followed
// by a 32-byte ymm load (offset 136) and a 16-byte xmm load (offset 168)
// that are packed into a single zmm register, so the tail reuses the same
// shift-and-blend sequence before being stored back in two pieces.
// Operand %2 is presumably sixteen copies of 0x3fffffff (going by the
// constant's name), used as the bitwise select mask -- TODO confirm against
// its definition.
asm (
"vmovdqu64 8(%0), %%zmm6 \n\t"
"vmovdqu64 8(%1), %%zmm8 \n\t"
"vmovdqu64 (%2), %%zmm14 \n\t"
"vpslld $28, %%zmm8, %%zmm8 \n\t"
"vpternlogd $228, %%zmm14, %%zmm8, %%zmm6 \n\t"
"vmovdqu64 %%zmm6, 8(%0) \n\t"
"vmovdqu64 72(%0), %%zmm6 \n\t"
"vmovdqu64 72(%1), %%zmm8 \n\t"
"vpslld $28, %%zmm8, %%zmm8 \n\t"
"vpternlogd $228, %%zmm14, %%zmm8, %%zmm6 \n\t"
"vmovdqu64 %%zmm6, 72(%0) \n\t"
"vmovdqu 136(%0), %%ymm6 \n\t"
"vmovdqu 168(%0), %%xmm13 \n\t"
"vshufi32x4 $68, %%zmm13, %%zmm6, %%zmm6 \n\t"
"vmovdqu 136(%1), %%ymm8 \n\t"
"vmovdqu 168(%1), %%xmm13 \n\t"
"vshufi32x4 $68, %%zmm13, %%zmm8, %%zmm8 \n\t"
"vpslld $28, %%zmm8, %%zmm8 \n\t"
"vpternlogd $228, %%zmm14, %%zmm8, %%zmm6 \n\t"
"vshufi32x4 $78, %%zmm6, %%zmm6, %%zmm13 \n\t"
"vmovdqu %%ymm6, 136(%0) \n\t"
"vmovdqu %%xmm13, 168(%0) \n\t"
: /* no output operands */
: "r" (d), "r" (n->d), "r" (apg::__16x3fffffff)
: "xmm6", "xmm8", "xmm13", "xmm14", "memory");
// d is modified through the pointer in %0 rather than through an output
// operand, which is why "memory" appears in the clobber list.
#else
// Portable fallback: same result, one row at a time.
for (int i = 2; i < 46; i++) {
d[i] = ((n->d[i] & 0x3ffffffcu) << 28) | (d[i] & 0x3fffffffu);
}
#endif
}
template<>
inline void VTile<44>::updateTile(upattern<VTile<44>, 28, 44>* owner, int rule, int family, int mantissa) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment