Commit 5f0cd4b7, authored by Adam P. Goucher

Added iter4_var_leaf

parent ff4306af
......@@ -2,6 +2,44 @@
* This file is #included into each outer-totalistic rule namespace.
*/
uint64_t r32_centre_to_u64(uint32_t* d) {
    /*
    * Packs bits 12..19 of each of the rows d[4] .. d[11] into a single
    * 64-bit word. Row d[4] supplies the least significant byte and
    * row d[11] the most significant byte.
    */
    uint64_t packed = 0;
    for (int row = 0; row < 8; ++row) {
        // Isolate the 8-bit slice of this row, then drop it into its byte lane.
        const uint64_t centre_byte = (d[row + 4] >> 12) & 0xffu;
        packed |= centre_byte << (8 * row);
    }
    return packed;
}
void iter4_var_leaf(uint64_t * inleaf, uint64_t * centres) {
    /*
    * Runs a 16-by-16 leaf forward by a further 4 iterations of the rule
    * and extracts the surviving 8-by-8 centre.
    *
    *   inleaf  : the leaf, as consumed by apg::z64_to_r32_centre_*.
    *   centres : output; centres[0] receives the live cells and
    *             centres[1] receives the history envelope.
    */
    uint32_t rows[16];
    uint32_t work[16];
    uint32_t envelope[16];

    // The history buffer must start out empty; the other two buffers are
    // handed to the conversion / iteration routines as-is.
    std::memset(envelope, 0, sizeof(envelope));

    const int isa = apg::best_instruction_set();

    // Pick the widest kernel the CPU supports. Every path performs the
    // same two shrinking steps: 16 -> 12, then 12 -> 8 (the second step
    // starts two rows further into each buffer).
    if (isa >= 10) {
        apg::z64_to_r32_centre_avx(inleaf, rows);
        iterate_avx2_16_12(rows, work, envelope, 0, 0, false);
        iterate_avx2_12_8(rows + 2, work + 2, envelope + 2, 0, 0, false);
    } else if (isa >= 9) {
        apg::z64_to_r32_centre_avx(inleaf, rows);
        iterate_avx_16_12(rows, work, envelope, 0, 0, false);
        iterate_avx_12_8(rows + 2, work + 2, envelope + 2, 0, 0, false);
    } else {
        apg::z64_to_r32_centre_ssse3(inleaf, rows);
        iterate_sse2_16_12(rows, work, envelope, 0, 0, false);
        iterate_sse2_12_8(rows + 2, work + 2, envelope + 2, 0, 0, false);
    }

    // Pack the central 8-by-8 region of each buffer into one word apiece.
    centres[0] = r32_centre_to_u64(rows);
    centres[1] = r32_centre_to_u64(envelope);
}
bool iterate_var_leaf(int n, uint64_t * inleaves, uint64_t * outleaf) {
bool nochange = false;
int bis = apg::best_instruction_set();
......
......@@ -43,5 +43,23 @@ namespace apg {
0x00ffff00u,
1, 2, 3, 4, 5, 6, 7, 0};
// 64-byte-aligned constant table of sixteen 32-bit words.
// The first eight words are all 0x003ffc00, i.e. a mask with bits 10..21
// set: the central 12 bits of a 32-bit row.
// NOTE(review): the trailing eight words (1, 2, ..., 7, 0) look like a
// lane-rotation index vector for the generated assembly -- confirm against
// the code that loads this table.
const static uint32_t __sixteen12[] __attribute__((aligned(64))) = {0x003ffc00u,
0x003ffc00u,
0x003ffc00u,
0x003ffc00u,
0x003ffc00u,
0x003ffc00u,
0x003ffc00u,
0x003ffc00u,
1, 2, 3, 4, 5, 6, 7, 0};
// 64-byte-aligned constant table of sixteen 32-bit words.
// The first eight words are all 0x000ff000, i.e. a mask with bits 12..19
// set: the central 8 bits of a 32-bit row.
// NOTE(review): the trailing eight words (1, 2, ..., 7, 0) look like a
// lane-rotation index vector for the generated assembly -- confirm against
// the code that loads this table.
const static uint32_t __sixteen8[] __attribute__((aligned(64))) = {0x000ff000u,
0x000ff000u,
0x000ff000u,
0x000ff000u,
0x000ff000u,
0x000ff000u,
0x000ff000u,
0x000ff000u,
1, 2, 3, 4, 5, 6, 7, 0};
}
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -290,6 +290,60 @@ namespace apg {
}
void z64_to_r32_centre_ssse3(uint64_t* c, uint32_t* b) {
    /*
    * Expands four 64-bit words [a, b, c, d] (32 bytes read from c) into
    * sixteen 32-bit rows written to b, with the data placed in the two
    * middle bytes of every row:
    *
    *     #ab#
    *     #cd# <--- [a, b, c, d]
    *
    * Bytes 1..30 and 33..62 of the output are written; bytes 0, 31, 32
    * and 63 are left untouched. The outer byte of each row ('#') may
    * receive left-over data from the overlapping stores.
    *
    * Each shuffled register is stored twice. The original code stored
    * xmm1 / xmm3 at offsets 15 and 47, but those registers are never
    * written inside this asm block, so rows 4..7 and 12..15 received
    * indeterminate data.
    *
    * NOTE(review): this assumes the __lifeperm shuffle leaves rows 0..3
    * in the low 16 bits and rows 4..7 in the high 16 bits of each dword
    * (which is what the "<< 8 and >> 8" trick below requires) -- confirm
    * against the table definition.
    */
    asm (
    // Load from memory:
    "movups (%0), %%xmm0 \n\t"
    "movups 16(%0), %%xmm2 \n\t"
    // Permute bytes:
    "pshufb (%2), %%xmm0 \n\t"
    "pshufb (%2), %%xmm2 \n\t"
    // Dirty hack to perform << 8 and >> 8 during movups:
    // a store at +1 shifts the low halves up into the middle bytes,
    // a store at +15 (= 16 - 1) shifts the high halves down into them.
    "movups %%xmm0, 1(%1) \n\t"
    "movups %%xmm0, 15(%1) \n\t"
    "movups %%xmm2, 33(%1) \n\t"
    "movups %%xmm2, 47(%1) \n\t"
    : /* no output operands -- implicitly volatile */
    : "r" (c), "r" (b), "r" (__lifeperm)
    : "xmm0", "xmm2", "memory" );
}
void z64_to_r32_centre_avx(uint64_t* c, uint32_t* b) {
    /*
    * VEX-encoded variant of the SSSE3 routine: expands four 64-bit
    * words [a, b, c, d] (32 bytes read from c) into sixteen 32-bit rows
    * written to b, with the data placed in the two middle bytes of
    * every row:
    *
    *     #ab#
    *     #cd# <--- [a, b, c, d]
    *
    * Bytes 1..30 and 33..62 of the output are written; bytes 0, 31, 32
    * and 63 are left untouched. The outer byte of each row ('#') may
    * receive left-over data from the overlapping stores.
    *
    * Each shuffled register is stored twice. The original code stored
    * xmm1 / xmm3 at offsets 15 and 47, but those registers are never
    * written inside this asm block, so rows 4..7 and 12..15 received
    * indeterminate data.
    *
    * NOTE(review): this assumes the __lifeperm shuffle leaves rows 0..3
    * in the low 16 bits and rows 4..7 in the high 16 bits of each dword
    * (which is what the "<< 8 and >> 8" trick below requires) -- confirm
    * against the table definition.
    */
    asm (
    // Load from memory:
    "vmovups (%0), %%xmm0 \n\t"
    "vmovups 16(%0), %%xmm2 \n\t"
    // Permute bytes:
    "vpshufb (%2), %%xmm0, %%xmm0 \n\t"
    "vpshufb (%2), %%xmm2, %%xmm2 \n\t"
    // Dirty hack to perform << 8 and >> 8 during movups:
    // a store at +1 shifts the low halves up into the middle bytes,
    // a store at +15 (= 16 - 1) shifts the high halves down into them.
    "vmovups %%xmm0, 1(%1) \n\t"
    "vmovups %%xmm0, 15(%1) \n\t"
    "vmovups %%xmm2, 33(%1) \n\t"
    "vmovups %%xmm2, 47(%1) \n\t"
    : /* no output operands -- implicitly volatile */
    : "r" (c), "r" (b), "r" (__lifeperm)
    : "xmm0", "xmm2", "memory" );
}
void r32_centre_to_z64_ssse3(uint32_t* b, uint64_t* c) {
/*
* Selects the 16-by-16 square in the centre of a 32-by-32
......
......@@ -691,6 +691,8 @@ def makeasm(rulestring):
iw.write_function(rulestring, 28, 24)
iw.write_function(rulestring, 24, 20)
iw.write_function(rulestring, 20, 16)
iw.write_function(rulestring, 16, 12)
iw.write_function(rulestring, 12, 8)
iw.write_iterator()
if (rulestring[0] == 'g'):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment