Commit ae802774 authored by Adam P. Goucher

Initial commit

#pragma once
#include <stdint.h>
#include <cstring>
#include <cpuid.h>
/*
* This calls the CPUID instruction to determine the capabilities of the
* underlying architecture.
*/
namespace apg {
char __cpu_name[] = "I don't know";
uint32_t __best_instruction_set = 0;
// Linear progression of increasingly good instruction sets:
uint32_t __have_mmx = 0; // 2
uint32_t __have_sse = 0; // 3
uint32_t __have_sse2 = 0; // 4
uint32_t __have_sse3 = 0; // 5
uint32_t __have_ssse3 = 0; // 6
uint32_t __have_sse4_1 = 0; // 7
uint32_t __have_sse4_2 = 0; // 8
uint32_t __have_avx = 0; // 9
uint32_t __have_avx2 = 0; // 10
uint32_t __have_avx512 = 0; // 11
// AVX-512 subsets:
uint32_t __have_avx512f = 0;
uint32_t __have_avx512dq = 0;
uint32_t __have_avx512ifma = 0;
uint32_t __have_avx512pf = 0;
uint32_t __have_avx512er = 0;
uint32_t __have_avx512cd = 0;
uint32_t __have_avx512bw = 0;
uint32_t __have_avx512vl = 0;
// Miscellaneous bonus instructions:
uint32_t __have_aes = 0;
uint32_t __have_sha = 0;
uint32_t __have_popcnt = 0;
uint32_t __have_bmi1 = 0;
uint32_t __have_bmi2 = 0;
uint32_t __have_fma = 0;
uint32_t __have_fma3 = 0;
/*
* Apple Bottom's vector instruction set detector (modified)
*/
int best_instruction_set() {
    if (__best_instruction_set == 0) {
        uint32_t eax, ebx, ecx, edx;
        __cpuid(0, eax, ebx, ecx, edx);
        uint32_t max_level = eax;
        std::memcpy(__cpu_name, &ebx, 4);
        std::memcpy(__cpu_name + 4, &edx, 4);
        std::memcpy(__cpu_name + 8, &ecx, 4);
        __cpuid(1, eax, ebx, ecx, edx);
        __have_mmx    = ((edx >> 23) & 1);
        __have_sse    = ((edx >> 25) & 1);
        __have_sse2   = ((edx >> 26) & 1);
        __have_sse3   = (ecx & 1);
        __have_ssse3  = ((ecx >> 9) & 1);
        __have_fma    = ((ecx >> 12) & 1); // FMA and FMA3 share CPUID leaf 1, ECX bit 12
        __have_fma3   = ((ecx >> 12) & 1);
        __have_sse4_1 = ((ecx >> 19) & 1);
        __have_sse4_2 = ((ecx >> 20) & 1);
        __have_popcnt = ((ecx >> 23) & 1);
        __have_aes    = ((ecx >> 25) & 1);
        __have_avx    = ((ecx >> 28) & 1); // note: CPUID bit only; OS support for the AVX state is not checked via XGETBV
        if (max_level >= 7) {
            __cpuid_count(7, 0, eax, ebx, ecx, edx);
            __have_bmi1       = ((ebx >> 3) & 1);
            __have_avx2       = ((ebx >> 5) & 1);
            __have_bmi2       = ((ebx >> 8) & 1);
            __have_avx512     = ((ebx >> 16) & 1);
            __have_avx512f    = ((ebx >> 16) & 1);
            __have_avx512dq   = ((ebx >> 17) & 1);
            __have_avx512ifma = ((ebx >> 21) & 1);
            __have_avx512pf   = ((ebx >> 26) & 1);
            __have_avx512er   = ((ebx >> 27) & 1);
            __have_avx512cd   = ((ebx >> 28) & 1);
            __have_sha        = ((ebx >> 29) & 1);
            __have_avx512bw   = ((ebx >> 30) & 1);
            __have_avx512vl   = ((ebx >> 31) & 1);
        }
        if (__have_avx512)      { __best_instruction_set = 11; }
        else if (__have_avx2)   { __best_instruction_set = 10; }
        else if (__have_avx)    { __best_instruction_set = 9; }
        else if (__have_sse4_2) { __best_instruction_set = 8; }
        else if (__have_sse4_1) { __best_instruction_set = 7; }
        else if (__have_ssse3)  { __best_instruction_set = 6; }
        else if (__have_sse3)   { __best_instruction_set = 5; }
        else if (__have_sse2)   { __best_instruction_set = 4; }
        else if (__have_sse)    { __best_instruction_set = 3; }
        else if (__have_mmx)    { __best_instruction_set = 2; }
        else                    { __best_instruction_set = 1; }
    }
    return __best_instruction_set;
}
char* cpu_name() {
    best_instruction_set();
    return __cpu_name;
}
}
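/*
 * Usage sketch (added for illustration; not part of the original header):
 * map the detected level, using the numbering in the comments above, to a
 * human-readable label.  A real caller would branch to the corresponding
 * SIMD kernels instead of returning a string.
 */
inline const char* simd_level_name() {
    int level = apg::best_instruction_set();
    if (level >= 11) { return "AVX-512"; }
    if (level >= 10) { return "AVX2"; }
    if (level >= 9)  { return "AVX"; }
    if (level >= 4)  { return "SSE2 or later"; }
    return "pre-SSE2 fallback";
}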
#pragma once
#include <stdint.h>
namespace apg {
// Each table holds eight copies of a mask retaining the central
// 28/24/20/16 bits of a 32-bit row, followed by the dword permutation
// {1, 2, 3, 4, 5, 6, 7, 0} (a single-lane rotation when used as a
// VPERMD control), aligned to a 64-byte boundary.
const static uint32_t __sixteen28[] __attribute__((aligned(64))) = {
    0x3ffffffcu, 0x3ffffffcu, 0x3ffffffcu, 0x3ffffffcu,
    0x3ffffffcu, 0x3ffffffcu, 0x3ffffffcu, 0x3ffffffcu,
    1, 2, 3, 4, 5, 6, 7, 0};
const static uint32_t __sixteen24[] __attribute__((aligned(64))) = {
    0x0ffffff0u, 0x0ffffff0u, 0x0ffffff0u, 0x0ffffff0u,
    0x0ffffff0u, 0x0ffffff0u, 0x0ffffff0u, 0x0ffffff0u,
    1, 2, 3, 4, 5, 6, 7, 0};
const static uint32_t __sixteen20[] __attribute__((aligned(64))) = {
    0x03ffffc0u, 0x03ffffc0u, 0x03ffffc0u, 0x03ffffc0u,
    0x03ffffc0u, 0x03ffffc0u, 0x03ffffc0u, 0x03ffffc0u,
    1, 2, 3, 4, 5, 6, 7, 0};
const static uint32_t __sixteen16[] __attribute__((aligned(64))) = {
    0x00ffff00u, 0x00ffff00u, 0x00ffff00u, 0x00ffff00u,
    0x00ffff00u, 0x00ffff00u, 0x00ffff00u, 0x00ffff00u,
    1, 2, 3, 4, 5, 6, 7, 0};
}
#pragma once
#include <string>
#include <stdint.h>
#include "iterators_b3s23.h"
namespace apg {
int rule2int(std::string rule) {
    if (rule == "b3s23") { return 0; }
    return -1;
}

void iterate_var_leaf(int rule, int n, uint64_t * inleaves, uint64_t * outleaf) {
    switch (rule) {
        case 0:
            b3s23::iterate_var_leaf(n, inleaves, outleaf);
            break;
    }
}

void iterate_var_leaf(int rule, int n, uint64_t * inleaves, uint64_t * hleaves, uint64_t * outleaf) {
    switch (rule) {
        case 0:
            b3s23::iterate_var_leaf(n, inleaves, hleaves, outleaf);
            break;
    }
}

void iterate_var_leaf(int rule, int n, uint64_t * inleaves, uint64_t * hleaves, uint64_t * jleaves, uint64_t * outleaf) {
    switch (rule) {
        case 0:
            b3s23::iterate_var_leaf(n, inleaves, hleaves, jleaves, outleaf);
            break;
    }
}
}
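/*
 * Caller-side sketch (added for illustration; not in the original source):
 * resolve the rule string once, then route every call through the integer
 * id.  The leaf buffers are treated as opaque here; their layout is
 * whatever the iterator headers expect.
 */
inline void iterate_named_rule(std::string rulestring, int n,
                               uint64_t* inleaves, uint64_t* outleaf) {
    int rule_id = apg::rule2int(rulestring);
    if (rule_id >= 0) {
        apg::iterate_var_leaf(rule_id, n, inleaves, outleaf);
    }
}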
"vpand %%ymm8, %%ymm11, %%ymm1 \n\t"
"vpxor %%ymm11, %%ymm8, %%ymm8 \n\t"
"vpxor %%ymm1, %%ymm9, %%ymm9 \n\t"
"vpxor %%ymm9, %%ymm8, %%ymm8 \n\t"
"vpor %%ymm10, %%ymm12, %%ymm12 \n\t"
"vpxor %%ymm9, %%ymm10, %%ymm10 \n\t"
"vpand %%ymm12, %%ymm8, %%ymm8 \n\t"
"vpand %%ymm8, %%ymm10, %%ymm10 \n\t"
"vpand %%xmm8, %%xmm11, %%xmm1 \n\t"
"vpxor %%xmm11, %%xmm8, %%xmm8 \n\t"
"vpxor %%xmm1, %%xmm9, %%xmm9 \n\t"
"vpxor %%xmm9, %%xmm8, %%xmm8 \n\t"
"vpor %%xmm10, %%xmm12, %%xmm12 \n\t"
"vpxor %%xmm9, %%xmm10, %%xmm10 \n\t"
"vpand %%xmm12, %%xmm8, %%xmm8 \n\t"
"vpand %%xmm8, %%xmm10, %%xmm10 \n\t"
"movdqa %%xmm11, %%xmm1 \n\t"
"pand %%xmm8, %%xmm1 \n\t"
"pxor %%xmm11, %%xmm8 \n\t"
"pxor %%xmm1, %%xmm9 \n\t"
"pxor %%xmm9, %%xmm8 \n\t"
"por %%xmm10, %%xmm12 \n\t"
"pxor %%xmm9, %%xmm10 \n\t"
"pand %%xmm12, %%xmm8 \n\t"
"pand %%xmm8, %%xmm10 \n\t"
#pragma once
#include <stdint.h>
namespace apg {
// The first 32 bytes are arguments to (V)PSHUFB; the remaining
// 32 bytes are arguments to VPERMD. We align on a 64-byte boundary
// both to satisfy the 16-byte alignment that the SSE memory operands
// require and to keep the whole table within a single cache line.
const static uint8_t __lifeperm[] __attribute__((aligned(64))) = {0,
4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15,
0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15,
0, 0, 0, 0, 4, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0,
1, 0, 0, 0, 5, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0};
void z64_to_r32_sse2(uint64_t* a, uint32_t* b) {
/*
* Converts a Z-ordered array of 16 uint64s, each of which encodes
* an 8-by-8 subsquare of a 32-by-32 square, into an array of 32
* uint32s, each of which represents a row.
*/
asm (
// Load bytes 0 -- 63 into registers:
"movups (%0), %%xmm0 \n\t"
"movups 16(%0), %%xmm3 \n\t"
"movups 32(%0), %%xmm1 \n\t"
"movups 48(%0), %%xmm4 \n\t"
// Bit cycle, round I:
"movdqa %%xmm0, %%xmm2 \n\t"
"movdqa %%xmm3, %%xmm5 \n\t"
"punpcklbw %%xmm1, %%xmm0 \n\t"
"punpcklbw %%xmm4, %%xmm3 \n\t"
"punpckhbw %%xmm1, %%xmm2 \n\t"
"punpckhbw %%xmm4, %%xmm5 \n\t"
// Bit cycle, round II:
"movdqa %%xmm0, %%xmm1 \n\t"
"movdqa %%xmm3, %%xmm4 \n\t"
"punpcklbw %%xmm2, %%xmm0 \n\t"
"punpcklbw %%xmm5, %%xmm3 \n\t"
"punpckhbw %%xmm2, %%xmm1 \n\t"
"punpckhbw %%xmm5, %%xmm4 \n\t"
// Save bytes 0 -- 63 back into memory:
"movups %%xmm0, 0(%1) \n\t"
"movups %%xmm1, 16(%1) \n\t"
"movups %%xmm3, 32(%1) \n\t"
"movups %%xmm4, 48(%1) \n\t"
// Load bytes 64 -- 127 into registers:
"movups 64(%0), %%xmm0 \n\t"
"movups 80(%0), %%xmm3 \n\t"
"movups 96(%0), %%xmm1 \n\t"
"movups 112(%0), %%xmm4 \n\t"
// Bit cycle, round I:
"movdqa %%xmm0, %%xmm2 \n\t"
"movdqa %%xmm3, %%xmm5 \n\t"
"punpcklbw %%xmm1, %%xmm0 \n\t"
"punpcklbw %%xmm4, %%xmm3 \n\t"
"punpckhbw %%xmm1, %%xmm2 \n\t"
"punpckhbw %%xmm4, %%xmm5 \n\t"
// Bit cycle, round II:
"movdqa %%xmm0, %%xmm1 \n\t"
"movdqa %%xmm3, %%xmm4 \n\t"
"punpcklbw %%xmm2, %%xmm0 \n\t"
"punpcklbw %%xmm5, %%xmm3 \n\t"
"punpckhbw %%xmm2, %%xmm1 \n\t"
"punpckhbw %%xmm5, %%xmm4 \n\t"
// Save bytes 64 -- 127 back into memory:
"movups %%xmm0, 64(%1) \n\t"
"movups %%xmm1, 80(%1) \n\t"
"movups %%xmm3, 96(%1) \n\t"
"movups %%xmm4, 112(%1) \n\t"
: /* no output operands -- implicitly volatile */
: "r" (a), "r" (b)
: "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory" );
}
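/*
 * Scalar reference for the routine above (added for clarity; not in the
 * original source and not used by it).  It assumes byte k of each input
 * uint64_t is row k of its 8-by-8 tile, that the sixteen tiles follow the
 * Morton/Z-order implied by the unpacking above (tile-column bits in the
 * even positions of the tile index), and that the low byte of each output
 * row is the first tile column.
 */
void z64_to_r32_scalar(uint64_t* a, uint32_t* b) {
    for (int row = 0; row < 32; row++) {
        int tr = row >> 3;                     // tile row (0..3)
        uint32_t out = 0;
        for (int tc = 0; tc < 4; tc++) {       // tile column (0..3)
            int idx = (tc & 1) | ((tr & 1) << 1) | ((tc & 2) << 1) | ((tr & 2) << 2);
            uint8_t tile_row = (uint8_t) (a[idx] >> (8 * (row & 7)));
            out |= ((uint32_t) tile_row) << (8 * tc);
        }
        b[row] = out;
    }
}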
void z64_to_r32_avx(uint64_t* a, uint32_t* b) {
asm (
// Load bytes 0 -- 63 into registers:
"vmovups (%0), %%xmm0 \n\t"
"vmovups 16(%0), %%xmm3 \n\t"
"vmovups 32(%0), %%xmm1 \n\t"
"vmovups 48(%0), %%xmm4 \n\t"
// Bit cycle, round I:
"vpunpckhbw %%xmm1, %%xmm0, %%xmm2 \n\t"
"vpunpckhbw %%xmm4, %%xmm3, %%xmm5 \n\t"
"vpunpcklbw %%xmm1, %%xmm0, %%xmm0 \n\t"
"vpunpcklbw %%xmm4, %%xmm3, %%xmm3 \n\t"
// Bit cycle, round II:
"vpunpckhbw %%xmm2, %%xmm0, %%xmm1 \n\t"
"vpunpckhbw %%xmm5, %%xmm3, %%xmm4 \n\t"
"vpunpcklbw %%xmm2, %%xmm0, %%xmm0 \n\t"
"vpunpcklbw %%xmm5, %%xmm3, %%xmm3 \n\t"
// Save bytes 0 -- 63 back into memory:
"vmovups %%xmm0, 0(%1) \n\t"
"vmovups %%xmm1, 16(%1) \n\t"
"vmovups %%xmm3, 32(%1) \n\t"
"vmovups %%xmm4, 48(%1) \n\t"
// Load bytes 64 -- 127 into registers:
"vmovups 64(%0), %%xmm0 \n\t"
"vmovups 80(%0), %%xmm3 \n\t"
"vmovups 96(%0), %%xmm1 \n\t"
"vmovups 112(%0), %%xmm4 \n\t"
// Bit cycle, round I:
"vpunpckhbw %%xmm1, %%xmm0, %%xmm2 \n\t"
"vpunpckhbw %%xmm4, %%xmm3, %%xmm5 \n\t"
"vpunpcklbw %%xmm1, %%xmm0, %%xmm0 \n\t"
"vpunpcklbw %%xmm4, %%xmm3, %%xmm3 \n\t"
// Bit cycle, round II:
"vpunpckhbw %%xmm2, %%xmm0, %%xmm1 \n\t"
"vpunpckhbw %%xmm5, %%xmm3, %%xmm4 \n\t"
"vpunpcklbw %%xmm2, %%xmm0, %%xmm0 \n\t"
"vpunpcklbw %%xmm5, %%xmm3, %%xmm3 \n\t"
// Save bytes 64 -- 127 back into memory:
"vmovups %%xmm0, 64(%1) \n\t"
"vmovups %%xmm1, 80(%1) \n\t"
"vmovups %%xmm3, 96(%1) \n\t"
"vmovups %%xmm4, 112(%1) \n\t"
: /* no output operands -- implicitly volatile */
: "r" (a), "r" (b)
: "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory" );
}
void z64_to_r32_avx2(uint64_t* a, uint32_t* b) {
asm (
// Load memory into registers:
"vmovups (%0), %%ymm0 \n\t"
"vmovups 32(%0), %%ymm1 \n\t"
"vmovups 64(%0), %%ymm2 \n\t"
"vmovups 96(%0), %%ymm3 \n\t"
// Bit cycle, round I:
"vpunpcklbw %%ymm1, %%ymm0, %%ymm4 \n\t"
"vpunpckhbw %%ymm1, %%ymm0, %%ymm5 \n\t"
"vpunpcklbw %%ymm3, %%ymm2, %%ymm6 \n\t"
"vpunpckhbw %%ymm3, %%ymm2, %%ymm7 \n\t"
// Exchange between low and high halves of ymm registers:
"vpermq $216, %%ymm4, %%ymm0 \n\t"
"vpermq $216, %%ymm5, %%ymm1 \n\t"
"vpermq $216, %%ymm6, %%ymm2 \n\t"
"vpermq $216, %%ymm7, %%ymm3 \n\t"
// Bit cycle, round II:
"vpunpcklbw %%ymm1, %%ymm0, %%ymm4 \n\t"
"vpunpckhbw %%ymm1, %%ymm0, %%ymm5 \n\t"
"vpunpcklbw %%ymm3, %%ymm2, %%ymm6 \n\t"
"vpunpckhbw %%ymm3, %%ymm2, %%ymm7 \n\t"
// Save registers back into memory:
"vmovups %%ymm4, 0(%1) \n\t"
"vmovups %%ymm5, 32(%1) \n\t"
"vmovups %%ymm6, 64(%1) \n\t"
"vmovups %%ymm7, 96(%1) \n\t"
: /* no output operands -- implicitly volatile */
: "r" (a), "r" (b)
: "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory" );
}
void r32_centre_to_z64_ssse3(uint32_t* b, uint64_t* c) {
/*
* Selects the 16-by-16 square in the centre of a 32-by-32
* square encoded as an array of rows, and converts it to a
* Z-ordered array of 4 uint64s, each representing an 8-by-8
* subsquare:
*
* ####
* #ab#
* #cd# ---> [a, b, c, d]
* ####
*/
asm (
// Dirty hack to perform << 8 and >> 8 during movups:
"movups 31(%1), %%xmm0 \n\t"
"movups 49(%1), %%xmm1 \n\t"
"movups 63(%1), %%xmm2 \n\t"
"movups 81(%1), %%xmm3 \n\t"
"psrld $16, %%xmm0 \n\t"
"pslld $16, %%xmm1 \n\t"
"psrld $16, %%xmm2 \n\t"
"pslld $16, %%xmm3 \n\t"
// Alternately select words from two registers:
"por %%xmm1, %%xmm0 \n\t"
"por %%xmm3, %%xmm2 \n\t"
// Permute bytes:
"pshufb (%2), %%xmm0 \n\t"
"pshufb (%2), %%xmm2 \n\t"
// Save back into memory:
"movups %%xmm0, (%0) \n\t"
"movups %%xmm2, 16(%0) \n\t"
: /* no output operands -- implicitly volatile */
: "r" (c), "r" (b), "r" (__lifeperm)
: "xmm0", "xmm1", "xmm2", "xmm3", "memory" );
}
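/*
 * Scalar reference for the centre extraction above (added for clarity;
 * not in the original source and not used by it).  It assumes the output
 * order c[0..3] is {bits 8..15 of rows 8..15, bits 16..23 of rows 8..15,
 * bits 8..15 of rows 16..23, bits 16..23 of rows 16..23}, which is what
 * the word-blend and byte-shuffle above appear to produce.
 */
void r32_centre_to_z64_scalar(uint32_t* b, uint64_t* c) {
    for (int q = 0; q < 4; q++) {
        int row0  = 8 + 8 * (q >> 1);          // top pair of tiles, then bottom pair
        int shift = 8 + 8 * (q & 1);           // low or high byte of the centre 16 bits
        uint64_t tile = 0;
        for (int r = 0; r < 8; r++) {
            uint8_t tile_row = (uint8_t) (b[row0 + r] >> shift);
            tile |= ((uint64_t) tile_row) << (8 * r);
        }
        c[q] = tile;
    }
}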
void r32_centre_to_z64_sse4(uint32_t* b, uint64_t* c) {
asm (
// Dirty hack to perform << 8 and >> 8 during movups:
"movups 33(%1), %%xmm0 \n\t"
"movups 47(%1), %%xmm1 \n\t"
"movups 65(%1), %%xmm2 \n\t"
"movups 79(%1), %%xmm3 \n\t"
// Alternately select words from two registers:
"pblendw $170, %%xmm1, %%xmm0 \n\t"
"pblendw $170, %%xmm3, %%xmm2 \n\t"
// Permute bytes:
"pshufb (%2), %%xmm0 \n\t"
"pshufb (%2), %%xmm2 \n\t"
// Save back into memory:
"movups %%xmm0, (%0) \n\t"
"movups %%xmm2, 16(%0) \n\t"
: /* no output operands -- implicitly volatile */
: "r" (c), "r" (b), "r" (__lifeperm)
: "xmm0", "xmm1", "xmm2", "xmm3", "memory" );
}
void r32_centre_to_z64_avx(uint32_t* b, uint64_t* c) {
asm (
// Dirty hack to perform << 8 and >> 8 during movups:
"vmovups 33(%1), %%xmm0 \n\t"
"vmovups 47(%1), %%xmm1 \n\t"
"vmovups 65(%1), %%xmm2 \n\t"
"vmovups 79(%1), %%xmm3 \n\t"
// Alternately select words from two registers:
"vpblendw $170, %%xmm1, %%xmm0, %%xmm0 \n\t"
"vpblendw $170, %%xmm3, %%xmm2, %%xmm2 \n\t"
// Permute bytes:
"vpshufb (%2), %%xmm0, %%xmm0 \n\t"
"vpshufb (%2), %%xmm2, %%xmm2 \n\t"
// Save back into memory:
"vmovups %%xmm0, (%0) \n\t"
"vmovups %%xmm2, 16(%0) \n\t"
: /* no output operands -- implicitly volatile */
: "r" (c), "r" (b), "r" (__lifeperm)
: "xmm0", "xmm1", "xmm2", "xmm3", "memory" );
}
void r32_centre_to_z64_avx2(uint32_t* b, uint64_t* c) {
asm (
// Only 7 instructions -- and 4 are memory operations:
"vmovups 33(%1), %%ymm0 \n\t"
"vmovups 63(%1), %%ymm1 \n\t"
"vmovups 32(%2), %%ymm2 \n\t"
"vpblendw $170, %%ymm1, %%ymm0, %%ymm0 \n\t"
"vpshufb (%2), %%ymm0, %%ymm0 \n\t"
"vpermd %%ymm0, %%ymm2, %%ymm0 \n\t"
"vmovups %%ymm0, (%0) \n\t"
: /* no output operands -- implicitly volatile */
: "r" (c), "r" (b), "r" (__lifeperm)
: "xmm0", "xmm1", "xmm2", "memory" );
}
}
#include "iterators_b3s23.h"
#include "iterators_b36s23.h"
#include <iostream>
int main() {
    uint32_t d[32];
    uint32_t h[32];
    for (int i = 0; i < 32; i++) {
        d[i] = 0; h[i] = 0;
    }
    d[15] = 31 << 14;
    d[16] = 17 << 14;
    std::cout << "--------" << std::endl;
    // clock_t start_time = clock();
    for (int k = 0; k < 10; k++) {
        for (int i = 8; i < 24; i++) {
            uint32_t x = d[i];
            int j = 32;
            while (j --> 0) {
                if (1 & (x >> j)) {
                    std::cout << "*";
                } else {
                    std::cout << ".";
                }
            }
            std::cout << std::endl;
        }
        b3s23::iterate_var_sse2(8, d);
        // std::cout << "Value: " << apg::iter8_sse2(d, h) << std::endl;
    }
    for (int i = 0; i < 32; i++) {
        d[i] = 0; h[i] = 0;
    }
    d[14] = 14 << 14;
    d[15] = 1 << 14;
    d[16] = 1 << 14;
    d[17] = 1 << 14;
    std::cout << "--------" << std::endl;
    // clock_t start_time = clock();
    for (int k = 0; k < 10; k++) {
        for (int i = 8; i < 24; i++) {
            uint32_t x = d[i];
            int j = 32;
            while (j --> 0) {
                if (1 & (x >> j)) {
                    std::cout << "*";
                } else {
                    std::cout << ".";
                }
            }
            std::cout << std::endl;
        }
        b36s23::iterate_var_sse2(8, d);
        // std::cout << "Value: " << apg::iter8_sse2(d, h) << std::endl;
    }
    // clock_t end_time = clock();
    // std::cout << "iter8 in " << ((double) (end_time - start_time) / CLOCKS_PER_SEC) << " s." << std::endl;
    return 0;
}
#include <iostream>
#include <stdint.h>
#include "lifeperm.h"
#include "eors.h"
int main() {
    // std::cout << "Best instruction set: " << apg::best_instruction_set() << std::endl;
    std::cout << "CPU name: " << apg::cpu_name() << std::endl;
    uint64_t a[16];
    for (int i = 0; i < 16; i++) {
        a[i] = 0x0807060504030201ull + (0x0808080808080808ull * i);
    }
    uint32_t b1[32];
    uint32_t b2[32];
    uint32_t b3[32];
    uint64_t c1[4];
    uint64_t c2[4];
    uint64_t c3[4];
    apg::z64_to_r32_sse2(a, b1);
    apg::z64_to_r32_sse2(a, b2);
    // apg::z64_to_r32_avx2(a, b3);
    for (int i = 0; i < 32; i++) {
        uint32_t x = b1[i];
        uint32_t y = b2[i];
        // uint32_t z = b3[i];
        for (int j = 0; j < 4; j++) {
            std::cout << (x & 0xff) << ' '; x = x >> 8;
            std::cout << (y & 0xff) << ' '; y = y >> 8;
            // std::cout << (z & 0xff) << ' '; z = z >> 8;
        }
        std::cout << std::endl;
    }
    apg::r32_centre_to_z64_sse4(b1, c1);
    apg::r32_centre_to_z64_ssse3(b2, c2);
    // apg::r32_centre_to_z64_avx2(b3, c3);
    for (int i = 0; i < 4; i++) {
        uint64_t x = c1[i];
        uint64_t y = c2[i];
        // uint64_t z = c3[i];
        for (int j = 0; j < 8; j++) {
            std::cout << (x & 0xff) << ' '; x = x >> 8;
            std::cout << (y & 0xff) << ' '; y = y >> 8;
            // std::cout << (z & 0xff) << ' '; z = z >> 8;
        }
        std::cout << std::endl;
    }
    return 0;
}
#pragma once
#include <stdint.h>
namespace apg {
// Computes the upward closure ("up-set") of x in the subset lattice of
// bit indices: bit i of the result is set whenever some set bit j of x
// satisfies (j & i) == j.
uint64_t upset(uint64_t x) {
    uint64_t y = x;
    y |= ((y & 0x5555555555555555) << 1);
    y |= ((y & 0x3333333333333333) << 2);
    y |= ((y & 0x0f0f0f0f0f0f0f0f) << 4);
    y |= ((y & 0x00ff00ff00ff00ff) << 8);
    y |= ((y & 0x0000ffff0000ffff) << 16);
    y |= ((y & 0x00000000ffffffff) << 32);
    return y;
}
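/*
 * Worked example (added for illustration; not in the original source):
 * upset(0x4) should set every bit whose index has bit 1 set -- indices
 * 2, 3, 6, 7, 10, 11, ... -- i.e. the bitwise supersets of index 2,
 * giving 0xCCCCCCCCCCCCCCCC.
 */
inline bool upset_example() {
    return upset(0x4ull) == 0xccccccccccccccccull;
}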
void uint64_convolve2(uint64_t a, uint64_t b, uint64_t *out, bool exclusive) {
    // Convolution (works best when b is sparse):
    uint64_t brem = b;
    while (brem) {
        uint64_t c = (brem & (-brem)); // extract a single bit from brem
        brem ^= c; // remove bit
        uint64_t tzc = __builtin_ctzll(c); // determine shifts
        uint64_t xs = tzc & 7;
        uint64_t ys = tzc & 56;
        uint64_t bitmask = (0x0101010101010101ull << xs) - 0x0101010101010101ull;
        uint64_t right = (a >> (8 - xs)) & bitmask;
        uint64_t left = (a << xs) & (~bitmask);
        if (exclusive) {
            out[0] ^= (left << ys);
            out[1] ^= (right << ys);
            if (ys) {
                out[2] ^= (left >> (64 - ys));
                out[3] ^= (right >> (64 - ys));
            }
        } else {
            out[0] |= (left << ys);
            out[1] |= (right << ys);
            if (ys) {
                out[2] |= (left >> (64 - ys));
                out[3] |= (right >> (64 - ys));
            }
        }
    }
}

void uint64_convolve(uint64_t a, uint64_t b, uint64_t *out, bool exclusive) {
    // Convolve two 8-by-8 squares to produce a 16-by-16 result:
    if (__builtin_popcountll(a) > __builtin_popcountll(b)) {
        uint64_convolve2(a, b, out, exclusive);
    } else {
        uint64_convolve2(b, a, out, exclusive);
    }
}
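/*
 * Usage sketch (added for illustration; not in the original source).
 * Reading out[0..3] as the four 8-by-8 quadrants of the 16-by-16 product
 * -- same tile, horizontal overflow, vertical overflow, diagonal overflow
 * -- is an assumption read off the shifts above.  Convolving two
 * single-cell squares should leave exactly one cell in out[0].
 */
inline bool uint64_convolve_example() {
    uint64_t out[4] = {0, 0, 0, 0};
    uint64_convolve(1ull, 1ull, out, false);
    return (out[0] == 1ull) && ((out[1] | out[2] | out[3]) == 0ull);
}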
uint64_t uint64_bottom(uint64_t tile) {
uint64_t dy = 0;
if (tile & 0xff00000000000000ull) {
dy = 7;
} else if (tile & 0x00ff000000000000ull) {
dy = 6;
} else if (tile & 0x0000ff0000000000ull) {
dy = 5;
} else if (tile & 0x000000ff00000000ull) {
dy = 4;
} else if (tile & 0x00000000ff000000ull) {
dy = 3;
} else if (tile & 0x0000000000ff0000ull) {
dy = 2;
} else if (tile & 0x000000000000ff00ull) {
dy = 1;
}