Commit 8e362a45 authored by Adam P. Goucher

Another 1.5% performance improvement on AVX-512

parent 607d29c3
Pipeline #47812205 passed with stages
in 7 minutes and 49 seconds
......@@ -46,7 +46,7 @@
// Advance the 48-row, 28-column universe in d by one call of the fused
// AVX-512 kernel, reporting change masks through diffs (may be NULL).
// Returns nonzero iff the grid did not change.
int iterate_var_48_28(uint32_t* d, uint32_t* diffs) {
    // Scratch buffer for the per-row difference masks produced by the kernel.
    uint32_t e[48];
    // The monolithic kernel performs both half-steps in one asm block,
    // keeping the intermediate generation in vector registers.
    // (The superseded call to iterate_avx512_48_28 left an unreachable
    // second return statement here; it has been removed.)
    return iterate_avx512_48_28_monolith(d, e, diffs);
}
int iterate_var_48_28(uint32_t* d, uint32_t* h, uint32_t* diffs) {
......
......@@ -8428,6 +8428,192 @@ namespace b3s23 {
#ifdef __AVX512F__
// Fused two-half-step AVX-512 iteration of a 48x28 grid.
//
// d     : 48 rows of the grid, read and rewritten in place (stores to
//         8(%0), 72(%0), 136(%0)/168(%0) below).
// e     : 48-word scratch; on return e[0..27] holds difference masks
//         (stores to (%1), 64(%1), 96(%1) below) consumed by the C tail.
// diffs : optional 3-word change summary; may be NULL.
// Returns true iff no cell changed (all collected diff words are zero).
//
// Structure (visible in the asm): the three 16-row chunks at (%0), 64(%0)
// and 128(%0) are each run through one adder/permute pass; the first pass's
// results are parked in zmm24/zmm25/zmm26, then a second pass reads them
// back and commits the final rows to d while accumulating diff masks in
// zmm15. zmm14/zmm16/zmm17 are constant tables loaded from
// apg::__sixteen28 (%2): zmm14 feeds vpternlogd blends, zmm16/zmm17 are
// vpermi2d index tables.
bool iterate_avx512_48_28_monolith(uint32_t * __restrict__ d, uint32_t * __restrict__ e, uint32_t * __restrict__ diffs) {
    asm (
        // Load the constant tables once.
        "vmovdqu64 (%2), %%zmm14 \n\t"
        "vmovdqu64 64(%2), %%zmm16 \n\t"
        "vmovdqu64 128(%2), %%zmm17 \n\t"
        // --- first pass, rows 0-15 ---
        "vmovdqu64 (%0), %%zmm7 \n\t"
        "vpsrld $1, %%zmm7, %%zmm6 \n\t"
        "vpslld $1, %%zmm7, %%zmm1 \n\t"
        "vpxord %%zmm6, %%zmm1, %%zmm9 \n\t"
        "vmovdqa32 %%zmm7, %%zmm12 \n\t"
        "vpternlogd $232, %%zmm6, %%zmm1, %%zmm12 \n\t"
        "vpxord %%zmm7, %%zmm9, %%zmm11 \n\t"
        "vmovdqu64 64(%0), %%zmm0 \n\t"
        "vpsrld $1, %%zmm0, %%zmm6 \n\t"
        "vpslld $1, %%zmm0, %%zmm1 \n\t"
        "vpxord %%zmm6, %%zmm1, %%zmm2 \n\t"
        "vmovdqa32 %%zmm0, %%zmm5 \n\t"
        "vpternlogd $232, %%zmm6, %%zmm1, %%zmm5 \n\t"
        "vpxord %%zmm0, %%zmm2, %%zmm4 \n\t"
        "vmovdqa64 %%zmm16, %%zmm18 \n\t"
        "vmovdqa64 %%zmm17, %%zmm6 \n\t"
        "vmovdqa64 %%zmm17, %%zmm8 \n\t"
        "vmovdqa64 %%zmm16, %%zmm20 \n\t"
        "vmovdqa64 %%zmm16, %%zmm19 \n\t"
        "vpermi2d %%zmm2, %%zmm9, %%zmm18 \n\t"
        "vpermi2d %%zmm4, %%zmm11, %%zmm6 \n\t"
        "vpermi2d %%zmm5, %%zmm12, %%zmm8 \n\t"
        "vpermi2d %%zmm0, %%zmm7, %%zmm20 \n\t"
        "vpermi2d %%zmm5, %%zmm12, %%zmm19 \n\t"
        "vpternlogd $150, %%zmm18, %%zmm11, %%zmm6 \n\t"
        "vpternlogd $178, %%zmm18, %%zmm6, %%zmm11 \n\t"
        "vpternlogd $166, %%zmm11, %%zmm19, %%zmm18 \n\t"
        "vpternlogd $22, %%zmm18, %%zmm12, %%zmm8 \n\t"
        "vpternlogd $162, %%zmm8, %%zmm11, %%zmm18 \n\t"
        "vpternlogd $168, %%zmm18, %%zmm20, %%zmm6 \n\t"
        // Park intermediate result #1 in zmm24.
        "vmovdqa64 %%zmm6, %%zmm24 \n\t"
        // --- first pass, rows 32-47 ---
        "vmovdqu64 128(%0), %%zmm7 \n\t"
        "vpsrld $1, %%zmm7, %%zmm6 \n\t"
        "vpslld $1, %%zmm7, %%zmm1 \n\t"
        "vpxord %%zmm6, %%zmm1, %%zmm9 \n\t"
        "vmovdqa32 %%zmm7, %%zmm12 \n\t"
        "vpternlogd $232, %%zmm6, %%zmm1, %%zmm12 \n\t"
        "vpxord %%zmm7, %%zmm9, %%zmm11 \n\t"
        "vmovdqa64 %%zmm16, %%zmm18 \n\t"
        "vmovdqa64 %%zmm17, %%zmm6 \n\t"
        "vmovdqa64 %%zmm17, %%zmm8 \n\t"
        "vmovdqa64 %%zmm16, %%zmm20 \n\t"
        "vmovdqa64 %%zmm16, %%zmm19 \n\t"
        "vpermi2d %%zmm9, %%zmm2, %%zmm18 \n\t"
        "vpermi2d %%zmm11, %%zmm4, %%zmm6 \n\t"
        "vpermi2d %%zmm12, %%zmm5, %%zmm8 \n\t"
        "vpermi2d %%zmm7, %%zmm0, %%zmm20 \n\t"
        "vpermi2d %%zmm12, %%zmm5, %%zmm19 \n\t"
        "vpternlogd $150, %%zmm18, %%zmm4, %%zmm6 \n\t"
        "vpternlogd $178, %%zmm18, %%zmm6, %%zmm4 \n\t"
        "vpternlogd $166, %%zmm4, %%zmm19, %%zmm18 \n\t"
        "vpternlogd $22, %%zmm18, %%zmm5, %%zmm8 \n\t"
        "vpternlogd $162, %%zmm8, %%zmm4, %%zmm18 \n\t"
        "vpternlogd $168, %%zmm18, %%zmm20, %%zmm6 \n\t"
        // Park intermediate result #2 in zmm25.
        "vmovdqa64 %%zmm6, %%zmm25 \n\t"
        "vmovdqa64 %%zmm16, %%zmm18 \n\t"
        "vmovdqa64 %%zmm17, %%zmm6 \n\t"
        "vmovdqa64 %%zmm17, %%zmm8 \n\t"
        "vmovdqa64 %%zmm16, %%zmm20 \n\t"
        "vmovdqa64 %%zmm16, %%zmm19 \n\t"
        "vpermi2d %%zmm2, %%zmm9, %%zmm18 \n\t"
        "vpermi2d %%zmm4, %%zmm11, %%zmm6 \n\t"
        "vpermi2d %%zmm5, %%zmm12, %%zmm8 \n\t"
        "vpermi2d %%zmm0, %%zmm7, %%zmm20 \n\t"
        "vpermi2d %%zmm5, %%zmm12, %%zmm19 \n\t"
        "vpternlogd $150, %%zmm18, %%zmm11, %%zmm6 \n\t"
        "vpternlogd $178, %%zmm18, %%zmm6, %%zmm11 \n\t"
        "vpternlogd $166, %%zmm11, %%zmm19, %%zmm18 \n\t"
        "vpternlogd $22, %%zmm18, %%zmm12, %%zmm8 \n\t"
        "vpternlogd $162, %%zmm8, %%zmm11, %%zmm18 \n\t"
        "vpternlogd $168, %%zmm18, %%zmm20, %%zmm6 \n\t"
        // Park intermediate result #3 in zmm26.
        "vmovdqa64 %%zmm6, %%zmm26 \n\t"
        // --- second pass: consume zmm24/zmm25/zmm26, commit to d,
        //     accumulate change masks in zmm15 ---
        "vmovdqa64 %%zmm24, %%zmm7 \n\t"
        "vpsrld $1, %%zmm7, %%zmm6 \n\t"
        "vpslld $1, %%zmm7, %%zmm1 \n\t"
        "vpxord %%zmm6, %%zmm1, %%zmm9 \n\t"
        "vmovdqa32 %%zmm7, %%zmm12 \n\t"
        "vpternlogd $232, %%zmm6, %%zmm1, %%zmm12 \n\t"
        "vpxord %%zmm7, %%zmm9, %%zmm11 \n\t"
        "vmovdqa64 %%zmm25, %%zmm0 \n\t"
        "vpsrld $1, %%zmm0, %%zmm6 \n\t"
        "vpslld $1, %%zmm0, %%zmm1 \n\t"
        "vpxord %%zmm6, %%zmm1, %%zmm2 \n\t"
        "vmovdqa32 %%zmm0, %%zmm5 \n\t"
        "vpternlogd $232, %%zmm6, %%zmm1, %%zmm5 \n\t"
        "vpxord %%zmm0, %%zmm2, %%zmm4 \n\t"
        "vmovdqa64 %%zmm16, %%zmm18 \n\t"
        "vmovdqa64 %%zmm17, %%zmm6 \n\t"
        "vmovdqa64 %%zmm17, %%zmm8 \n\t"
        "vmovdqa64 %%zmm16, %%zmm20 \n\t"
        "vmovdqa64 %%zmm16, %%zmm19 \n\t"
        "vpermi2d %%zmm2, %%zmm9, %%zmm18 \n\t"
        "vpermi2d %%zmm4, %%zmm11, %%zmm6 \n\t"
        "vpermi2d %%zmm5, %%zmm12, %%zmm8 \n\t"
        "vpermi2d %%zmm0, %%zmm7, %%zmm20 \n\t"
        "vpermi2d %%zmm5, %%zmm12, %%zmm19 \n\t"
        "vpternlogd $150, %%zmm18, %%zmm11, %%zmm6 \n\t"
        "vpternlogd $178, %%zmm18, %%zmm6, %%zmm11 \n\t"
        "vpternlogd $166, %%zmm11, %%zmm19, %%zmm18 \n\t"
        "vpternlogd $22, %%zmm18, %%zmm12, %%zmm8 \n\t"
        "vpternlogd $162, %%zmm8, %%zmm11, %%zmm18 \n\t"
        "vpternlogd $168, %%zmm18, %%zmm20, %%zmm6 \n\t"
        // Commit updated rows at 8(%0); zmm15 := XOR of old and new.
        "vmovdqu64 8(%0), %%zmm8 \n\t"
        "vpternlogd $228, %%zmm14, %%zmm8, %%zmm6 \n\t"
        "vmovdqu64 %%zmm6, 8(%0) \n\t"
        "vpxord %%zmm6, %%zmm8, %%zmm15 \n\t"
        "vmovdqu64 %%zmm15, (%1) \n\t"
        "vmovdqa64 %%zmm26, %%zmm7 \n\t"
        "vpsrld $1, %%zmm7, %%zmm6 \n\t"
        "vpslld $1, %%zmm7, %%zmm1 \n\t"
        "vpxord %%zmm6, %%zmm1, %%zmm9 \n\t"
        "vmovdqa32 %%zmm7, %%zmm12 \n\t"
        "vpternlogd $232, %%zmm6, %%zmm1, %%zmm12 \n\t"
        "vpxord %%zmm7, %%zmm9, %%zmm11 \n\t"
        "vmovdqa64 %%zmm16, %%zmm18 \n\t"
        "vmovdqa64 %%zmm17, %%zmm6 \n\t"
        "vmovdqa64 %%zmm17, %%zmm8 \n\t"
        "vmovdqa64 %%zmm16, %%zmm20 \n\t"
        "vmovdqa64 %%zmm16, %%zmm19 \n\t"
        "vpermi2d %%zmm9, %%zmm2, %%zmm18 \n\t"
        "vpermi2d %%zmm11, %%zmm4, %%zmm6 \n\t"
        "vpermi2d %%zmm12, %%zmm5, %%zmm8 \n\t"
        "vpermi2d %%zmm7, %%zmm0, %%zmm20 \n\t"
        "vpermi2d %%zmm12, %%zmm5, %%zmm19 \n\t"
        "vpternlogd $150, %%zmm18, %%zmm4, %%zmm6 \n\t"
        "vpternlogd $178, %%zmm18, %%zmm6, %%zmm4 \n\t"
        "vpternlogd $166, %%zmm4, %%zmm19, %%zmm18 \n\t"
        "vpternlogd $22, %%zmm18, %%zmm5, %%zmm8 \n\t"
        "vpternlogd $162, %%zmm8, %%zmm4, %%zmm18 \n\t"
        "vpternlogd $168, %%zmm18, %%zmm20, %%zmm6 \n\t"
        // Commit updated rows at 72(%0); fold their diff into zmm15.
        "vmovdqu64 72(%0), %%zmm8 \n\t"
        "vpternlogd $228, %%zmm14, %%zmm8, %%zmm6 \n\t"
        "vmovdqu64 %%zmm6, 72(%0) \n\t"
        "vpternlogd $246, %%zmm6, %%zmm8, %%zmm15 \n\t"
        "vmovdqa64 %%zmm16, %%zmm18 \n\t"
        "vmovdqa64 %%zmm17, %%zmm6 \n\t"
        "vmovdqa64 %%zmm17, %%zmm8 \n\t"
        "vmovdqa64 %%zmm16, %%zmm20 \n\t"
        "vmovdqa64 %%zmm16, %%zmm19 \n\t"
        "vpermi2d %%zmm2, %%zmm9, %%zmm18 \n\t"
        "vpermi2d %%zmm4, %%zmm11, %%zmm6 \n\t"
        "vpermi2d %%zmm5, %%zmm12, %%zmm8 \n\t"
        "vpermi2d %%zmm0, %%zmm7, %%zmm20 \n\t"
        "vpermi2d %%zmm5, %%zmm12, %%zmm19 \n\t"
        "vpternlogd $150, %%zmm18, %%zmm11, %%zmm6 \n\t"
        "vpternlogd $178, %%zmm18, %%zmm6, %%zmm11 \n\t"
        "vpternlogd $166, %%zmm11, %%zmm19, %%zmm18 \n\t"
        "vpternlogd $22, %%zmm18, %%zmm12, %%zmm8 \n\t"
        "vpternlogd $162, %%zmm8, %%zmm11, %%zmm18 \n\t"
        "vpternlogd $168, %%zmm18, %%zmm20, %%zmm6 \n\t"
        // Tail chunk is only 48 bytes (ymm + xmm loads/stores at
        // 136(%0)/168(%0)); diff masks are compacted and written to
        // 64(%1) and 96(%1).
        "vmovdqu 136(%0), %%ymm8 \n\t"
        "vmovdqu 168(%0), %%xmm13 \n\t"
        "vshufi32x4 $68, %%zmm13, %%zmm8, %%zmm8 \n\t"
        "vpternlogd $228, %%zmm14, %%zmm8, %%zmm6 \n\t"
        "vshufi32x4 $78, %%zmm6, %%zmm6, %%zmm13 \n\t"
        "vmovdqu %%ymm6, 136(%0) \n\t"
        "vmovdqu %%xmm13, 168(%0) \n\t"
        "vpxord %%zmm6, %%zmm8, %%zmm8 \n\t"
        "vshufi32x4 $78, %%zmm15, %%zmm15, %%zmm13 \n\t"
        "vpord %%zmm13, %%zmm15, %%zmm15 \n\t"
        "vshufi32x4 $78, %%zmm8, %%zmm8, %%zmm13 \n\t"
        "vpord %%ymm8, %%ymm15, %%ymm15 \n\t"
        "vmovdqu %%ymm15, 64(%1) \n\t"
        "vmovdqu %%xmm13, 96(%1) \n\t"
        : /* no output operands */
        : "r" (d), "r" (e), "r" (apg::__sixteen28)
        : "ebx", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
          "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10",
          "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "xmm16",
          "xmm17", "xmm18", "xmm19", "xmm20",
          // FIX: zmm24-26 hold the parked intermediates above but were
          // missing from the clobber list, so the compiler was entitled to
          // keep live caller values in them across this asm block.
          "xmm24", "xmm25", "xmm26", "memory");
    // Fold the diff words e[16..27] (viewed as six uint64_t) into one
    // accumulator: zero means the grid reached a fixed point.
    // NOTE(review): the uint32_t* -> uint64_t* cast relies on the usual
    // lax-aliasing compilation of this project — confirm -fno-strict-aliasing
    // or equivalent is in the build flags.
    uint64_t* e64 = ((uint64_t*) e);
    uint64_t bigdiff = e64[8] | e64[9] | e64[10] | e64[11] | e64[12] | e64[13];
    if (diffs != 0) {
        diffs[0] = (bigdiff | (bigdiff >> 32));
        // diffs[1]/diffs[2] summarise the first and last diff rows
        // (presumably the top/bottom boundary masks — verify against callers).
        diffs[1] = e[0] | e[1];
        diffs[2] = e[26] | e[27];
    }
    return (bigdiff == 0);
}
bool iterate_avx512_48_28(uint32_t * __restrict__ d, uint32_t * __restrict__ e, uint32_t * __restrict__ h, uint32_t * __restrict__ j, uint32_t * __restrict__ diffs, bool onegen) {
if (h) {
for (int i = 0; i < 48; i++) {
......
......@@ -25,6 +25,7 @@ def create_rule(rulestring):
if 'avx512' in iset:
f.write('\n\n#ifdef __AVX512F__\n\n')
iw.write_monolith(rulestring, 48, 28)
iw.write_function(rulestring, 48, 28)
iw.write_function(rulestring, 32, 28)
......@@ -43,15 +44,19 @@ def create_rule(rulestring):
class b3s23writer(iwriter_base):
def load_and_hshift(self, i, oddgen, terminal):
def load_and_hshift(self, i, oddgen, terminal, from_reg=False):
d = '(%1)' if (oddgen) else '(%0)'
if ('avx512' in self.iset):
n = [64, 48, 32, 16, 64][int(terminal)]
self.read16n(n, d, 64 * i, 7 - 7 * (i % 2))
regname = '%%zmm'
inreg = regname + str(7 - 7 * (i % 2))
if from_reg:
oldreg = regname + str(24 + i)
self.printinstr('vmovdqa64 %s, %s' % (oldreg, inreg))
else:
self.read16n(n, d, 64 * i, 7 - 7 * (i % 2))
else:
regbytes = 32 if ('avx2' in self.iset) else 16
regname = '%%ymm' if (('avx2' in self.iset) and not terminal) else '%%xmm'
......@@ -157,7 +162,7 @@ class b3s23writer(iwriter_base):
self.logicgate('pand', 0 + 7 * (i % 2), 6, 6)
def save_result_avx512(self, i, oddgen, terminal, diff=False):
def save_result_avx512(self, i, oddgen, terminal, diff=False, to_reg=False):
n = [64, 48, 32, 16, 64][int(terminal)]
......@@ -185,6 +190,9 @@ class b3s23writer(iwriter_base):
self.write16n(32, 15, 32, '(%1)')
else:
self.trogicgate(0b11110110, 6, 8, 15)
elif to_reg:
oldreg = '%%zmm' + str(23 + i)
self.printinstr('vmovdqa64 %s, %s' % ('%%zmm6', oldreg))
else:
self.write16n(n, 6, offset, e)
......@@ -237,10 +245,11 @@ class b3s23writer(iwriter_base):
self.f.write('\n' + (' ' * 20))
self.f.write('"memory");\n\n')
def assemble(self, rulestring, oddgen, rowcount, dwidth):
def assemble(self, rulestring, oddgen, rowcount, dwidth, regtemp=False):
self.prologue()
self.preparethings(dwidth)
if (not regtemp) or (oddgen == 0):
self.prologue()
self.preparethings(dwidth)
rpr = 16 if ('avx512' in self.iset) else (8 if ('avx2' in self.iset) else 4)
......@@ -251,7 +260,7 @@ class b3s23writer(iwriter_base):
if (i < riters):
terminal = max(0, ((i + 1) * rpr - rowcount) // 4)
self.load_and_hshift(i, oddgen, terminal)
self.load_and_hshift(i, oddgen, terminal, from_reg=(regtemp and oddgen))
self.horizontal_adders(i)
if (i > 0):
......@@ -270,9 +279,56 @@ class b3s23writer(iwriter_base):
diff = False
if 'avx512' in self.iset:
self.save_result_avx512(i, oddgen, terminal, diff)
self.save_result_avx512(i, oddgen, terminal, diff, to_reg=(regtemp and not oddgen))
else:
self.save_result(i, oddgen, terminal, diff)
self.epilogue(dwidth)
if (not regtemp) or (oddgen == 1):
self.epilogue(dwidth)
def write_monolith(self, rulestring, rowcount, dwidth):
    # Emit one C function, iterate_<besti>_<rowcount>_<dwidth>_monolith(d, e, diffs),
    # that runs the even and odd half-generations back to back: assemble()
    # is called with oddgen=0 then oddgen=1, and regtemp=True so the
    # intermediate generation is parked in vector registers rather than
    # being spilled to the e[] scratch buffer between the two passes.
    name = 'iterate_%s_%d_%d_monolith' % (self.besti, rowcount, dwidth)
    # Parameter list is built back to front, yielding (d, e, diffs).
    params = 'uint32_t * __restrict__ diffs'
    for i in 'ed':
        params = 'uint32_t * __restrict__ ' + i + ', ' + params
    self.f.write(' bool %s(%s) {\n' % (name, params))
    self.assemble(rulestring, 0, rowcount, dwidth, regtemp=True)
    self.assemble(rulestring, 1, rowcount, dwidth, regtemp=True)
    # Select which words of the diff buffer feed the generated epilogue:
    # bindices index e[] viewed as uint64_t (OR'd into bigdiff, the
    # "anything changed?" test); lindices are 32-bit indices OR'd into
    # diffs[2]. The arithmetic depends on the instruction set's rows-per-
    # register (16 for AVX-512, 8 for AVX2, 4 for SSE).
    if 'avx512' in self.iset:
        # Interior rows after removing the 4 boundary rows, reduced mod 16
        # down to a single register's worth.
        newrows = rowcount - 4
        while (newrows >= 32):
            newrows -= 16
        if (newrows <= 16):
            bindices = list(range(0, newrows // 2))
        elif (newrows == 28):
            # e.g. rowcount 48: words e64[8..13], matching the monolith's
            # hand-written epilogue.
            bindices = [8, 9, 10, 11, 12, 13]
        else:
            bindices = list(range(4, newrows // 2))
        lindices = list(range(newrows - 2, newrows))
    elif 'avx2' in self.iset:
        bindices = [4, 5, 6, 7]
        lindices = list(range(16 + ((rowcount - 6) % 8), 18 + ((rowcount - 6) % 8)))
    else:
        bindices = [2, 3]
        lindices = list(range(8 + ((rowcount - 6) % 4), 10 + ((rowcount - 6) % 4)))
    # Plain-C epilogue: OR the selected diff words together; return value
    # is true iff nothing changed; diffs[0..2] summarise the change masks
    # (diffs may be NULL in the generated code).
    self.f.write(' uint64_t* e64 = ((uint64_t*) e);\n')
    self.f.write(' uint64_t bigdiff = %s;\n' % (' | '.join(['e64[%d]' % x for x in bindices])))
    self.f.write(' if (diffs != 0) {\n')
    self.f.write(' diffs[0] = (bigdiff | (bigdiff >> 32));\n')
    self.f.write(' diffs[1] = e[0] | e[1];\n')
    self.f.write(' diffs[2] = %s;\n' % (' | '.join(['e[%d]' % x for x in lindices])))
    self.f.write(' }\n')
    self.f.write(' return (bigdiff == 0);\n')
    self.f.write(' }\n\n')
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment