Commit b11381f4 authored by Adam P. Goucher

Vectorise disjunction to yield 1 percent speed improvement on AVX-512

parent 72541327
Pipeline #47075266 passed with stages
in 7 minutes and 48 seconds
......@@ -8630,9 +8630,9 @@ namespace b3s23 {
"vpxord %%zmm6, %%zmm8, %%zmm8 \n\t"
"vshufi32x4 $78, %%zmm15, %%zmm15, %%zmm13 \n\t"
"vpord %%zmm13, %%zmm15, %%zmm15 \n\t"
"vmovdqu %%ymm15, 32(%1) \n\t"
"vshufi32x4 $78, %%zmm8, %%zmm8, %%zmm13 \n\t"
"vmovdqu %%ymm8, 64(%1) \n\t"
"vpord %%ymm8, %%ymm15, %%ymm15 \n\t"
"vmovdqu %%ymm15, 64(%1) \n\t"
"vmovdqu %%xmm13, 96(%1) \n\t"
: /* no output operands */
: "r" (d), "r" (e), "r" (apg::__sixteen28)
......@@ -8652,7 +8652,7 @@ namespace b3s23 {
}
}
uint64_t* e64 = ((uint64_t*) e);
uint64_t bigdiff = e64[4] | e64[5] | e64[6] | e64[7] | e64[8] | e64[9] | e64[10] | e64[11] | e64[12] | e64[13];
uint64_t bigdiff = e64[8] | e64[9] | e64[10] | e64[11] | e64[12] | e64[13];
if (diffs != 0) {
diffs[0] = (bigdiff | (bigdiff >> 32));
diffs[1] = e[0] | e[1];
......@@ -8812,9 +8812,9 @@ namespace b3s23 {
"vpxord %%zmm6, %%zmm8, %%zmm8 \n\t"
"vshufi32x4 $78, %%zmm15, %%zmm15, %%zmm13 \n\t"
"vpord %%zmm13, %%zmm15, %%zmm15 \n\t"
"vmovdqu %%ymm15, 32(%1) \n\t"
"vshufi32x4 $78, %%zmm8, %%zmm8, %%zmm13 \n\t"
"vmovdqu %%ymm8, 64(%1) \n\t"
"vpord %%ymm8, %%ymm15, %%ymm15 \n\t"
"vmovdqu %%ymm15, 64(%1) \n\t"
"vmovdqu %%xmm13, 96(%1) \n\t"
: /* no output operands */
: "r" (d), "r" (e), "r" (apg::__sixteen28)
......@@ -8834,7 +8834,7 @@ namespace b3s23 {
}
}
uint64_t* e64 = ((uint64_t*) e);
uint64_t bigdiff = e64[4] | e64[5] | e64[6] | e64[7] | e64[8] | e64[9] | e64[10] | e64[11] | e64[12] | e64[13];
uint64_t bigdiff = e64[8] | e64[9] | e64[10] | e64[11] | e64[12] | e64[13];
if (diffs != 0) {
diffs[0] = (bigdiff | (bigdiff >> 32));
diffs[1] = e[0] | e[1];
......@@ -8996,8 +8996,8 @@ namespace b3s23 {
"vpxord %%zmm6, %%zmm8, %%zmm8 \n\t"
"vshufi32x4 $78, %%zmm15, %%zmm15, %%zmm13 \n\t"
"vpord %%zmm13, %%zmm15, %%zmm15 \n\t"
"vmovdqu %%ymm15, 32(%1) \n\t"
"vmovdqu %%ymm8, 64(%1) \n\t"
"vmovdqu %%ymm15, 32(%1) \n\t"
: /* no output operands */
: "r" (d), "r" (e), "r" (apg::__sixteen24)
: "ebx", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
......@@ -9172,8 +9172,8 @@ namespace b3s23 {
"vpxord %%zmm6, %%zmm8, %%zmm8 \n\t"
"vshufi32x4 $78, %%zmm15, %%zmm15, %%zmm13 \n\t"
"vpord %%zmm13, %%zmm15, %%zmm15 \n\t"
"vmovdqu %%ymm15, 32(%1) \n\t"
"vmovdqu %%xmm8, 64(%1) \n\t"
"vmovdqu %%ymm15, 32(%1) \n\t"
: /* no output operands */
: "r" (d), "r" (e), "r" (apg::__sixteen20)
: "ebx", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
......
......@@ -194,6 +194,8 @@ class iwriter_base(object):
if (newrows <= 16):
bindices = list(range(0, newrows // 2))
elif (newrows == 28):
bindices = [8, 9, 10, 11, 12, 13]
else:
bindices = list(range(4, newrows // 2))
lindices = list(range(newrows - 2, newrows))
......
......@@ -175,8 +175,14 @@ class b3s23writer(iwriter_base):
self.logicgate('pxord', 6, 8, 8)
self.printinstr('vshufi32x4 $78, %s, %s, %s' % ('%%zmm15', '%%zmm15', '%%zmm13'))
self.logicgate('pord', 13, 15, 15)
self.write16n(32, 15, 32, '(%1)')
self.write16n(n, 8, 64, '(%1)')
if (n == 48):
self.printinstr('vshufi32x4 $78, %s, %s, %s' % ('%%zmm8', '%%zmm8', '%%zmm13'))
self.printinstr("vpord %%ymm8, %%ymm15, %%ymm15")
self.write16n(32, 15, 64, '(%1)')
self.write16n(16, 13, 96, '(%1)')
else:
self.write16n( n, 8, 64, '(%1)')
self.write16n(32, 15, 32, '(%1)')
else:
self.trogicgate(0b11110110, 6, 8, 15)
else:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment