Commit efa344cd authored by Adam P. Goucher

10 percent speed improvement from handwritten AVX2 assembly code

parent 2ea6081c
Pipeline #50226576 passed with stages in 8 minutes and 10 seconds
#ifndef LIFELIB_VERSION /*
__version__=[x.replace('"', '') for x in '''
*/
#define LIFELIB_VERSION "ll2.1.20"
#define LIFELIB_VERSION "ll2.1.21"
// '''.split() if ('ll' in x)][0][2:]
#endif
@@ -132,7 +132,7 @@
_copyBoundary5<H, K>(d, n->d);
}
-#ifdef __AVX512F__
+#ifdef __AVX2__
void copyBoundary3(VTile<H,K> *n);
void copyBoundary0(VTile<H,K> *n);
#else
...
@@ -49,6 +49,88 @@
: "r" (d), "r" (n->d), "r" (apg::__16x3fffffff)
: "xmm6", "xmm8", "xmm13", "xmm14", "memory");
}
#else
#ifdef __AVX2__
template<>
void VTile<28>::copyBoundary3(VTile<28> *n) {
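// Masked merge over the 28 rows of the tile, done as three 8-row ymm
// blocks plus a final 4-row xmm block: keep this tile's bits wherever
// the mask apg::__16xfffffffc is set, and splice in the neighbour's
// bits, shifted right by 28, where it is clear.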
asm (
"vmovdqu 8(%0), %%ymm6 \n\t"
"vmovdqu 8(%1), %%ymm8 \n\t"
"vmovdqu (%2), %%ymm14 \n\t"
"vpsrld $28, %%ymm8, %%ymm8 \n\t"
"vpand %%ymm14, %%ymm6, %%ymm6 \n\t"
"vpandn %%ymm8, %%ymm14, %%ymm8 \n\t"
"vpor %%ymm8, %%ymm6, %%ymm6 \n\t"
"vmovdqu %%ymm6, 8(%0) \n\t"
"vmovdqu 40(%0), %%ymm6 \n\t"
"vmovdqu 40(%1), %%ymm8 \n\t"
"vpsrld $28, %%ymm8, %%ymm8 \n\t"
"vpand %%ymm14, %%ymm6, %%ymm6 \n\t"
"vpandn %%ymm8, %%ymm14, %%ymm8 \n\t"
"vpor %%ymm8, %%ymm6, %%ymm6 \n\t"
"vmovdqu %%ymm6, 40(%0) \n\t"
"vmovdqu 72(%0), %%ymm6 \n\t"
"vmovdqu 72(%1), %%ymm8 \n\t"
"vpsrld $28, %%ymm8, %%ymm8 \n\t"
"vpand %%ymm14, %%ymm6, %%ymm6 \n\t"
"vpandn %%ymm8, %%ymm14, %%ymm8 \n\t"
"vpor %%ymm8, %%ymm6, %%ymm6 \n\t"
"vmovdqu %%ymm6, 72(%0) \n\t"
"vmovdqu 104(%0), %%xmm6 \n\t"
"vmovdqu 104(%1), %%xmm8 \n\t"
"vpsrld $28, %%xmm8, %%xmm8 \n\t"
"vpand %%xmm14, %%xmm6, %%xmm6 \n\t"
"vpandn %%xmm8, %%xmm14, %%xmm8 \n\t"
"vpor %%xmm8, %%xmm6, %%xmm6 \n\t"
"vmovdqu %%xmm6, 104(%0) \n\t"
: /* no output operands */
: "r" (d), "r" (n->d), "r" (apg::__16xfffffffc)
: "xmm6", "xmm8", "xmm13", "xmm14", "memory");
}
template<>
void VTile<28>::copyBoundary0(VTile<28> *n) {
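// Same structure as copyBoundary3, but the neighbour's bits are
// shifted left by 28 and merged in wherever apg::__16x3fffffff is clear.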
asm (
"vmovdqu 8(%0), %%ymm6 \n\t"
"vmovdqu 8(%1), %%ymm8 \n\t"
"vmovdqu (%2), %%ymm14 \n\t"
"vpslld $28, %%ymm8, %%ymm8 \n\t"
"vpand %%ymm14, %%ymm6, %%ymm6 \n\t"
"vpandn %%ymm8, %%ymm14, %%ymm8 \n\t"
"vpor %%ymm8, %%ymm6, %%ymm6 \n\t"
"vmovdqu %%ymm6, 8(%0) \n\t"
"vmovdqu 40(%0), %%ymm6 \n\t"
"vmovdqu 40(%1), %%ymm8 \n\t"
"vpslld $28, %%ymm8, %%ymm8 \n\t"
"vpand %%ymm14, %%ymm6, %%ymm6 \n\t"
"vpandn %%ymm8, %%ymm14, %%ymm8 \n\t"
"vpor %%ymm8, %%ymm6, %%ymm6 \n\t"
"vmovdqu %%ymm6, 40(%0) \n\t"
"vmovdqu 72(%0), %%ymm6 \n\t"
"vmovdqu 72(%1), %%ymm8 \n\t"
"vpslld $28, %%ymm8, %%ymm8 \n\t"
"vpand %%ymm14, %%ymm6, %%ymm6 \n\t"
"vpandn %%ymm8, %%ymm14, %%ymm8 \n\t"
"vpor %%ymm8, %%ymm6, %%ymm6 \n\t"
"vmovdqu %%ymm6, 72(%0) \n\t"
"vmovdqu 104(%0), %%xmm6 \n\t"
"vmovdqu 104(%1), %%xmm8 \n\t"
"vpslld $28, %%xmm8, %%xmm8 \n\t"
"vpand %%xmm14, %%xmm6, %%xmm6 \n\t"
"vpandn %%xmm8, %%xmm14, %%xmm8 \n\t"
"vpor %%xmm8, %%xmm6, %%xmm6 \n\t"
"vmovdqu %%xmm6, 104(%0) \n\t"
: /* no output operands */
: "r" (d), "r" (n->d), "r" (apg::__16x3fffffff)
: "xmm6", "xmm8", "xmm13", "xmm14", "memory");
}
#endif
#endif
template<>
...
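For readers who prefer intrinsics, here is a rough AVX2 sketch of the row-wise masked merge that the hand-written copyBoundary3 above performs. It is not part of the commit: the function name, the raw std::uint32_t* parameters, the word offsets, and the literal 0xfffffffc standing in for apg::__16xfffffffc are illustrative assumptions read off the assembly.

// Rough AVX2-intrinsics equivalent of copyBoundary3 for VTile<28>
// (illustrative only; assumes one 32-bit word per row, rows starting
// at word offset 2, and a broadcast 0xfffffffc mask).
#include <immintrin.h>
#include <cstdint>

static void copy_boundary3_sketch(std::uint32_t *d, const std::uint32_t *nd) {
    const __m256i mask = _mm256_set1_epi32((int) 0xfffffffcu);  // assumed contents of apg::__16xfffffffc
    int i = 2;                        // byte offset 8 in the asm
    for (; i + 8 <= 30; i += 8) {     // rows 2..25 in three 8-row blocks
        __m256i own   = _mm256_loadu_si256((const __m256i *) (d  + i));
        __m256i neigh = _mm256_loadu_si256((const __m256i *) (nd + i));
        neigh = _mm256_srli_epi32(neigh, 28);        // neighbour columns into the low bits
        own   = _mm256_and_si256(mask, own);         // keep own bits where mask is set
        neigh = _mm256_andnot_si256(mask, neigh);    // neighbour bits where mask is clear
        _mm256_storeu_si256((__m256i *) (d + i), _mm256_or_si256(neigh, own));
    }
    // Final four rows with 128-bit operations, as in the asm's xmm tail.
    __m128i m     = _mm256_castsi256_si128(mask);
    __m128i own   = _mm_loadu_si128((const __m128i *) (d  + i));
    __m128i neigh = _mm_loadu_si128((const __m128i *) (nd + i));
    neigh = _mm_srli_epi32(neigh, 28);
    _mm_storeu_si128((__m128i *) (d + i),
                     _mm_or_si128(_mm_andnot_si128(m, neigh), _mm_and_si128(m, own)));
}

Compilers will usually turn something like this into much the same instruction sequence; the hand-written version simply pins the register assignment and keeps the mask resident in ymm14/xmm14 across all four blocks.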