Commit 300dd11c authored by Adam P. Goucher's avatar Adam P. Goucher

Need special implementations to ensure correct vectorisation

parent 134e99ba
Pipeline #49162749 passed with stages
in 7 minutes and 50 seconds
......@@ -132,6 +132,10 @@
_copyBoundary5<H, K>(d, n->d);
}
#ifdef __AVX512F__
// On AVX-512 targets these two are only declared here; the definitions are
// hand-written asm specialisations (per tile height) provided further down,
// to ensure correct vectorisation instead of relying on the compiler.
void copyBoundary3(VTile<H,K> *n);
void copyBoundary0(VTile<H,K> *n);
#else
// Generic (non-AVX-512) path: copy the side-3 boundary cells of the
// neighbouring tile n into this tile's edge, delegating to the templated
// helper that operates directly on the raw row arrays.
void copyBoundary3(VTile<H,K> *n) {
_copyBoundary3<H, K>(d, n->d);
}
......@@ -139,6 +143,7 @@
// Generic (non-AVX-512) path: copy the side-0 boundary cells of the
// neighbouring tile n into this tile's edge via the templated helper.
void copyBoundary0(VTile<H,K> *n) {
_copyBoundary0<H, K>(d, n->d);
}
#endif
inline void updateTile(upattern<VTile<H,K>, 32 - 2*K, H>* owner, int rule, int family, int mantissa) __attribute__((always_inline));
// ^^^ we really do need both the prefix 'inline' and the attribute 'always_inline' for this to work ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
......
#ifdef __AVX512F__
// AVX-512 specialisation for 28-row tiles: pull the side-3 boundary from
// neighbour n into this tile in place. Each 32-bit word is one tile row;
// the neighbour's rows are shifted right by 28 (its top bits land in the
// low bit positions) and blended into our rows under the constant mask
// apg::__16xfffffffc (name suggests 16 dwords of 0xfffffffc: keep bits
// 2..31 of ours, take bits 0..1 from the neighbour -- TODO confirm against
// the constant's definition). Offsets 8/72/104 presumably skip leading
// boundary rows of the row array; verify against the VTile<28> layout.
template<>
void VTile<28>::copyBoundary3(VTile<28> *n) {
asm (
// Rows at bytes 8..71: one full zmm load/blend/store.
"vmovdqu64 8(%0), %%zmm6 \n\t" // 16 rows of this tile
"vmovdqu64 8(%1), %%zmm8 \n\t" // matching 16 rows of neighbour
"vmovdqu64 (%2), %%zmm14 \n\t" // blend mask (kept in zmm14 throughout)
"vpsrld $28, %%zmm8, %%zmm8 \n\t" // neighbour rows >> 28
"vpternlogd $228, %%zmm14, %%zmm8, %%zmm6 \n\t" // 0xE4: zmm6 = (mask & zmm6) | (~mask & zmm8)
"vmovdqu64 %%zmm6, 8(%0) \n\t"
// Remaining 12 rows (bytes 72..119): assemble ymm+xmm pieces into one zmm.
"vmovdqu 72(%0), %%ymm6 \n\t"
"vmovdqu 104(%0), %%xmm13 \n\t"
"vshufi32x4 $68, %%zmm13, %%zmm6, %%zmm6 \n\t" // zmm6 = {ymm6, xmm13, (unused)}
"vmovdqu 72(%1), %%ymm8 \n\t"
"vmovdqu 104(%1), %%xmm13 \n\t"
"vshufi32x4 $68, %%zmm13, %%zmm8, %%zmm8 \n\t" // same gather for the neighbour
"vpsrld $28, %%zmm8, %%zmm8 \n\t"
"vpternlogd $228, %%zmm14, %%zmm8, %%zmm6 \n\t" // blend as above
"vshufi32x4 $78, %%zmm6, %%zmm6, %%zmm13 \n\t" // swap 256-bit halves so lane 2 is in xmm13
"vmovdqu %%ymm6, 72(%0) \n\t" // store low 8 rows
"vmovdqu %%xmm13, 104(%0) \n\t" // store final 4 rows
: /* no output operands */
: "r" (d), "r" (n->d), "r" (apg::__16xfffffffc)
: "xmm6", "xmm8", "xmm13", "xmm14", "memory"); // xmm clobbers cover the full zmm registers
}
// AVX-512 specialisation for 28-row tiles: pull the side-0 boundary from
// neighbour n into this tile in place. Mirror image of copyBoundary3:
// the neighbour's rows are shifted LEFT by 28 (its low bits land in the
// top bit positions) and blended under apg::__16x3fffffff (name suggests
// 16 dwords of 0x3fffffff: keep bits 0..29 of ours, take bits 30..31 from
// the neighbour -- TODO confirm against the constant's definition).
template<>
void VTile<28>::copyBoundary0(VTile<28> *n) {
asm (
// Rows at bytes 8..71: one full zmm load/blend/store.
"vmovdqu64 8(%0), %%zmm6 \n\t" // 16 rows of this tile
"vmovdqu64 8(%1), %%zmm8 \n\t" // matching 16 rows of neighbour
"vmovdqu64 (%2), %%zmm14 \n\t" // blend mask (kept in zmm14 throughout)
"vpslld $28, %%zmm8, %%zmm8 \n\t" // neighbour rows << 28
"vpternlogd $228, %%zmm14, %%zmm8, %%zmm6 \n\t" // 0xE4: zmm6 = (mask & zmm6) | (~mask & zmm8)
"vmovdqu64 %%zmm6, 8(%0) \n\t"
// Remaining 12 rows (bytes 72..119): assemble ymm+xmm pieces into one zmm.
"vmovdqu 72(%0), %%ymm6 \n\t"
"vmovdqu 104(%0), %%xmm13 \n\t"
"vshufi32x4 $68, %%zmm13, %%zmm6, %%zmm6 \n\t" // zmm6 = {ymm6, xmm13, (unused)}
"vmovdqu 72(%1), %%ymm8 \n\t"
"vmovdqu 104(%1), %%xmm13 \n\t"
"vshufi32x4 $68, %%zmm13, %%zmm8, %%zmm8 \n\t" // same gather for the neighbour
"vpslld $28, %%zmm8, %%zmm8 \n\t"
"vpternlogd $228, %%zmm14, %%zmm8, %%zmm6 \n\t" // blend as above
"vshufi32x4 $78, %%zmm6, %%zmm6, %%zmm13 \n\t" // swap 256-bit halves so lane 2 is in xmm13
"vmovdqu %%ymm6, 72(%0) \n\t" // store low 8 rows
"vmovdqu %%xmm13, 104(%0) \n\t" // store final 4 rows
: /* no output operands */
: "r" (d), "r" (n->d), "r" (apg::__16x3fffffff)
: "xmm6", "xmm8", "xmm13", "xmm14", "memory"); // xmm clobbers cover the full zmm registers
}
#endif
template<>
inline void VTile<28>::updateTile(upattern<VTile<28>, 28, 28>* owner, int rule, int family, int mantissa) {
......
#ifdef __AVX512F__
// AVX-512 specialisation for 44-row tiles: same blend as the VTile<28>
// version (neighbour rows >> 28, merged under apg::__16xfffffffc, which by
// its name is 16 dwords of 0xfffffffc -- TODO confirm), but with one extra
// full-zmm pass because the taller tile spans 16 more rows.
template<>
void VTile<44>::copyBoundary3(VTile<44> *n) {
asm (
// Rows at bytes 8..71: first full zmm load/blend/store.
"vmovdqu64 8(%0), %%zmm6 \n\t"
"vmovdqu64 8(%1), %%zmm8 \n\t"
"vmovdqu64 (%2), %%zmm14 \n\t" // blend mask (kept in zmm14 throughout)
"vpsrld $28, %%zmm8, %%zmm8 \n\t" // neighbour rows >> 28
"vpternlogd $228, %%zmm14, %%zmm8, %%zmm6 \n\t" // 0xE4: zmm6 = (mask & zmm6) | (~mask & zmm8)
"vmovdqu64 %%zmm6, 8(%0) \n\t"
// Rows at bytes 72..135: second full zmm pass.
"vmovdqu64 72(%0), %%zmm6 \n\t"
"vmovdqu64 72(%1), %%zmm8 \n\t"
"vpsrld $28, %%zmm8, %%zmm8 \n\t"
"vpternlogd $228, %%zmm14, %%zmm8, %%zmm6 \n\t"
"vmovdqu64 %%zmm6, 72(%0) \n\t"
// Remaining 12 rows (bytes 136..183): assemble ymm+xmm pieces into one zmm.
"vmovdqu 136(%0), %%ymm6 \n\t"
"vmovdqu 168(%0), %%xmm13 \n\t"
"vshufi32x4 $68, %%zmm13, %%zmm6, %%zmm6 \n\t" // zmm6 = {ymm6, xmm13, (unused)}
"vmovdqu 136(%1), %%ymm8 \n\t"
"vmovdqu 168(%1), %%xmm13 \n\t"
"vshufi32x4 $68, %%zmm13, %%zmm8, %%zmm8 \n\t" // same gather for the neighbour
"vpsrld $28, %%zmm8, %%zmm8 \n\t"
"vpternlogd $228, %%zmm14, %%zmm8, %%zmm6 \n\t" // blend as above
"vshufi32x4 $78, %%zmm6, %%zmm6, %%zmm13 \n\t" // swap 256-bit halves so lane 2 is in xmm13
"vmovdqu %%ymm6, 136(%0) \n\t" // store low 8 rows
"vmovdqu %%xmm13, 168(%0) \n\t" // store final 4 rows
: /* no output operands */
: "r" (d), "r" (n->d), "r" (apg::__16xfffffffc)
: "xmm6", "xmm8", "xmm13", "xmm14", "memory"); // xmm clobbers cover the full zmm registers
}
// AVX-512 specialisation for 44-row tiles, side 0: mirror of the side-3
// routine above -- neighbour rows shifted LEFT by 28 and merged under
// apg::__16x3fffffff (by its name 16 dwords of 0x3fffffff: keep bits
// 0..29 of ours, take bits 30..31 from the neighbour -- TODO confirm).
template<>
void VTile<44>::copyBoundary0(VTile<44> *n) {
asm (
// Rows at bytes 8..71: first full zmm load/blend/store.
"vmovdqu64 8(%0), %%zmm6 \n\t"
"vmovdqu64 8(%1), %%zmm8 \n\t"
"vmovdqu64 (%2), %%zmm14 \n\t" // blend mask (kept in zmm14 throughout)
"vpslld $28, %%zmm8, %%zmm8 \n\t" // neighbour rows << 28
"vpternlogd $228, %%zmm14, %%zmm8, %%zmm6 \n\t" // 0xE4: zmm6 = (mask & zmm6) | (~mask & zmm8)
"vmovdqu64 %%zmm6, 8(%0) \n\t"
// Rows at bytes 72..135: second full zmm pass.
"vmovdqu64 72(%0), %%zmm6 \n\t"
"vmovdqu64 72(%1), %%zmm8 \n\t"
"vpslld $28, %%zmm8, %%zmm8 \n\t"
"vpternlogd $228, %%zmm14, %%zmm8, %%zmm6 \n\t"
"vmovdqu64 %%zmm6, 72(%0) \n\t"
// Remaining 12 rows (bytes 136..183): assemble ymm+xmm pieces into one zmm.
"vmovdqu 136(%0), %%ymm6 \n\t"
"vmovdqu 168(%0), %%xmm13 \n\t"
"vshufi32x4 $68, %%zmm13, %%zmm6, %%zmm6 \n\t" // zmm6 = {ymm6, xmm13, (unused)}
"vmovdqu 136(%1), %%ymm8 \n\t"
"vmovdqu 168(%1), %%xmm13 \n\t"
"vshufi32x4 $68, %%zmm13, %%zmm8, %%zmm8 \n\t" // same gather for the neighbour
"vpslld $28, %%zmm8, %%zmm8 \n\t"
"vpternlogd $228, %%zmm14, %%zmm8, %%zmm6 \n\t" // blend as above
"vshufi32x4 $78, %%zmm6, %%zmm6, %%zmm13 \n\t" // swap 256-bit halves so lane 2 is in xmm13
"vmovdqu %%ymm6, 136(%0) \n\t" // store low 8 rows
"vmovdqu %%xmm13, 168(%0) \n\t" // store final 4 rows
: /* no output operands */
: "r" (d), "r" (n->d), "r" (apg::__16x3fffffff)
: "xmm6", "xmm8", "xmm13", "xmm14", "memory"); // xmm clobbers cover the full zmm registers
}
#endif
template<>
inline void VTile<44>::updateTile(upattern<VTile<44>, 28, 44>* owner, int rule, int family, int mantissa) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment