SSE2 Index* and Compare* for i386.
Use function pointers that initially point at dispatching functions; these analyze CPU features and redirect the pointer to a more appropriate version for the rest of eternity. Further tweak by inlining the original functions under `{$if defined(cpui386)}` (so that if other platforms decide to do the same, they can extend the condition). This is a trick I’ve been using for my batch-processing and mathematical functions as of late, to great results like a 10–15× faster matrix inverse. :’D
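To illustrate the idea, here’s a minimal C++ sketch of the self-patching dispatch (hypothetical names and a GCC/Clang CPUID check; the actual RTL code is Pascal and differs in detail):

```cpp
#include <cstddef>
#include <cstdint>
#include <cpuid.h> // GCC/Clang CPUID wrapper

// Hypothetical stand-ins for the two implementations.
static ptrdiff_t index_byte_plain(const uint8_t* buf, ptrdiff_t len, uint8_t v)
{
    for (ptrdiff_t i = 0; i < len; i++)
        if (buf[i] == v) return i;
    return -1;
}

static ptrdiff_t index_byte_sse2(const uint8_t* buf, ptrdiff_t len, uint8_t v)
{
    return index_byte_plain(buf, len, v); // imagine the SSE2 body here
}

static ptrdiff_t index_byte_dispatch(const uint8_t*, ptrdiff_t, uint8_t);

// The public entry point: starts out pointing at the dispatcher, then points
// at the chosen implementation for the rest of eternity.
static ptrdiff_t (*index_byte_impl)(const uint8_t*, ptrdiff_t, uint8_t) = index_byte_dispatch;

static ptrdiff_t index_byte_dispatch(const uint8_t* buf, ptrdiff_t len, uint8_t v)
{
    unsigned a, b, c, d;
    bool sse2 = __get_cpuid(1, &a, &b, &c, &d) && (d & bit_SSE2);
    index_byte_impl = sse2 ? index_byte_sse2 : index_byte_plain;
    return index_byte_impl(buf, len, v); // serve the first call as well
}
```

CPUID runs exactly once; afterwards every call costs a single indirect call, and the non-SSE2 fallback is never even touched on capable CPUs.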
I’d want an AVX2 `IndexQWord` for x64 (I like searching pointers a lot... and maybe certain other functions, but the special thing about `IndexQWord` is that it cannot be substantially improved without AVX2, which gives the same result for `QWord`s as !376 (merged) does for `DWord`s), but the x64 `System` doesn’t detect AVX2 for you the way i386 detects SSE (would I be able to do `uses cpu;` with #40114?).
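For reference, something like this untested intrinsics sketch is what I mean (not part of this MR, names are mine, needs `-mavx2`; `_mm256_cmpeq_epi64` is the 64-bit lane compare that SSE2 has no counterpart to):

```cpp
#include <immintrin.h>
#include <cstddef>
#include <cstdint>

// Untested sketch of an AVX2 IndexQWord, same contract as index_dword below.
ptrdiff_t index_qword_avx2(const uint64_t* buf, ptrdiff_t len, uint64_t v)
{
    __m256i ref = _mm256_set1_epi64x((long long)v);
    ptrdiff_t i = 0;
    for (; i + 4 <= len; i += 4)
    {
        __m256i sample = _mm256_loadu_si256((const __m256i*)(buf + i));
        // VPCMPEQQ: the per-QWord compare this is all about.
        __m256i eq = _mm256_cmpeq_epi64(sample, ref);
        int cmp_mask = _mm256_movemask_pd(_mm256_castsi256_pd(eq)); // 1 bit per lane
        if (cmp_mask != 0) return i + __builtin_ctz(cmp_mask);
    }
    for (; i < len; i++) // scalar tail (also swallows len <= 0)
        if (buf[i] == v) return i;
    return -1;
}
```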
Also, I wasn’t in the mood to reverse-engineer the `IndexWord` algorithm (like I did with `IndexByte`; or rather, its unaligned part) and just transpiled the x86-64 version by hand; maybe I’ll redo it more consciously later. Even the current version, stupid and scary as it is, is many times faster, though (the gap would be smaller with !375 (merged)...).
Benchmark: `IndexCompareBenchmarkI386.pas`.
My results ✨
| | plain, ns/call | SSE2, ns/call |
| --- | ---: | ---: |
| `IndexByte(#0 .. #1)` | 2.5 | 2.5 |
| `IndexByte(#0 .. #14)` | 3.9 | 2.5 |
| `IndexByte(#0 .. #15)` | 3.6 | 2.5 |
| `IndexByte(#0 .. #16)` | 3.6 | 2.6 |
| `IndexByte(#0 .. #49)` | 5.7 | 3.2 |
| `IndexByte(#0 .. #99)` | 9.0 | 4.1 |
| `IndexByte(#373 .. #626)` | 78 | 25 |
| `IndexWord(#0 .. #1)` | 16 | 3.2 |
| `IndexWord(#0 .. #14)` | 19 | 3.4 |
| `IndexWord(#0 .. #15)` | 19 | 3.4 |
| `IndexWord(#0 .. #16)` | 19 | 3.5 |
| `IndexWord(#0 .. #49)` | 27 | 5.7 |
| `IndexWord(#0 .. #99)` | 39 | 11 |
| `IndexWord(#0 .. #999)` | 257 | 39 |
| `IndexDWord(#0 .. #1)` | 16 | 2.9 |
| `IndexDWord(#0 .. #14)` | 18 | 3.0 |
| `IndexDWord(#0 .. #15)` | 19 | 2.9 |
| `IndexDWord(#0 .. #16)` | 19 | 3.0 |
| `IndexDWord(#0 .. #49)` | 27 | 4.6 |
| `IndexDWord(#0 .. #99)` | 39 | 8.8 |
| `IndexDWord(#0 .. #999)` | 258 | 60 |
| `CompareByte(#0 / 1)` | 2.1 | 2.1 |
| `CompareByte(#6 / 7)` | 4.7 | 2.5 |
| `CompareByte(#14 / 15)` | 6.2 | 2.5 |
| `CompareByte(#30 / 31)` | 7.2 | 3.1 |
| `CompareByte(#1 / 100)` | 2.3 | 2.4 |
| `CompareByte(#99 / 100)` | 11 | 6.6 |
| `CompareByte(#199 / 200)` | 18 | 10 |
| `CompareByte(#999 / 1000)` | 81 | 38 |
| `CompareByte(#9999 / 10000)` | 700 | 397 |
| `CompareWord(#0 / 1)` | 2.0 | 2.8 |
| `CompareWord(#6 / 7)` | 3.1 | 2.8 |
| `CompareWord(#14 / 15)` | 4.8 | 3.5 |
| `CompareWord(#30 / 31)` | 8.4 | 5.1 |
| `CompareWord(#1 / 100)` | 2.3 | 2.6 |
| `CompareWord(#99 / 100)` | 6.5 | 4.8 |
| `CompareWord(#199 / 200)` | 40 | 22 |
| `CompareWord(#999 / 1000)` | 140 | 102 |
| `CompareWord(#9999 / 10000)` | 601 | 420 |
| `CompareDWord(#0 / 1)` | 2.0 | 2.0 |
| `CompareDWord(#6 / 7)` | 5.2 | 3.9 |
| `CompareDWord(#14 / 15)` | 6.8 | 3.9 |
| `CompareDWord(#30 / 31)` | 14 | 6.5 |
| `CompareDWord(#1 / 100)` | 2.3 | 2.5 |
| `CompareDWord(#99 / 100)` | 5.9 | 3.3 |
| `CompareDWord(#199 / 200)` | 63 | 23 |
| `CompareDWord(#999 / 1000)` | 494 | 196 |
| `CompareDWord(#9999 / 10000)` | 1867 | 719 |
C++ sources 😱 🩸 ⚠️ 🚫 ☣️ ☢️

(don’t laugh at `IndexWord`, I needed a proof of concept to see if the entire thing was worth it)

```cpp
#include <immintrin.h>
#include <cstdint>
#include <limits>
using namespace std;
const uintptr_t XMM_MASK = 16 - 1;
const uintptr_t PAGE_SIZE = 4096;
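// regparm(3) passes the first three integer arguments in EAX, EDX, ECX,
// mirroring FPC's default i386 'register' calling convention.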
__attribute__((regparm(3)))
ptrdiff_t compare_byte(uint8_t* a, uint8_t* b, ptrdiff_t n)
{
uint8_t *ae = a + n, *aepart;
if (n >= 4)
{
aepart = a + (n & ~XMM_MASK);
while (a != aepart)
{
__m128i sample_a = _mm_loadu_si128((__m128i*)a);
__m128i sample_b = _mm_loadu_si128((__m128i*)b);
int cmp_mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8(sample_a, sample_b)) & 65535;
if (cmp_mask != 0)
{
size_t idiff = __builtin_ctz(cmp_mask);
return (ptrdiff_t)a[idiff] - (ptrdiff_t)b[idiff];
}
a += 1 + XMM_MASK, b += 1 + XMM_MASK;
}
if (a == ae) return 0;
// If both tails can be over-read as XMMs, compare them as XMMs.
// ((a ^ (a + b)) >= PAGE_SIZE) iff 'a + b' resides on a different page than 'a'.
if
(
((uintptr_t)a ^ ((uintptr_t)a + XMM_MASK)) < PAGE_SIZE &&
((uintptr_t)b ^ ((uintptr_t)b + XMM_MASK)) < PAGE_SIZE
)
{
__m128i sample_a = _mm_loadu_si128((__m128i*)a);
__m128i sample_b = _mm_loadu_si128((__m128i*)b);
int cmp_mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8(sample_a, sample_b)) & 65535;
if (cmp_mask == 0) return 0;
size_t idiff = __builtin_ctz(cmp_mask);
if (a + idiff >= ae) return 0;
return (ptrdiff_t)a[idiff] - (ptrdiff_t)b[idiff];
}
// Compare uint32 by uint32, can be replaced with 'goto bytewise_body'.
// GCC "smartly" sees that ae - a == (n & 15) and "cleverly" calculates 'a + ((ae - a) & -4)' as 'a + (n & 12)'.
// This makes the entire function use up a third nonvolatile register to preserve 'n' until here.
// Force it to calculate '(ae - a) & -4' directly.
asm volatile("" : "+rm" (ae) ::);
aepart = a + ((ae - a) & ~uintptr_t{sizeof(uint32_t) - 1});
while (a != aepart)
{
if (*(uint32_t*)a != *(uint32_t*)b)
return 2 * (__builtin_bswap32(*(uint32_t*)a) > __builtin_bswap32(*(volatile uint32_t*)b)) - 1;
a += sizeof(uint32_t); b += sizeof(uint32_t);
}
}
while (a != ae)
{ // bytewise_body:
if (*a != *b) return (ptrdiff_t)*a - (ptrdiff_t)*(volatile uint8_t*)b;
a++, b++;
}
return 0;
}
__attribute__((regparm(3)))
ptrdiff_t compare_word(uint16_t* a, uint16_t* b, ptrdiff_t n)
{
uint16_t *ae = a + n, *aepart;
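// n < 0, or so large that 'a + n' would overflow: set ae = a and jump into the
// loop body, making the elementwise loop effectively unbounded (it stops only
// at a difference).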
if (n < 0 || n > numeric_limits<ptrdiff_t>::max() / 2)
{
ae = a;
goto wordwise_body;
}
if (n >= 4)
{
aepart = a + (n & -ptrdiff_t{8});
while (a != aepart)
{
__m128i sample_a = _mm_loadu_si128((__m128i*)a);
__m128i sample_b = _mm_loadu_si128((__m128i*)b);
int cmp_mask = ~_mm_movemask_epi8(_mm_cmpeq_epi16(sample_a, sample_b)) & 65535;
if (cmp_mask != 0)
{
size_t idiff = __builtin_ctz(cmp_mask);
return 2 * (*(uint16_t*)((uint8_t*)a + idiff) > *(uint16_t*)((uint8_t*)b + idiff)) - 1;
}
a += 8, b += 8;
}
if (a == ae) return 0;
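// Same page-crossing over-read check as in compare_byte.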
if
(
((uintptr_t)a ^ ((uintptr_t)a + XMM_MASK)) < PAGE_SIZE &&
((uintptr_t)b ^ ((uintptr_t)b + XMM_MASK)) < PAGE_SIZE
)
{
__m128i sample_a = _mm_loadu_si128((__m128i*)a);
__m128i sample_b = _mm_loadu_si128((__m128i*)b);
int cmp_mask = ~_mm_movemask_epi8(_mm_cmpeq_epi16(sample_a, sample_b)) & 65535;
if (cmp_mask == 0) return 0;
size_t idiff = __builtin_ctz(cmp_mask);
if ((uintptr_t)a + idiff >= (uintptr_t)ae) return 0;
return 2 * (*(uint16_t*)((uint8_t*)a + idiff) > *(uint16_t*)((uint8_t*)b + idiff)) - 1;
}
goto wordwise_body;
}
while (a != ae)
{ wordwise_body:
if (*a != *b) return 2 * (*a > *b) - 1;
a++, b++;
}
return 0;
}
__attribute__((regparm(3)))
ptrdiff_t compare_dword(uint32_t* a, uint32_t* b, ptrdiff_t n)
{
uint32_t *ae = a + n, *aepart;
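// Same guard against unrepresentable 'n' as in compare_word.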
if (n < 0 || n > numeric_limits<ptrdiff_t>::max() / 4)
{
ae = a;
goto dwordwise_body;
}
if (n >= 4)
{
aepart = a + (n & -ptrdiff_t{4});
do
{
__m128i sample_a = _mm_loadu_si128((__m128i*)a);
__m128i sample_b = _mm_loadu_si128((__m128i*)b);
int cmp_mask = ~_mm_movemask_epi8(_mm_cmpeq_epi32(sample_a, sample_b)) & 65535;
if (cmp_mask != 0)
{
size_t idiff = __builtin_ctz(cmp_mask);
return 2 * (*(uint32_t*)((uint8_t*)a + idiff) > *(uint32_t*)((uint8_t*)b + idiff)) - 1;
}
a += 4, b += 4;
} while (a != aepart);
}
while (a != ae)
{ dwordwise_body:
if (*a != *b) return 2 * (*a > *b) - 1;
a++, b++;
}
return 0;
}
__attribute__((regparm(3)))
ptrdiff_t index_byte(uint8_t* buf, ptrdiff_t len, uint8_t v)
{
uint8_t *aligned_buf = (uint8_t*)((uintptr_t)buf & ~XMM_MASK);
if (len == 0) return -1;
__m128i ref = _mm_set1_epi8(v);
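// The first load reads the whole aligned 16-byte chunk containing buf, which cannot fault.
// Shifting the match mask up by the distance from buf to the chunk's end, keeping only the
// top 16 bits, and shifting back down clears the bits for bytes that precede buf.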
__m128i sample = *(__m128i*)aligned_buf;
ptrdiff_t buf_pos = aligned_buf + 16 - buf;
int cmp_mask = ((_mm_movemask_epi8(_mm_cmpeq_epi8(ref, sample)) << buf_pos) & 0xFFFF0000) >> buf_pos;
buf_pos -= 16;
goto test_mask;
while ((uintptr_t)buf_pos < (uintptr_t)len)
{
sample = *(__m128i*)(buf + buf_pos);
cmp_mask = _mm_movemask_epi8(_mm_cmpeq_epi8(ref, sample));
test_mask:
if (cmp_mask != 0)
{
buf_pos += __builtin_ctz(cmp_mask);
if ((uintptr_t)buf_pos >= (uintptr_t)len) return -1;
return buf_pos;
}
buf_pos += 16;
}
return -1;
}
__attribute__((regparm(3)))
ptrdiff_t index_word(uint16_t* buf, ptrdiff_t len, uint16_t v)
{
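// Transpiled by hand from the x86-64 assembly version (see the notes above),
// hence the variables named after the registers they stand for.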
__m128i xmm0, xmm1 = _mm_cvtsi32_si128(v), xmm2;
uint32_t ecx = (uint32_t)buf;
uint32_t edx = len;
uint32_t r8d = ecx, r10d;
uint32_t eax;
xmm1 = _mm_unpacklo_epi16(xmm1, xmm1);
ecx &= -ptrdiff_t{0x10};
if (edx == 0) goto notfound;
xmm1 = _mm_shuffle_epi32(xmm1, 0);
ecx += 16;
xmm0 = *(__m128i*)(ecx - 16);
ecx -= r8d;
if (r8d & 1) goto unaligned;
eax = _mm_movemask_epi8(_mm_cmpeq_epi16(xmm0, xmm1));
eax <<= (uint8_t)ecx;
eax &= 0xFFFF0000;
eax >>= (uint8_t)ecx;
ecx >>= 1;
goto cont;
loop:
xmm0 = *(__m128i*)(r8d + 2 * ecx);
ecx += 8;
eax = _mm_movemask_epi8(_mm_cmpeq_epi16(xmm0, xmm1));
cont:
if (eax) goto match;
if (edx > ecx) goto loop;
notfound:
return -1;
match:
eax = ((uint32_t)__builtin_ctz(eax) >> 1) - 8 + ecx;
if (edx <= eax) return -1;
return eax;
unaligned:
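// Odd-aligned buffer: no word in it lines up with the 16-bit lanes of the aligned
// chunks, so byte-swap the needle and compare bytewise instead. A genuine match then
// appears as two adjacent set bits in the movemask, straddling an odd/even byte pair.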
xmm2 = xmm1;
xmm1 = _mm_slli_epi16(xmm1, 8);
xmm2 = _mm_srli_epi16(xmm2, 8);
xmm1 = _mm_or_si128(xmm1, xmm2);
eax = _mm_movemask_epi8(_mm_cmpeq_epi8(xmm0, xmm1));
eax <<= (uint8_t)ecx;
eax &= 0xFFFF0000;
eax >>= (uint8_t)ecx;
edx += edx;
r10d = 0;
goto cont_u;
loop_u:
xmm0 = *(__m128i*)(ecx + r8d);
ecx += 16;
eax = _mm_movemask_epi8(_mm_cmpeq_epi8(xmm0, xmm1));
r10d >>= 16;
cont_u:
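// Computes mask & (mask << 1): bit j is set iff bytes j-1 and j both matched, with
// r10d carrying the previous chunk's top mask bit into bit 0. & 0x5555 keeps pairs
// whose second byte sits at an even position, i.e. words starting at odd offsets.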
eax <<= 1;
eax |= r10d;
r10d = eax;
eax >>= 1;
eax &= r10d;
eax &= 0x5555;
if (eax) goto match_u;
if (edx > ecx) goto loop_u;
return -1;
match_u:
eax = (uint32_t)__builtin_ctz(eax) - 16 + ecx;
if (edx <= eax) return -1;
return eax >> 1;
}
__attribute__((regparm(3)))
ptrdiff_t index_dword(uint32_t* buf, ptrdiff_t len, uint32_t v)
{
uint32_t *bufp = buf, *bufe = buf + len, *bufepart;
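// Same guard against unrepresentable 'len' as in compare_word/compare_dword.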
if (len < 0 || len > numeric_limits<ptrdiff_t>::max() / 4)
{
bufe = buf;
goto dwordwise_body;
}
bufepart = bufp + (len & -ptrdiff_t{4});
if (bufepart != bufp)
{
__m128i ref = _mm_set1_epi32(v);
do
{
__m128i sample = _mm_loadu_si128((__m128i*)bufp);
int cmp_mask = _mm_movemask_epi8(_mm_cmpeq_epi32(sample, ref));
if (cmp_mask != 0)
return ((uintptr_t)bufp - (uintptr_t)buf + __builtin_ctz(cmp_mask)) / 4;
bufp += 4;
} while (bufp != bufepart);
}
while (bufp != bufe)
{ dwordwise_body:
if (*bufp == v) return ((uintptr_t)bufp - (uintptr_t)buf) / 4;
bufp++;
}
return -1;
}
```