Skip to content

SSE2 Index* and Compare* for i386.

Rika requested to merge runewalsh/source:ic86 into main

Use function pointers that at first point at dispatching functions that analyze CPU features and change the pointer to a more appropriate version for the rest of the eternity. Further tweak by inlining original functions under {$if defined(cpui386)} (so that if other platforms decide to do the same, they could extend the condition). This is a trick I’ve been using for my batch processing or mathematical functions as of late, to great results like 10–15× faster matrix inverse. :’D

I’d want an AVX2 IndexQWord for x64 (I like searching pointers a lot... and maybe certain other functions, but the special thing about IndexQWord is that it cannot be substantially improved without AVX2 that gives the same result for QWords as !376 (merged) for DWords), but x64 System doesn’t detect AVX2 for you like i386 detects SSE (would I be able to do uses cpu; with #40114?).

Also, I wasn’t in the mood to reverse IndexWord algorithm (like I did with IndexByte; or, rather, reverse its unaligned part) and just transpiled (by hand) the x86-64 version, maybe I’ll redo it later more consciously. Even current version, stupid and scary, is many times faster though (the gap would be smaller with !375 (merged)...).

Benchmark: IndexCompareBenchmarkI386.pas.

My results
                             plain            SSE2
IndexByte(#0 .. #1):         2.5 ns/call      2.5 ns/call
IndexByte(#0 .. #14):        3.9 ns/call      2.5 ns/call
IndexByte(#0 .. #15):        3.6 ns/call      2.5 ns/call
IndexByte(#0 .. #16):        3.6 ns/call      2.6 ns/call
IndexByte(#0 .. #49):        5.7 ns/call      3.2 ns/call
IndexByte(#0 .. #99):        9.0 ns/call      4.1 ns/call
IndexByte(#373 .. #626):     78 ns/call       25 ns/call

IndexWord(#0 .. #1):         16 ns/call       3.2 ns/call
IndexWord(#0 .. #14):        19 ns/call       3.4 ns/call
IndexWord(#0 .. #15):        19 ns/call       3.4 ns/call
IndexWord(#0 .. #16):        19 ns/call       3.5 ns/call
IndexWord(#0 .. #49):        27 ns/call       5.7 ns/call
IndexWord(#0 .. #99):        39 ns/call       11 ns/call
IndexWord(#0 .. #999):       257 ns/call      39 ns/call
                                              
IndexDWord(#0 .. #1):        16 ns/call       2.9 ns/call
IndexDWord(#0 .. #14):       18 ns/call       3.0 ns/call
IndexDWord(#0 .. #15):       19 ns/call       2.9 ns/call
IndexDWord(#0 .. #16):       19 ns/call       3.0 ns/call
IndexDWord(#0 .. #49):       27 ns/call       4.6 ns/call
IndexDWord(#0 .. #99):       39 ns/call       8.8 ns/call
IndexDWord(#0 .. #999):      258 ns/call      60 ns/call

CompareByte(#0 / 1):         2.1 ns/call      2.1 ns/call
CompareByte(#6 / 7):         4.7 ns/call      2.5 ns/call
CompareByte(#14 / 15):       6.2 ns/call      2.5 ns/call
CompareByte(#30 / 31):       7.2 ns/call      3.1 ns/call
CompareByte(#1 / 100):       2.3 ns/call      2.4 ns/call
CompareByte(#99 / 100):      11 ns/call       6.6 ns/call
CompareByte(#199 / 200):     18 ns/call       10 ns/call
CompareByte(#999 / 1000):    81 ns/call       38 ns/call
CompareByte(#9999 / 10000):  700 ns/call      397 ns/call
                                              
CompareWord(#0 / 1):         2.0 ns/call      2.8 ns/call
CompareWord(#6 / 7):         3.1 ns/call      2.8 ns/call
CompareWord(#14 / 15):       4.8 ns/call      3.5 ns/call
CompareWord(#30 / 31):       8.4 ns/call      5.1 ns/call
CompareWord(#1 / 100):       2.3 ns/call      2.6 ns/call
CompareWord(#99 / 100):      6.5 ns/call      4.8 ns/call
CompareWord(#199 / 200):     40 ns/call       22 ns/call
CompareWord(#999 / 1000):    140 ns/call      102 ns/call
CompareWord(#9999 / 10000):  601 ns/call      420 ns/call
                                              
CompareDWord(#0 / 1):        2.0 ns/call      2.0 ns/call
CompareDWord(#6 / 7):        5.2 ns/call      3.9 ns/call
CompareDWord(#14 / 15):      6.8 ns/call      3.9 ns/call
CompareDWord(#30 / 31):      14 ns/call       6.5 ns/call
CompareDWord(#1 / 100):      2.3 ns/call      2.5 ns/call
CompareDWord(#99 / 100):     5.9 ns/call      3.3 ns/call
CompareDWord(#199 / 200):    63 ns/call       23 ns/call
CompareDWord(#999 / 1000):   494 ns/call      196 ns/call
CompareDWord(#9999 / 10000): 1867 ns/call     719 ns/call
C++ sources 😱🩸🚫

(don’t laugh about IndexWord, I needed a proof of concept to see if the entire thing is worth it 😭)

#include <immintrin.h>
#include <cstdint>
#include <limits>
using namespace std;

const uintptr_t XMM_MASK = 16 - 1;
const uintptr_t PAGE_SIZE = 4096;

__attribute__((regparm(3)))
ptrdiff_t compare_byte(uint8_t* a, uint8_t* b, ptrdiff_t n)
{
	uint8_t *ae = a + n, *aepart;
	if (n >= 4)
	{
		aepart = a + (n & ~XMM_MASK);
		while (a != aepart)
		{
			__m128i sample_a = _mm_loadu_si128((__m128i*)a);
			__m128i sample_b = _mm_loadu_si128((__m128i*)b);
			int cmp_mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8(sample_a, sample_b)) & 65535;
			if (cmp_mask != 0)
			{
				size_t idiff = __builtin_ctz(cmp_mask);
				return (ptrdiff_t)a[idiff] - (ptrdiff_t)b[idiff];
			}
			a += 1 + XMM_MASK, b += 1 + XMM_MASK;
		}
		if (a == ae) return 0;

		// If both tails can be over-read to XMMs, compare them as XMMs.
		// (a ^ (a + b)) >= PageSize iff 'a + b' resides on different page than 'a'.
		if
		(
			((uintptr_t)a ^ ((uintptr_t)a + XMM_MASK)) < PAGE_SIZE &&
			((uintptr_t)b ^ ((uintptr_t)b + XMM_MASK)) < PAGE_SIZE
		)
		{
			__m128i sample_a = _mm_loadu_si128((__m128i*)a);
			__m128i sample_b = _mm_loadu_si128((__m128i*)b);
			int cmp_mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8(sample_a, sample_b)) & 65535;
			if (cmp_mask == 0) return 0;
			size_t idiff = __builtin_ctz(cmp_mask);
			if (a + idiff >= ae) return 0;
			return (ptrdiff_t)a[idiff] - (ptrdiff_t)b[idiff];
		}

		// Compare uint32 by uint32, can be replaced with 'goto bytewise_body'.

		// GCC "smartly" sees that ae - a == (n & 15) and "cleverly" calculates 'a + ((ae - a) & -4)' as 'a + (n & 12)'.
		// This makes the entire function use up a third nonvolatile register to preserve 'n' until here.
		// Force it to calculate '(ae - a) & -4' directly.
		asm volatile("" : "+rm" (ae) ::);

		aepart = a + ((ae - a) & ~uintptr_t{sizeof(uint32_t) - 1});
		while (a != aepart)
		{
			if (*(uint32_t*)a != *(uint32_t*)b)
				return 2 * (__builtin_bswap32(*(uint32_t*)a) > __builtin_bswap32(*(volatile uint32_t*)b)) - 1;
			a += sizeof(uint32_t); b += sizeof(uint32_t);
		}
	}
	while (a != ae)
	{ // bytewise_body:
		if (*a != *b) return (ptrdiff_t)*a - (ptrdiff_t)*(volatile uint8_t*)b;
		a++, b++;
	}
	return 0;
}

__attribute__((regparm(3)))
ptrdiff_t compare_word(uint16_t* a, uint16_t* b, ptrdiff_t n)
{
	uint16_t *ae = a + n, *aepart;
	if (n < 0 || n > numeric_limits<ptrdiff_t>::max() / 2)
	{
		ae = a;
		goto wordwise_body;
	}
	if (n >= 4)
	{
		aepart = a + (n & -ptrdiff_t{8});
		while (a != aepart)
		{
			__m128i sample_a = _mm_loadu_si128((__m128i*)a);
			__m128i sample_b = _mm_loadu_si128((__m128i*)b);
			int cmp_mask = ~_mm_movemask_epi8(_mm_cmpeq_epi16(sample_a, sample_b)) & 65535;
			if (cmp_mask != 0)
			{
				size_t idiff = __builtin_ctz(cmp_mask);
				return 2 * (*(uint16_t*)((uint8_t*)a + idiff) > *(uint16_t*)((uint8_t*)b + idiff)) - 1;
			}
			a += 8, b += 8;
		}
		if (a == ae) return 0;

		if
		(
			((uintptr_t)a ^ ((uintptr_t)a + XMM_MASK)) < PAGE_SIZE &&
			((uintptr_t)b ^ ((uintptr_t)b + XMM_MASK)) < PAGE_SIZE
		)
		{
			__m128i sample_a = _mm_loadu_si128((__m128i*)a);
			__m128i sample_b = _mm_loadu_si128((__m128i*)b);
			int cmp_mask = ~_mm_movemask_epi8(_mm_cmpeq_epi16(sample_a, sample_b)) & 65535;
			if (cmp_mask == 0) return 0;
			size_t idiff = __builtin_ctz(cmp_mask);
			if ((uintptr_t)a + idiff >= (uintptr_t)ae) return 0;
			return 2 * (*(uint16_t*)((uint8_t*)a + idiff) > *(uint16_t*)((uint8_t*)b + idiff)) - 1;
		}
		goto wordwise_body;
	}
	while (a != ae)
	{ wordwise_body:
		if (*a != *b) return 2 * (*a > *b) - 1;
		a++, b++;
	}
	return 0;
}

__attribute__((regparm(3)))
ptrdiff_t compare_dword(uint32_t* a, uint32_t* b, ptrdiff_t n)
{
	uint32_t *ae = a + n, *aepart;
	if (n < 0 || n > numeric_limits<ptrdiff_t>::max() / 4)
	{
		ae = a;
		goto dwordwise_body;
	}
	if (n >= 4)
	{
		aepart = a + (n & -ptrdiff_t{4});
		do
		{
			__m128i sample_a = _mm_loadu_si128((__m128i*)a);
			__m128i sample_b = _mm_loadu_si128((__m128i*)b);
			int cmp_mask = ~_mm_movemask_epi8(_mm_cmpeq_epi32(sample_a, sample_b)) & 65535;
			if (cmp_mask != 0)
			{
				size_t idiff = __builtin_ctz(cmp_mask);
				return 2 * (*(uint32_t*)((uint8_t*)a + idiff) > *(uint32_t*)((uint8_t*)b + idiff)) - 1;
			}
			a += 4, b += 4;
		} while (a != aepart);
	}
	while (a != ae)
	{ dwordwise_body:
		if (*a != *b) return 2 * (*a > *b) - 1;
		a++, b++;
	}
	return 0;
}

__attribute__((regparm(3)))
ptrdiff_t index_byte(uint8_t* buf, ptrdiff_t len, uint8_t v)
{
	uint8_t *aligned_buf = (uint8_t*)((uintptr_t)buf & ~XMM_MASK);
	if (len == 0) return -1;
	__m128i ref = _mm_set1_epi8(v);
	__m128i sample = *(__m128i*)aligned_buf;
	ptrdiff_t buf_pos = aligned_buf + 16 - buf;

	int cmp_mask = ((_mm_movemask_epi8(_mm_cmpeq_epi8(ref, sample)) << buf_pos) & 0xFFFF0000) >> buf_pos;
	buf_pos -= 16;
	goto test_mask;

	while ((uintptr_t)buf_pos < (uintptr_t)len)
	{
		sample = *(__m128i*)(buf + buf_pos);
		cmp_mask = _mm_movemask_epi8(_mm_cmpeq_epi8(ref, sample));
	test_mask:
		if (cmp_mask != 0)
		{
			buf_pos += __builtin_ctz(cmp_mask);
			if ((uintptr_t)buf_pos >= (uintptr_t)len) return -1;
			return buf_pos;
		}
		buf_pos += 16;
	}

	return -1;
}

__attribute__((regparm(3)))
ptrdiff_t index_word(uint16_t* buf, ptrdiff_t len, uint16_t v)
{
	__m128i xmm0, xmm1 = _mm_cvtsi32_si128(v), xmm2;
	uint32_t ecx = (uint32_t)buf;
	uint32_t edx = len;
	uint32_t r8d = ecx, r10d;
	uint32_t eax;
	xmm1 = _mm_unpacklo_epi16(xmm1, xmm1);
	ecx &= -ptrdiff_t{0x10};
	if (edx == 0) goto notfound;
	xmm1 = _mm_shuffle_epi32(xmm1, 0);
	ecx += 16;
	xmm0 = *(__m128i*)(ecx - 16);
	ecx -= r8d;

	if (r8d & 1) goto unaligned;
	eax = _mm_movemask_epi8(_mm_cmpeq_epi16(xmm0, xmm1));
	eax <<= (uint8_t)ecx;
	eax &= 0xFFFF0000;
	eax >>= (uint8_t)ecx;
	ecx >>= 1;
	goto cont;

loop:
	xmm0 = *(__m128i*)(r8d + 2 * ecx);
	ecx += 8;
	eax = _mm_movemask_epi8(_mm_cmpeq_epi16(xmm0, xmm1));
cont:
	if (eax) goto match;
	if (edx > ecx) goto loop;

notfound:
	return -1;

match:
	eax = ((uint32_t)__builtin_ctz(eax) >> 1) - 8 + ecx;
	if (edx <= eax) return -1;
	return eax;

unaligned:
	xmm2 = xmm1;
	xmm1 = _mm_slli_epi16(xmm1, 8);
	xmm2 = _mm_srli_epi16(xmm2, 8);
	xmm1 = _mm_or_si128(xmm1, xmm2);
	eax = _mm_movemask_epi8(_mm_cmpeq_epi8(xmm0, xmm1));
	eax <<= (uint8_t)ecx;
	eax &= 0xFFFF0000;
	eax >>= (uint8_t)ecx;
	
	edx += edx;
	r10d = 0;
	goto cont_u;

loop_u:
	xmm0 = *(__m128i*)(ecx + r8d);
	ecx += 16;
	eax = _mm_movemask_epi8(_mm_cmpeq_epi8(xmm0, xmm1));
	r10d >>= 16;
cont_u:
	eax <<= 1;
	eax |= r10d;
	r10d = eax;
	eax >>= 1;
	eax &= r10d;
	eax &= 0x5555;
	if (eax) goto match_u;
	if (edx > ecx) goto loop_u;
	return -1;

match_u:
	eax = (uint32_t)__builtin_ctz(eax) - 16 + ecx;
	if (edx <= eax) return -1;
	return eax >> 1;
}

__attribute__((regparm(3)))
ptrdiff_t index_dword(uint32_t* buf, ptrdiff_t len, uint32_t v)
{
	uint32_t *bufp = buf, *bufe = buf + len, *bufepart;
	if (len < 0 || len > numeric_limits<ptrdiff_t>::max() / 4)
	{
		bufe = buf;
		goto dwordwise_body;
	}
	bufepart = bufp + (len & -ptrdiff_t{4});
	if (bufepart != bufp)
	{
		__m128i ref = _mm_set1_epi32(v);
		bufepart = buf + (len & -ptrdiff_t{4});
		do
		{
			__m128i sample = _mm_loadu_si128((__m128i*)bufp);
			int cmp_mask = _mm_movemask_epi8(_mm_cmpeq_epi32(sample, ref));
			if (cmp_mask != 0)
				return ((uintptr_t)bufp - (uintptr_t)buf + __builtin_ctz(cmp_mask)) / 4;
			bufp += 4;
		} while (bufp != bufepart);
	}
	while (bufp != bufe)
	{ dwordwise_body:
		if (*bufp == v) return ((uintptr_t)bufp - (uintptr_t)buf) / 4;
		bufp++;
	}
	return -1;
}
Edited by Rika

Merge request reports