Possibly incorrect emulation of ARM SME svld1_hor_vnum/svst1_hor_vnum
Here is a standalone reproducer that shows the issue:
// Reproducer: copies a buffer of 4 * svl ints into ZA tile 0 with the
// svld1_hor*_za32 intrinsics, copies it back out with the svst1_hor*_za32
// intrinsics, and checks that the data read back equals the data written.
//
// Three alternative call sequences are provided for both the load phase and
// the store phase, selected at preprocessing time:
//   #if 0   - slice fixed at 0, only vnum varies (what the reporter says
//             Apple M4 and the Arm spec expect);
//   #elif 1 - slice and vnum both vary (what the reporter says QEMU expects);
//             this is the branch that is currently compiled;
//   #else   - the non-vnum intrinsics with explicit slice numbers and explicit
//             pointer offsets, given as the expected equivalent.
//
// Prints "PASS!" when the two buffers match; otherwise prints every index
// followed by the input value and the output value.
//
// NOTE(review): the ACLE section linked below may describe vnum as being added
// to the slice index as well as to the address — confirm against the spec and
// the compiler version in use before attributing the difference to the
// emulator.
__arm_new("za") __arm_locally_streaming void test() {
// Row length used throughout; presumably the number of 32-bit elements in one
// streaming vector (and hence in one ZA32 horizontal slice) — see ACLE svcntsw.
const size_t svl = svcntsw();
// Predicate passed to every load/store below; presumably all 32-bit lanes
// active, so whole rows are transferred.
svbool_t p = svptrue_b32();
// Input buffer: four rows of svl ints, filled with 0, 1, 2, ... so that each
// element's value identifies its position.
std::vector<int> c(svl * 4);
for (size_t i = 0; i < svl * 4; ++i) {
c[i] = i;
}
// ---- Load phase: memory -> ZA tile 0 ----
#if 0
// This is how Apple M4 and the ARM spec think it should work:
// (https://arm-software.github.io/acle/main/acle.html#ld1b-ld1h-ld1w-ld1d-ld1q)
// Slice argument stays 0; only vnum steps through 0..3.
svld1_hor_vnum_za32(/*tile=*/0, /*slice=*/0, p, &c[0], 0);
svld1_hor_vnum_za32(/*tile=*/0, /*slice=*/0, p, &c[0], 1);
svld1_hor_vnum_za32(/*tile=*/0, /*slice=*/0, p, &c[0], 2);
svld1_hor_vnum_za32(/*tile=*/0, /*slice=*/0, p, &c[0], 3);
#elif 1
// This is how qemu thinks it should work:
// Slice argument and vnum are both stepped through 0..3. (Active branch.)
svld1_hor_vnum_za32(/*tile=*/0, /*slice=*/0, p, &c[0], 0);
svld1_hor_vnum_za32(/*tile=*/0, /*slice=*/1, p, &c[0], 1);
svld1_hor_vnum_za32(/*tile=*/0, /*slice=*/2, p, &c[0], 2);
svld1_hor_vnum_za32(/*tile=*/0, /*slice=*/3, p, &c[0], 3);
#else
// Expected equivalent:
// Row k of the buffer (starting at &c[k * svl]) is loaded into slice k.
svld1_hor_za32(/*tile=*/0, /*slice=*/0, p, &c[0 * svl]);
svld1_hor_za32(/*tile=*/0, /*slice=*/1, p, &c[1 * svl]);
svld1_hor_za32(/*tile=*/0, /*slice=*/2, p, &c[2 * svl]);
svld1_hor_za32(/*tile=*/0, /*slice=*/3, p, &c[3 * svl]);
#endif
// ---- Store phase: ZA tile 0 -> memory ----
// Output buffer of the same size as the input; value-initialized by the
// vector constructor, so elements that are never stored to stay 0.
std::vector<int> c_out(svl * 4);
#if 0
// This is how Apple M4 and the ARM spec think it should work:
// Slice argument stays 0; only vnum steps through 0..3.
svst1_hor_vnum_za32(/*tile=*/0, /*slice=*/0, p, &c_out[0], 0);
svst1_hor_vnum_za32(/*tile=*/0, /*slice=*/0, p, &c_out[0], 1);
svst1_hor_vnum_za32(/*tile=*/0, /*slice=*/0, p, &c_out[0], 2);
svst1_hor_vnum_za32(/*tile=*/0, /*slice=*/0, p, &c_out[0], 3);
#elif 1
// This is how qemu thinks it should work:
// Slice argument and vnum are both stepped through 0..3. (Active branch.)
svst1_hor_vnum_za32(/*tile=*/0, /*slice=*/0, p, &c_out[0], 0);
svst1_hor_vnum_za32(/*tile=*/0, /*slice=*/1, p, &c_out[0], 1);
svst1_hor_vnum_za32(/*tile=*/0, /*slice=*/2, p, &c_out[0], 2);
svst1_hor_vnum_za32(/*tile=*/0, /*slice=*/3, p, &c_out[0], 3);
#else
// Expected equivalent:
// Slice k is stored to row k of the output buffer (starting at &c_out[k * svl]).
svst1_hor_za32(/*tile=*/0, /*slice=*/0, p, &c_out[0 * svl]);
svst1_hor_za32(/*tile=*/0, /*slice=*/1, p, &c_out[1 * svl]);
svst1_hor_za32(/*tile=*/0, /*slice=*/2, p, &c_out[2 * svl]);
svst1_hor_za32(/*tile=*/0, /*slice=*/3, p, &c_out[3 * svl]);
#endif
// ---- Verification ----
// A matching load/store pair must reproduce the input exactly.
if (c == c_out) {
std::cout << "PASS!" << std::endl;
} else {
// On mismatch, dump "index: input output" for every element so the rows that
// went wrong can be identified.
for (size_t i = 0; i < svl * 4; ++i) {
std::cout << i << ": " << c[i] << " " << c_out[i] << std::endl;
}
}
}
Other instructions may have the same issue (the vertical variants, and the variants for other element types).