Possibly incorrect emulation of ARM SME svld1_hor_vnum/svst1_hor_vnum

Here is a standalone reproducer that shows the issue:


// Round-trip reproducer for the svld1_hor_vnum / svst1_hor_vnum intrinsics.
//
// Fills an input buffer with 0, 1, 2, ..., loads it into ZA tile 0 using four
// horizontal-slice loads, stores the tile back into a second buffer using four
// horizontal-slice stores, and then compares the two buffers. Prints "PASS!"
// when they match; otherwise prints every index with both values.
//
// Each load/store section has three preprocessor-selected variants:
//   1. the calling convention the report attributes to Apple M4 and the ARM
//      spec (slice argument fixed at 0, only vnum advances),
//   2. the calling convention the report says QEMU needs (slice argument
//      advanced in step with vnum) -- this is the variant currently enabled,
//   3. the non-vnum form that both of the above are expected to be
//      equivalent to (explicit slice index and explicit pointer offset).
__arm_new("za") __arm_locally_streaming void test() {
  // svl is used below as the per-slice stride, counted in int elements.
  const size_t svl = svcntsw();
  const size_t total = svl * 4;
  svbool_t all_lanes = svptrue_b32();

  // Input pattern: element i holds the value i.
  std::vector<int> input(total);
  for (size_t idx = 0; idx < total; ++idx) {
    input[idx] = idx;
  }
  int* const src = input.data();

#if 0
  // Convention followed by Apple M4 and the ARM spec: the slice argument stays
  // at 0 while vnum walks over the four vectors.
  // (https://arm-software.github.io/acle/main/acle.html#ld1b-ld1h-ld1w-ld1d-ld1q)
  svld1_hor_vnum_za32(/*tile=*/0, /*slice=*/0, all_lanes, src, 0);
  svld1_hor_vnum_za32(/*tile=*/0, /*slice=*/0, all_lanes, src, 1);
  svld1_hor_vnum_za32(/*tile=*/0, /*slice=*/0, all_lanes, src, 2);
  svld1_hor_vnum_za32(/*tile=*/0, /*slice=*/0, all_lanes, src, 3);
#elif 1
  // Convention QEMU expects: the slice argument has to be advanced by hand
  // alongside vnum.
  svld1_hor_vnum_za32(/*tile=*/0, /*slice=*/0, all_lanes, src, 0);
  svld1_hor_vnum_za32(/*tile=*/0, /*slice=*/1, all_lanes, src, 1);
  svld1_hor_vnum_za32(/*tile=*/0, /*slice=*/2, all_lanes, src, 2);
  svld1_hor_vnum_za32(/*tile=*/0, /*slice=*/3, all_lanes, src, 3);
#else
  // Reference form without vnum: explicit slice index, explicit offset.
  svld1_hor_za32(/*tile=*/0, /*slice=*/0, all_lanes, src + 0 * svl);
  svld1_hor_za32(/*tile=*/0, /*slice=*/1, all_lanes, src + 1 * svl);
  svld1_hor_za32(/*tile=*/0, /*slice=*/2, all_lanes, src + 2 * svl);
  svld1_hor_za32(/*tile=*/0, /*slice=*/3, all_lanes, src + 3 * svl);
#endif

  // Destination buffer for the store half of the round trip (zero-filled).
  std::vector<int> output(total);
  int* const dst = output.data();

#if 0
  // Convention followed by Apple M4 and the ARM spec: the slice argument stays
  // at 0 while vnum walks over the four vectors.
  svst1_hor_vnum_za32(/*tile=*/0, /*slice=*/0, all_lanes, dst, 0);
  svst1_hor_vnum_za32(/*tile=*/0, /*slice=*/0, all_lanes, dst, 1);
  svst1_hor_vnum_za32(/*tile=*/0, /*slice=*/0, all_lanes, dst, 2);
  svst1_hor_vnum_za32(/*tile=*/0, /*slice=*/0, all_lanes, dst, 3);
#elif 1
  // Convention QEMU expects: the slice argument has to be advanced by hand
  // alongside vnum.
  svst1_hor_vnum_za32(/*tile=*/0, /*slice=*/0, all_lanes, dst, 0);
  svst1_hor_vnum_za32(/*tile=*/0, /*slice=*/1, all_lanes, dst, 1);
  svst1_hor_vnum_za32(/*tile=*/0, /*slice=*/2, all_lanes, dst, 2);
  svst1_hor_vnum_za32(/*tile=*/0, /*slice=*/3, all_lanes, dst, 3);
#else
  // Reference form without vnum: explicit slice index, explicit offset.
  svst1_hor_za32(/*tile=*/0, /*slice=*/0, all_lanes, dst + 0 * svl);
  svst1_hor_za32(/*tile=*/0, /*slice=*/1, all_lanes, dst + 1 * svl);
  svst1_hor_za32(/*tile=*/0, /*slice=*/2, all_lanes, dst + 2 * svl);
  svst1_hor_za32(/*tile=*/0, /*slice=*/3, all_lanes, dst + 3 * svl);
#endif

  // A lossless round trip means the selected convention is self-consistent.
  if (input == output) {
    std::cout << "PASS!" << std::endl;
    return;
  }

  // Mismatch: dump "index: expected actual" for every element.
  for (size_t idx = 0; idx < total; ++idx) {
    std::cout << idx << ": " << input[idx] << " " << output[idx] << std::endl;
  }
}

Other instructions may have the same issue (for example, the vertical variants and the variants for other element types).