Compare revisions

Changes are shown as if the source revision was being merged into the target revision.
Commits on Source (15)
  • Change unit test inputs for loopfilter unit tests · 9c49a56b
    Salome Thirot authored, Hassene Tmar committed
    The high bitdepth loopfilter functions are specialized on certain
    filtering scenarios where the value of the masks indicates there is no
    need to compute all filter sizes. This optimization will also be added
    to the standard bit-depth implementation in future commits.
    
    The unit test inputs cover only a very limited set of these
    scenarios, so use the input-generation function from the libaom
    loopfilter unit tests to produce inputs that exercise all filtering cases.
    9c49a56b
  • Optimize Neon implementation of svt_aom_lpf_6 · d2967313
    Salome Thirot authored, Hassene Tmar committed
    The Neon implementations of svt_aom_lpf_vertical_6 and
    svt_aom_lpf_horizontal_6 compute both filter4() and filter6() before
    selecting, for each element, which filter is actually needed. In
    practice, however, many cases need only one of the filters, so
    specialize for these scenarios, computing only the filters that are
    needed and eliminating the bitwise selects (a sketch of the dispatch
    pattern follows the commit list). This makes the case where all
    filters are needed slightly slower, but as it is far from the most
    common case this is an acceptable trade-off.
    
    Also move the actual filter computation to separate functions to avoid
    code duplication.
    d2967313
  • Optimize Neon implementation of svt_aom_lpf_8 · d8ebdeec
    Salome Thirot authored, Hassene Tmar committed
    The Neon implementations of svt_aom_lpf_vertical_8 and
    svt_aom_lpf_horizontal_8 compute both filter4() and filter8() before
    selecting, for each element, which filter is actually needed. In
    practice, however, many cases need only one of the filters, so
    specialize for these scenarios, computing only the filters that are
    needed and eliminating the bitwise selects. This makes the case where
    all filters are needed slightly slower, but as it is far from the most
    common case this is an acceptable trade-off.
    
    Also move the actual filter computation to separate functions to avoid
    code duplication.
    d8ebdeec
  • Optimize Neon implementation of svt_aom_lpf_14 · 4d6f560d
    Salome Thirot authored, Hassene Tmar committed
    The Neon implementations of svt_aom_lpf_vertical_14 and
    svt_aom_lpf_horizontal_14 compute every filter (filter4(), filter8()
    and filter14()) before selecting, for each element, which filter is
    actually needed. In practice, however, many cases need only one of
    the filters, so specialize for these scenarios, computing only the
    filters that are needed and eliminating the bitwise selects. This
    makes the case where all filters are needed slightly slower, but as
    it is far from the most common case this is an acceptable trade-off.
    
    Also move the actual filter computation to separate functions to avoid
    code duplication.
    4d6f560d
  • Cleanup Neon implementation of svt_aom_lpf4 · 487fa384
    Salome Thirot authored, Hassene Tmar committed
    Use the existing helper to compute filter4() and return if filtering
    isn't needed.
    487fa384
  • Fix condition in svt_aom_highbd_lpf_14_neon · d3ed59ba
    Salome Thirot authored, Hassene Tmar committed
    Fix the condition for the filter8_only case for
    svt_aom_highbd_lpf_horizontal_14_neon and
    svt_aom_highbd_lpf_vertical_14_neon.
    
    Also take the opportunity to make other if conditions slightly nicer,
    avoiding the use of horizontal adds completely.
    d3ed59ba
  • Do not test svt_cfl_predict functions on invalid block sizes · efff64b1
    Salome Thirot authored, Hassene Tmar committed
    CFL prediction can only happen on blocks where max(width, height) <= 32.
    Additionally, svt_cfl_predict_lbd and svt_cfl_predict_hbd can only take
    values of width coming from the tx_size_wide[] array, which doesn't
    include width = 2, which the SIMD implementations don't cater for
    anyway. Skip all the invalid block sizes in the unit tests.
    efff64b1
  • Remove interleaving load/stores in cfl_predict_lbd_neon · d0de1326
    Salome Thirot authored, Hassene Tmar committed
    Interleaving is unnecessary here, and interleaving loads/stores can be
    very slow. The vld1/vst1 _x2/_x3/_x4 intrinsics are now supported by
    all modern compilers (an illustration of the layout difference follows
    the svt_aom_cfl_predict_lbd_neon hunk below).
    d0de1326
  • Add Neon implementation of svt_cfl_predict_hbd · 8eb8e08a
    Salome Thirot authored, Hassene Tmar committed
    Port from libaom the Neon implementation of svt_cfl_predict_hbd and add
    the corresponding unit tests (a scalar model of the prediction follows
    the new function in the diff).
    8eb8e08a
  • Add Neon implementation of svt_av1_upsample_intra_edge · de64ab43
    Salome Thirot authored, Hassene Tmar committed
    Port from libaom the Neon implementation of
    svt_av1_upsample_intra_edge and add the corresponding unit tests.
    de64ab43
  • Refactor SelfGuidedUtilTest.cc · 21fa2634
    Athulya Raj Raji Mohini authored, Hassene Tmar committed
    Refactor SelfGuidedUtilTest.cc by having one class per tested function
    and one test instantiation per architecture extension.
    Add test cases for different values of width ahead of the Neon
    implementation added in a subsequent patch (which only supports
    width % 8 == 0).
    21fa2634
  • Add Neon implementation of svt_get_proj_subspace · 0129b913
    Athulya Raj Raji Mohini authored, Hassene Tmar committed
    Port the libaom Neon implementation of svt_get_proj_subspace and
    add the unit tests.
    0129b913
  • Do not test cfl_luma_subsampling on invalid block sizes · c543ad34
    Salome Thirot authored, Christopher Degawa committed
    CFL prediction can only happen on blocks where max(width, height) <= 32,
    so skip all the invalid block sizes.
    c543ad34
  • Add Neon implementation of svt_cfl_luma_subsampling_420_lbd · 5a2951fa
    Salome Thirot authored, Christopher Degawa committed
    Add a Neon implementation of svt_cfl_luma_subsampling_420_lbd, and add
    the unit tests.
    5a2951fa
  • Add Neon implementation of svt_cfl_luma_subsampling_420_hbd · 0598382a
    Salome Thirot authored, Christopher Degawa committed
    Add a Neon implementation of svt_cfl_luma_subsampling_420_hbd and add
    the unit tests.
    0598382a
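The specialized dispatch in the svt_aom_lpf_* commits rests on one trick, also used by the d3ed59ba fix: reinterpret a 16-bit comparison mask as a single 64-bit lane, so "all lanes set" compares equal to -1 and "no lane set" compares equal to 0. A minimal sketch of the pattern, assuming illustrative filter6()/filter4() helpers (not the exact SVT-AV1 code):

#include <arm_neon.h>

/* Sketch only: dispatch on a pair of 4-lane uint16x4_t masks. */
static void lpf6_dispatch_sketch(uint16x4_t needs_filter_mask, uint16x4_t is_flat3_mask) {
    if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) {
        return; /* No lane needs filtering: nothing to compute or store. */
    }
    if (vget_lane_s64(vreinterpret_s64_u16(is_flat3_mask), 0) == -1 &&
        vget_lane_s64(vreinterpret_s64_u16(needs_filter_mask), 0) == -1) {
        /* Every lane takes the large filter: compute filter6() only and
           skip filter4() and the bitwise selects entirely. */
    } else {
        /* Mixed lanes: compute both filters and blend per element (vbsl). */
    }
}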
Showing changes with 1446 additions and 665 deletions
......@@ -58,6 +58,7 @@ target_sources(
PUBLIC pack_unpack_intrin_neon.c
PUBLIC pickrst_neon.c
PUBLIC picture_operators_intrinsic_neon.c
PUBLIC restoration_pick_neon.c
PUBLIC sad_neon.c
PUBLIC selfguided_neon.c
PUBLIC sse_neon.c
......
......@@ -9,7 +9,9 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include "definitions.h"
#include "mem_neon.h"
/* Store half of a vector. */
static INLINE void vsth_u8(uint8_t *ptr, uint8x8_t val) { vst1_lane_u32((uint32_t *)ptr, vreinterpret_u32_u8(val), 0); }
......@@ -58,10 +60,7 @@ static INLINE int16x8_t predict_w8(const int16_t *pred_buf_q3, int16x8_t alpha_s
static INLINE int16x8x2_t predict_w16(const int16_t *pred_buf_q3, int16x8_t alpha_sign, int abs_alpha_q12,
int16x8_t dc) {
/* vld2q_s16 interleaves, which is not useful for prediction. vld1q_s16_x2
does not interleave, but is not currently available in the compiler used
by the AOM build system. */
const int16x8x2_t ac_q3 = vld2q_s16(pred_buf_q3);
const int16x8x2_t ac_q3 = vld1q_s16_x2(pred_buf_q3);
const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]);
const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]);
const int16x8_t scaled_luma_0 = vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[0]), abs_alpha_q12);
......@@ -74,10 +73,7 @@ static INLINE int16x8x2_t predict_w16(const int16_t *pred_buf_q3, int16x8_t alph
static INLINE int16x8x4_t predict_w32(const int16_t *pred_buf_q3, int16x8_t alpha_sign, int abs_alpha_q12,
int16x8_t dc) {
/* vld4q_s16 interleaves, which is not useful for prediction. vld1q_s16_x4
does not interleave, but is not currently available in the compiler used
by the AOM build system. */
const int16x8x4_t ac_q3 = vld4q_s16(pred_buf_q3);
const int16x8x4_t ac_q3 = vld1q_s16_x4(pred_buf_q3);
const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]);
const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]);
const int16x8_t ac_sign_2 = veorq_s16(alpha_sign, ac_q3.val[2]);
......@@ -118,16 +114,219 @@ void svt_aom_cfl_predict_lbd_neon(const int16_t *pred_buf_q3, uint8_t *pred, int
} else if (width == 16) {
const int16x8x2_t pred_vector = predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
const uint8x8x2_t predun = {{vqmovun_s16(pred_vector.val[0]), vqmovun_s16(pred_vector.val[1])}};
vst2_u8(dst, predun);
vst1_u8_x2(dst, predun);
} else {
const int16x8x4_t pred_vector = predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
const uint8x8x4_t predun = {{vqmovun_s16(pred_vector.val[0]),
vqmovun_s16(pred_vector.val[1]),
vqmovun_s16(pred_vector.val[2]),
vqmovun_s16(pred_vector.val[3])}};
vst4_u8(dst, predun);
vst1_u8_x4(dst, predun);
}
dst += dst_stride;
} while ((pred_buf_q3 += CFL_BUF_LINE) < end);
}
}
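For context on the vld2/vst2 to vld1_x2/vst1_x2 changes above: the structured load vld2q_s16 de-interleaves even and odd elements into two vectors, while vld1q_s16_x2 performs two plain contiguous loads. A hypothetical illustration (buf and the variable names are not from the diff):

/* Given int16_t buf[16] = {0, 1, 2, ..., 15}:
 *   vld2q_s16(buf)    -> val[0] = {0,2,4,...,14}, val[1] = {1,3,...,15}
 *   vld1q_s16_x2(buf) -> val[0] = {0,1,...,7},    val[1] = {8,9,...,15}
 * CFL consumes elements in memory order, so the de-interleave is wasted
 * work, and structured loads/stores are slow on many cores. */
int16x8x2_t deint = vld2q_s16(buf);    /* structured, de-interleaving load */
int16x8x2_t lin   = vld1q_s16_x2(buf); /* two contiguous vector loads */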
static INLINE uint16x4_t clamp_s16(int16x4_t a, int16x4_t max) {
return vreinterpret_u16_s16(vmax_s16(vmin_s16(a, max), vdup_n_s16(0)));
}
static INLINE uint16x8_t clampq_s16(int16x8_t a, int16x8_t max) {
return vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(a, max), vdupq_n_s16(0)));
}
static INLINE uint16x8x2_t clamp2q_s16(int16x8x2_t a, int16x8_t max) {
uint16x8x2_t result;
result.val[0] = vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(a.val[0], max), vdupq_n_s16(0)));
result.val[1] = vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(a.val[1], max), vdupq_n_s16(0)));
return result;
}
static INLINE uint16x8x4_t clamp4q_s16(int16x8x4_t a, int16x8_t max) {
uint16x8x4_t result;
result.val[0] = vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(a.val[0], max), vdupq_n_s16(0)));
result.val[1] = vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(a.val[1], max), vdupq_n_s16(0)));
result.val[2] = vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(a.val[2], max), vdupq_n_s16(0)));
result.val[3] = vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(a.val[3], max), vdupq_n_s16(0)));
return result;
}
void svt_cfl_predict_hbd_neon(const int16_t *pred_buf_q3, uint16_t *pred, int pred_stride, uint16_t *dst,
int dst_stride, int alpha_q3, int bd, int width, int height) {
(void)pred_stride;
(void)bd; // bd is assumed to be 10.
const int max = (1 << 10) - 1;
const int16_t abs_alpha_q12 = abs(alpha_q3) << 9;
const int16_t *const end = pred_buf_q3 + height * CFL_BUF_LINE;
if (width == 4) {
const int16x4_t alpha_sign = vdup_n_s16(alpha_q3);
const int16x4_t dc = vdup_n_s16(*pred);
const int16x4_t max_16x4 = vdup_n_s16(max);
do {
const int16x4_t scaled_luma = predict_w4(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
vst1_u16(dst, clamp_s16(scaled_luma, max_16x4));
dst += dst_stride;
pred_buf_q3 += CFL_BUF_LINE;
} while (pred_buf_q3 < end);
} else {
const int16x8_t alpha_sign = vdupq_n_s16(alpha_q3);
const int16x8_t dc = vdupq_n_s16(*pred);
const int16x8_t max_16x8 = vdupq_n_s16(max);
do {
if (width == 8) {
const int16x8_t pred_v = predict_w8(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
vst1q_u16(dst, clampq_s16(pred_v, max_16x8));
} else if (width == 16) {
const int16x8x2_t pred_v = predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
vst1q_u16_x2(dst, clamp2q_s16(pred_v, max_16x8));
} else {
const int16x8x4_t pred_v = predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
vst1q_u16_x4(dst, clamp4q_s16(pred_v, max_16x8));
}
dst += dst_stride;
pred_buf_q3 += CFL_BUF_LINE;
} while (pred_buf_q3 < end);
}
}
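A scalar model of what both predict paths compute, hedged (the rounding of negative products differs in detail from the Neon formulation, which applies vqrdmulhq_n_s16 to absolute values with abs_alpha_q12 = |alpha_q3| << 9 and restores the sign afterwards):

/* Sketch: per-sample CfL prediction, bd assumed to be 10 as in the diff. */
static uint16_t cfl_predict_scalar_sketch(int16_t ac_q3, int alpha_q3, uint16_t dc) {
    /* alpha (Q3) * AC luma (Q3) -> Q6; round and drop 6 fractional bits. */
    const int scaled_luma = (alpha_q3 * ac_q3 + 32) >> 6;
    int v = dc + scaled_luma;
    if (v < 0) v = 0;
    if (v > 1023) v = 1023; /* clamp to [0, (1 << 10) - 1] */
    return (uint16_t)v;
}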
void svt_cfl_luma_subsampling_420_lbd_neon(const uint8_t *input, int input_stride, int16_t *pred_buf_q3, int width,
int height) {
const int16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
const int luma_stride = input_stride << 1;
if (width == 4) {
do {
const uint8x8_t top = load_unaligned_u8(input, luma_stride);
const uint8x8_t bot = load_unaligned_u8(input + input_stride, luma_stride);
uint16x4_t sum = vpaddl_u8(top);
sum = vpadal_u8(sum, bot);
sum = vadd_u16(sum, sum);
store_s16x2_strided_x2(pred_buf_q3, CFL_BUF_LINE, vreinterpret_s16_u16(sum));
input += 2 * luma_stride;
pred_buf_q3 += 2 * CFL_BUF_LINE;
} while (pred_buf_q3 < end);
} else if (width == 8) {
do {
uint8x16_t top = load_u8_8x2(input, luma_stride);
uint8x16_t bot = load_u8_8x2(input + input_stride, luma_stride);
uint16x8_t sum = vpaddlq_u8(top);
sum = vpadalq_u8(sum, bot);
sum = vaddq_u16(sum, sum);
store_s16x4_strided_x2(pred_buf_q3, CFL_BUF_LINE, vreinterpretq_s16_u16(sum));
input += 2 * luma_stride;
pred_buf_q3 += 2 * CFL_BUF_LINE;
} while (pred_buf_q3 < end);
} else if (width == 16) {
do {
const uint8x16_t top = vld1q_u8(input);
const uint8x16_t bot = vld1q_u8(input + input_stride);
uint16x8_t sum = vpaddlq_u8(top);
sum = vpadalq_u8(sum, bot);
sum = vaddq_u16(sum, sum);
vst1q_s16(pred_buf_q3, vreinterpretq_s16_u16(sum));
input += luma_stride;
pred_buf_q3 += CFL_BUF_LINE;
} while (pred_buf_q3 < end);
} else {
do {
const uint8x16x2_t top = vld1q_u8_x2(input);
const uint8x16x2_t bot = vld1q_u8_x2(input + input_stride);
uint16x8_t sum0 = vpaddlq_u8(top.val[0]);
uint16x8_t sum1 = vpaddlq_u8(top.val[1]);
sum0 = vpadalq_u8(sum0, bot.val[0]);
sum1 = vpadalq_u8(sum1, bot.val[1]);
sum0 = vaddq_u16(sum0, sum0);
sum1 = vaddq_u16(sum1, sum1);
vst1q_s16(pred_buf_q3 + 0, vreinterpretq_s16_u16(sum0));
vst1q_s16(pred_buf_q3 + 8, vreinterpretq_s16_u16(sum1));
input += luma_stride;
pred_buf_q3 += CFL_BUF_LINE;
} while (pred_buf_q3 < end);
}
}
void svt_cfl_luma_subsampling_420_hbd_neon(const uint16_t *input, int input_stride, int16_t *pred_buf_q3, int width,
int height) {
const int16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
const int luma_stride = input_stride << 1;
if (width == 4) {
do {
const uint16x8_t top = load_unaligned_u16_4x2(input, luma_stride);
const uint16x8_t bot = load_unaligned_u16_4x2(input + input_stride, luma_stride);
uint16x8_t sum = vaddq_u16(top, bot);
sum = vpaddq_u16(sum, sum);
sum = vaddq_u16(sum, sum);
store_s16x2_strided_x2(pred_buf_q3, CFL_BUF_LINE, vget_low_s16(vreinterpretq_s16_u16(sum)));
input += 2 * luma_stride;
pred_buf_q3 += 2 * CFL_BUF_LINE;
} while (pred_buf_q3 < end);
} else if (width == 8) {
do {
uint16x8_t top0, top1, bot0, bot1;
load_u16_8x2(input, luma_stride, &top0, &top1);
load_u16_8x2(input + input_stride, luma_stride, &bot0, &bot1);
uint16x8_t sum0 = vaddq_u16(top0, bot0);
uint16x8_t sum1 = vaddq_u16(top1, bot1);
uint16x8_t sum01 = vpaddq_u16(sum0, sum1);
sum01 = vaddq_u16(sum01, sum01);
store_s16x4_strided_x2(pred_buf_q3, CFL_BUF_LINE, vreinterpretq_s16_u16(sum01));
input += 2 * luma_stride;
pred_buf_q3 += 2 * CFL_BUF_LINE;
} while (pred_buf_q3 < end);
} else if (width == 16) {
do {
uint16x8_t top0, top1, bot0, bot1;
load_u16_8x2(input + 0, input_stride, &top0, &bot0);
load_u16_8x2(input + 8, input_stride, &top1, &bot1);
uint16x8_t sum0 = vaddq_u16(top0, bot0);
uint16x8_t sum1 = vaddq_u16(top1, bot1);
uint16x8_t sum01 = vpaddq_u16(sum0, sum1);
sum01 = vaddq_u16(sum01, sum01);
vst1q_s16(pred_buf_q3, vreinterpretq_s16_u16(sum01));
input += luma_stride;
pred_buf_q3 += CFL_BUF_LINE;
} while (pred_buf_q3 < end);
} else if (width == 32) {
do {
uint16x8_t top[4], bot[4];
load_u16_8x4(input, 8, &top[0], &top[1], &top[2], &top[3]);
load_u16_8x4(input + input_stride, 8, &bot[0], &bot[1], &bot[2], &bot[3]);
uint16x8_t sum0 = vaddq_u16(top[0], bot[0]);
uint16x8_t sum1 = vaddq_u16(top[1], bot[1]);
uint16x8_t sum2 = vaddq_u16(top[2], bot[2]);
uint16x8_t sum3 = vaddq_u16(top[3], bot[3]);
uint16x8_t sum01 = vpaddq_u16(sum0, sum1);
uint16x8_t sum23 = vpaddq_u16(sum2, sum3);
sum01 = vaddq_u16(sum01, sum01);
sum23 = vaddq_u16(sum23, sum23);
store_s16_8x2(pred_buf_q3, 8, vreinterpretq_s16_u16(sum01), vreinterpretq_s16_u16(sum23));
input += luma_stride;
pred_buf_q3 += CFL_BUF_LINE;
} while (pred_buf_q3 < end);
}
}
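All four width paths above implement the same reduction: each output is the sum of a 2x2 luma block, doubled so the average lands in Q3 (avg * 8 == sum * 2); vpaddl/vpadal (or vaddq plus vpaddq in the hbd version) perform the pairwise sums. A hedged scalar model using the same names as the functions above:

/* Sketch of svt_cfl_luma_subsampling_420_*_neon, one output per 2x2 block. */
for (int y = 0; y < height / 2; y++) {
    for (int x = 0; x < width / 2; x++) {
        const int sum = input[(2 * y + 0) * input_stride + 2 * x + 0] +
                        input[(2 * y + 0) * input_stride + 2 * x + 1] +
                        input[(2 * y + 1) * input_stride + 2 * x + 0] +
                        input[(2 * y + 1) * input_stride + 2 * x + 1];
        pred_buf_q3[y * CFL_BUF_LINE + x] = (int16_t)(sum << 1);
    }
}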
......@@ -301,7 +301,8 @@ void svt_aom_highbd_lpf_horizontal_6_neon(uint16_t *s, int pitch, const uint8_t
uint16x8_t p0q0_output, p1q1_output;
uint16x8_t f6_p1q1, f6_p0q0;
// Not needing filter4() at all is a very common case, so isolate it to avoid needlessly computing filter4().
if (vaddlv_u16(vand_u16(is_flat3_mask, needs_filter_mask)) == (1 << 18) - 4) {
if (vget_lane_s64(vreinterpret_s64_u16(is_flat3_mask), 0) == -1 &&
vget_lane_s64(vreinterpret_s64_u16(needs_filter_mask), 0) == -1) {
filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
p1q1_output = f6_p1q1;
p0q0_output = f6_p0q0;
......@@ -376,7 +377,7 @@ void svt_aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch, const uint8_t *b
filter6_masks(
p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd, &needs_filter_mask, &is_flat3_mask, &hev_mask);
if (vaddv_u16(needs_filter_mask) == 0) {
if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) {
// None of the values will be filtered.
return;
}
......@@ -389,7 +390,8 @@ void svt_aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch, const uint8_t *b
uint16x8_t f6_p1q1, f6_p0q0;
const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
// Not needing filter4() at all is a very common case, so isolate it to avoid needlessly computing filter4().
if (vaddlv_u16(vand_u16(is_flat3_mask, needs_filter_mask)) == (1 << 18) - 4) {
if (vget_lane_s64(vreinterpret_s64_u16(is_flat3_mask), 0) == -1 &&
vget_lane_s64(vreinterpret_s64_u16(needs_filter_mask), 0) == -1) {
filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
p1q1_output = f6_p1q1;
p0q0_output = f6_p0q0;
......@@ -564,7 +566,8 @@ void svt_aom_highbd_lpf_horizontal_8_neon(uint16_t *s, int pitch, const uint8_t
uint16x8_t p0q0_output, p1q1_output, p2q2_output;
uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
// Not needing filter4() at all is a very common case, so isolate it to avoid needlessly computing filter4().
if (vaddlv_u16(vand_u16(is_flat4_mask, needs_filter_mask)) == (1 << 18) - 4) {
if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_mask), 0) == -1 &&
vget_lane_s64(vreinterpret_s64_u16(needs_filter_mask), 0) == -1) {
filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
p2q2_output = f8_p2q2;
p1q1_output = f8_p1q1;
......@@ -664,7 +667,8 @@ void svt_aom_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch, const uint8_t *b
uint16x8_t p0q0_output, p1q1_output, p2q2_output;
uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
// Not needing filter4() at all is a very common case, so isolate it to avoid needlessly computing filter4().
if (vaddlv_u16(vand_u16(is_flat4_mask, needs_filter_mask)) == (1 << 18) - 4) {
if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_mask), 0) == -1 &&
vget_lane_s64(vreinterpret_s64_u16(needs_filter_mask), 0) == -1) {
filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
p2q2_output = f8_p2q2;
p1q1_output = f8_p1q1;
......@@ -862,7 +866,7 @@ void svt_aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch, const uint8_t
uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output, p5q5_output;
uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
if ((vaddlv_u16(is_flat4_outer_mask) == (1 << 18) - 4)) {
if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_outer_mask), 0) == -1) {
// filter14() applies to all values.
filter14(
p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
......@@ -872,7 +876,8 @@ void svt_aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch, const uint8_t
p2q2_output = f14_p2q2;
p1q1_output = f14_p1q1;
p0q0_output = f14_p0q0;
} else if ((vaddlv_u16(is_flat4_mask) == (1 << 18) - 4)) {
} else if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_mask), 0) == -1 &&
vget_lane_u64(vreinterpret_u64_u16(is_flat4_outer_mask), 0) == 0) {
// filter8() applies to all values.
filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
p5q5_output = p5q5;
......@@ -1036,7 +1041,7 @@ void svt_aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch, const uint8_t *
uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output, p5q5_output;
uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
if ((vaddlv_u16(is_flat4_outer_mask) == (1 << 18) - 4)) {
if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_outer_mask), 0) == -1) {
// filter14() applies to all values.
filter14(
p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
......@@ -1046,7 +1051,8 @@ void svt_aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch, const uint8_t *
p2q2_output = f14_p2q2;
p1q1_output = f14_p1q1;
p0q0_output = f14_p0q0;
} else if ((vaddlv_u16(is_flat4_mask) == (1 << 18) - 4)) {
} else if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_mask), 0) == -1 &&
vget_lane_u64(vreinterpret_u64_u16(is_flat4_outer_mask), 0) == 0) {
// filter8() applies to all values.
filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
p5q5_output = p5q5;
......
......@@ -3359,3 +3359,43 @@ void svt_av1_filter_intra_edge_neon(uint8_t *p, int sz, int strength) {
}
}
}
void svt_av1_upsample_intra_edge_neon(uint8_t *p, int sz) {
if (!sz) {
return;
}
assert(sz <= MAX_UPSAMPLE_SZ);
uint8_t edge[MAX_UPSAMPLE_SZ + 3];
const uint8_t *src = edge;
// Copy p[-1..(sz-1)] and pad out both ends.
edge[0] = p[-1];
edge[1] = p[-1];
memcpy(edge + 2, p, sz);
edge[sz + 2] = p[sz - 1];
p[-2] = p[-1];
uint8_t *dst = p - 1;
do {
uint8x8_t s0 = vld1_u8(src);
uint8x8_t s1 = vld1_u8(src + 1);
uint8x8_t s2 = vld1_u8(src + 2);
uint8x8_t s3 = vld1_u8(src + 3);
int16x8_t t0 = vreinterpretq_s16_u16(vaddl_u8(s0, s3));
int16x8_t t1 = vreinterpretq_s16_u16(vaddl_u8(s1, s2));
t1 = vmulq_n_s16(t1, 9);
t1 = vsubq_s16(t1, t0);
uint8x8x2_t res = {{vqrshrun_n_s16(t1, 4), s2}};
vst2_u8(dst, res);
src += 8;
dst += 16;
sz -= 8;
} while (sz > 0);
}
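The kernel above is libaom's 2x intra-edge upsampler: each new half-sample is a rounded 4-tap (-1, 9, 9, -1)/16 interpolation of its neighbours (vqrshrun_n_s16(t1, 4) performs the rounded shift with an unsigned-saturating narrow), interleaved with the original samples by vst2_u8. A scalar model, hedged (clip_to_u8 is an illustrative helper; edge[] is the padded copy built above):

/* p[-2] duplicates p[-1]; then pairs of (filtered, original) samples. */
for (int i = 0; i < sz; i++) {
    const int t = -edge[i] + 9 * edge[i + 1] + 9 * edge[i + 2] - edge[i + 3];
    p[2 * i - 1] = clip_to_u8((t + 8) >> 4); /* new half-pel sample */
    p[2 * i]     = edge[i + 2];              /* original sample, unchanged */
}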
......@@ -653,6 +653,12 @@ static INLINE void store_s16_8x2(int16_t *s, ptrdiff_t dst_stride, const int16x8
memcpy(dst, &a, 4); \
} while (0)
#define store_s16_2x1_lane(dst, src, lane) \
do { \
int32_t a = vget_lane_s32(vreinterpret_s32_s16(src), lane); \
memcpy(dst, &a, 4); \
} while (0)
#define store_u16_4x1_lane(dst, src, lane) \
do { \
uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \
......@@ -1327,6 +1333,13 @@ static inline void store_u16x2_strided_x2(uint16_t *dst, uint32_t dst_stride, ui
store_u16_2x1_lane(dst, src, 1);
}
// Store two blocks of 32-bits from a single vector.
static inline void store_s16x2_strided_x2(int16_t *dst, uint32_t dst_stride, int16x4_t src) {
store_s16_2x1_lane(dst, src, 0);
dst += dst_stride;
store_s16_2x1_lane(dst, src, 1);
}
// Store two blocks of 64-bits from a single vector.
static INLINE void store_u16x4_strided_x2(uint16_t *dst, uint32_t dst_stride, uint16x8_t src) {
store_u16_4x1_lane(dst, src, 0);
......@@ -1335,6 +1348,7 @@ static INLINE void store_u16x4_strided_x2(uint16_t *dst, uint32_t dst_stride, ui
}
#undef store_u16_2x1_lane
#undef store_s16_2x1_lane
#undef store_u16_4x1_lane
#undef store_s16_4x1_lane
......
This diff is collapsed.
......@@ -562,7 +562,7 @@ void svt_aom_setup_rtcd_internal(EbCpuFlags flags) {
SET_NEON(svt_av1_lowbd_pixel_proj_error, svt_av1_lowbd_pixel_proj_error_c, svt_av1_lowbd_pixel_proj_error_neon);
SET_ONLY_C(svt_av1_highbd_pixel_proj_error, svt_av1_highbd_pixel_proj_error_c);
SET_ONLY_C(svt_subtract_average, svt_subtract_average_c);
SET_ONLY_C(svt_get_proj_subspace, svt_get_proj_subspace_c);
SET_NEON(svt_get_proj_subspace, svt_get_proj_subspace_c, svt_get_proj_subspace_neon);
SET_NEON(svt_aom_quantize_b, svt_aom_quantize_b_c_ii, svt_aom_quantize_b_neon);
SET_NEON(svt_aom_highbd_quantize_b, svt_aom_highbd_quantize_b_c, svt_aom_highbd_quantize_b_neon);
SET_NEON(svt_av1_quantize_b_qm, svt_aom_quantize_b_c_ii, svt_aom_quantize_b_neon);
......
......@@ -1172,7 +1172,7 @@ extern "C" {
unsigned int svt_aom_obmc_sub_pixel_variance128x128_neon(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse);
uint32_t svt_nxm_sad_kernel_helper_neon(const uint8_t *src, uint32_t src_stride, const uint8_t *ref, uint32_t ref_stride, uint32_t height, uint32_t width);
void svt_get_proj_subspace_neon(const uint8_t *src8, int32_t width, int32_t height, int32_t src_stride, const uint8_t *dat8, int32_t dat_stride, int32_t use_highbitdepth, int32_t *flt0, int32_t flt0_stride, int32_t *flt1, int32_t flt1_stride, int32_t *xq, const SgrParamsType *params);
uint64_t svt_handle_transform16x64_neon(int32_t *output);
uint64_t svt_handle_transform32x64_neon(int32_t *output);
uint64_t svt_handle_transform64x16_neon(int32_t *output);
......
......@@ -1016,11 +1016,11 @@ void svt_aom_setup_common_rtcd_internal(EbCpuFlags flags) {
SET_NEON(svt_aom_highbd_blend_a64_hmask_16bit, svt_aom_highbd_blend_a64_hmask_16bit_c, svt_aom_highbd_blend_a64_hmask_16bit_neon);
SET_NEON(svt_aom_highbd_blend_a64_d16_mask, svt_aom_highbd_blend_a64_d16_mask_c, svt_aom_highbd_blend_a64_d16_mask_neon);
SET_NEON(svt_cfl_predict_lbd, svt_cfl_predict_lbd_c, svt_aom_cfl_predict_lbd_neon);
SET_ONLY_C(svt_cfl_predict_hbd, svt_cfl_predict_hbd_c);
SET_NEON(svt_cfl_predict_hbd, svt_cfl_predict_hbd_c, svt_cfl_predict_hbd_neon);
SET_NEON(svt_av1_filter_intra_predictor, svt_av1_filter_intra_predictor_c, svt_av1_filter_intra_predictor_neon);
SET_NEON(svt_av1_filter_intra_edge_high, svt_av1_filter_intra_edge_high_c, svt_av1_filter_intra_edge_high_neon);
SET_NEON(svt_av1_filter_intra_edge, svt_av1_filter_intra_edge_c, svt_av1_filter_intra_edge_neon);
SET_ONLY_C(svt_av1_upsample_intra_edge, svt_av1_upsample_intra_edge_c);
SET_NEON(svt_av1_upsample_intra_edge, svt_av1_upsample_intra_edge_c, svt_av1_upsample_intra_edge_neon);
SET_NEON(svt_av1_build_compound_diffwtd_mask_d16, svt_av1_build_compound_diffwtd_mask_d16_c, svt_av1_build_compound_diffwtd_mask_d16_neon);
SET_NEON(svt_av1_highbd_wiener_convolve_add_src, svt_av1_highbd_wiener_convolve_add_src_c, svt_av1_highbd_wiener_convolve_add_src_neon);
SET_NEON(svt_apply_selfguided_restoration, svt_apply_selfguided_restoration_c, svt_aom_apply_selfguided_restoration_neon);
......@@ -1051,8 +1051,8 @@ void svt_aom_setup_common_rtcd_internal(EbCpuFlags flags) {
SET_ONLY_C(svt_unpack_avg, svt_unpack_avg_c);
SET_ONLY_C(svt_unpack_avg_safe_sub, svt_unpack_avg_safe_sub_c);
SET_ONLY_C(svt_un_pack8_bit_data, svt_un_pack8_bit_data_c);
SET_ONLY_C(svt_cfl_luma_subsampling_420_lbd, svt_cfl_luma_subsampling_420_lbd_c);
SET_ONLY_C(svt_cfl_luma_subsampling_420_hbd, svt_cfl_luma_subsampling_420_hbd_c);
SET_NEON(svt_cfl_luma_subsampling_420_lbd, svt_cfl_luma_subsampling_420_lbd_c, svt_cfl_luma_subsampling_420_lbd_neon);
SET_NEON(svt_cfl_luma_subsampling_420_hbd, svt_cfl_luma_subsampling_420_hbd_c, svt_cfl_luma_subsampling_420_hbd_neon);
SET_ONLY_C(svt_convert_8bit_to_16bit, svt_convert_8bit_to_16bit_c);
SET_ONLY_C(svt_convert_16bit_to_8bit, svt_convert_16bit_to_8bit_c);
SET_NEON(svt_pack2d_16_bit_src_mul4, svt_enc_msb_pack2_d, svt_enc_msb_pack2d_neon);
......
......@@ -1171,6 +1171,7 @@ extern "C" {
void svt_av1_filter_intra_edge_neon(uint8_t *p, int32_t sz, int32_t strength);
void svt_av1_filter_intra_edge_high_neon(uint16_t *p, int32_t sz, int32_t strength);
void svt_av1_upsample_intra_edge_neon(uint8_t *p, int32_t sz);
void svt_av1_dr_prediction_z1_neon(uint8_t *dst, ptrdiff_t stride, int32_t bw, int32_t bh, const uint8_t *above, const uint8_t *left,int32_t upsample_above, int32_t dx, int32_t dy);
void svt_av1_dr_prediction_z2_neon(uint8_t *dst, ptrdiff_t stride, int32_t bw, int32_t bh, const uint8_t *above, const uint8_t *left, int32_t upsample_above, int32_t upsample_left, int32_t dx,int32_t dy);
......@@ -1200,6 +1201,10 @@ extern "C" {
void svt_aom_cdef_find_dir_dual_neon(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int32_t coeff_shift, uint8_t *out1, uint8_t *out2);
void svt_aom_cfl_predict_lbd_neon(const int16_t *pred_buf_q3, uint8_t *pred, int32_t pred_stride, uint8_t *dst, int32_t dst_stride, int32_t alpha_q3, int32_t bit_depth, int32_t width, int32_t height);
void svt_cfl_predict_hbd_neon(const int16_t *pred_buf_q3, uint16_t *pred, int32_t pred_stride, uint16_t *dst, int32_t dst_stride, int32_t alpha_q3, int32_t bit_depth, int32_t width, int32_t height);
void svt_cfl_luma_subsampling_420_lbd_neon(const uint8_t *input, int32_t input_stride, int16_t *output_q3, int32_t width, int32_t height);
void svt_cfl_luma_subsampling_420_hbd_neon(const uint16_t *input, int32_t input_stride, int16_t *output_q3, int32_t width, int32_t height);
void svt_aom_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, int32_t *coeff);
void svt_aom_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, int32_t *coeff);
......
......@@ -107,13 +107,6 @@ class LoopFilterTest : public ::testing::TestWithParam<TestParamType> {
buf[i] = val;
}
void init_input_random(Sample *s, Sample *ref_s, ACMRandom *rnd) {
for (int i = 0; i < kNumCoeffs; ++i) {
s[i] = rnd->Rand16() & mask_;
ref_s[i] = s[i];
}
}
virtual void run_lpf(LOOP_PARAM, int bd) {
(void)p;
(void)blimit;
......@@ -122,6 +115,68 @@ class LoopFilterTest : public ::testing::TestWithParam<TestParamType> {
(void)bd;
}
void init_input(Sample *s, Sample *ref_s, ACMRandom *rnd,
const uint8_t limit, const int mask, const int32_t p,
const int i) {
uint16_t tmp_s[kNumCoeffs];
for (int j = 0; j < kNumCoeffs;) {
const uint8_t val = rnd->Rand8();
if (val & 0x80) { // 50% chance to choose a new value.
tmp_s[j] = rnd->Rand16();
j++;
} else { // 50% chance to repeat previous value in row X times.
int k = 0;
while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {
if (j < 1) {
tmp_s[j] = rnd->Rand16();
} else if (val & 0x20) { // Increment by a value within the
// limit.
tmp_s[j] =
static_cast<uint16_t>(tmp_s[j - 1] + (limit - 1));
} else { // Decrement by a value within the limit.
tmp_s[j] =
static_cast<uint16_t>(tmp_s[j - 1] - (limit - 1));
}
j++;
}
}
}
for (int j = 0; j < kNumCoeffs;) {
const uint8_t val = rnd->Rand8();
if (val & 0x80) {
j++;
} else { // 50% chance to repeat previous value in column X times.
int k = 0;
while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {
if (j < 1) {
tmp_s[j] = rnd->Rand16();
} else if (val & 0x20) { // Increment by a value within the
// limit.
tmp_s[(j % 32) * 32 + j / 32] = static_cast<uint16_t>(
tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] +
(limit - 1));
} else { // Decrement by a value within the limit.
tmp_s[(j % 32) * 32 + j / 32] = static_cast<uint16_t>(
tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] -
(limit - 1));
}
j++;
}
}
}
for (int j = 0; j < kNumCoeffs; j++) {
if (i % 2) {
s[j] = tmp_s[j] & mask;
} else {
s[j] = tmp_s[p * (j % p) + j / p] & mask;
}
ref_s[j] = s[j];
}
}
void run_test() {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 10000;
......@@ -148,7 +203,7 @@ class LoopFilterTest : public ::testing::TestWithParam<TestParamType> {
init_buffer_with_value(thresh, 16, tmp);
// Initial sample data
init_input_random(tst_s, ref_s, &rnd);
init_input(tst_s, ref_s, &rnd, *limit, mask_, p, i);
// run the filters
run_lpf(p, blimit, limit, thresh, bit_depth_);
......
......@@ -403,235 +403,178 @@ INSTANTIATE_TEST_SUITE_P(
AVX2, PixelProjErrorHbdTest,
::testing::Values(make_tuple(svt_av1_highbd_pixel_proj_error_avx2,
svt_av1_highbd_pixel_proj_error_c)));
#endif // ARCH_X86_64
// test svt_get_proj_subspace
TEST(SelfGuidedToolsTest, GetProjSubspaceMatchTest) {
const int32_t pu_width = RESTORATION_PROC_UNIT_SIZE;
const int32_t pu_height = RESTORATION_PROC_UNIT_SIZE;
const int32_t width = 270, height = 256, stride = 300, out_stride = 300;
const int NUM_ITERS = 2000;
int i, j, k;
uint8_t *input_ = (uint8_t *)svt_aom_memalign(
32, stride * (height + 32) * sizeof(uint8_t));
uint8_t *output_ = (uint8_t *)svt_aom_memalign(
32, out_stride * (height + 32) * sizeof(uint8_t));
int32_t *tmpbuf = (int32_t *)svt_aom_memalign(32, RESTORATION_TMPBUF_SIZE);
uint8_t *input = input_ + stride * 16 + 16;
uint8_t *output = output_ + out_stride * 16 + 16;
int32_t *flt0 = tmpbuf;
int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
int32_t flt_stride = ((width + 7) & ~7) + 8;
// check all the sg params
SVTRandom rnd(8, false);
for (int iter = 0; iter < NUM_ITERS; ++iter) {
if (iter == 0) {
// prepare src data and recon data
for (i = -16; i < height + 16; ++i) {
for (j = -16; j < width + 16; ++j) {
input[i * stride + j] = rnd.random();
if (iter == 0)
output[i * stride + j] = input[i * stride + j];
else
output[i * stride + j] = rnd.random();
}
}
} else {
// prepare src data and recon data
for (i = -16; i < height + 16; ++i) {
for (j = -16; j < width + 16; ++j) {
input[i * stride + j] = 0;
if (iter == 0)
output[i * stride + j] = input[i * stride + j];
else
output[i * stride + j] = 0;
}
}
}
typedef void (*GetProjSubspaceFunc)(const uint8_t *src8, int32_t width,
int32_t height, int32_t src_stride,
const uint8_t *dat8, int32_t dat_stride,
int32_t use_highbitdepth, int32_t *flt0,
int32_t flt0_stride, int32_t *flt1,
int32_t flt1_stride, int32_t *xq,
const SgrParamsType *params);
for (int32_t ep = 0; ep < SGRPROJ_PARAMS; ++ep) {
// apply selfguided filter to get A and b
for (k = 0; k < height; k += pu_height) {
for (j = 0; j < width; j += pu_width) {
int32_t w = AOMMIN(pu_width, width - j);
int32_t h = AOMMIN(pu_height, height - k);
uint8_t *output_p = output + k * out_stride + j;
int32_t *flt0_p = flt0 + k * flt_stride + j;
int32_t *flt1_p = flt1 + k * flt_stride + j;
assert(w * h <= RESTORATION_UNITPELS_MAX);
svt_av1_selfguided_restoration_avx2(output_p,
w,
h,
out_stride,
flt0_p,
flt1_p,
flt_stride,
ep,
8,
0);
}
}
aom_clear_system_state();
int32_t xqd_c[2] = {0};
int32_t xqd_asm[2] = {0};
const SgrParamsType *const params = &svt_aom_eb_sgr_params[ep];
svt_get_proj_subspace_c(input,
width,
height,
stride,
output,
out_stride,
0,
flt0,
flt_stride,
flt1,
flt_stride,
xqd_c,
params);
svt_get_proj_subspace_avx2(input,
width,
height,
stride,
output,
out_stride,
0,
flt0,
flt_stride,
flt1,
flt_stride,
xqd_asm,
params);
ASSERT_EQ(xqd_c[0], xqd_asm[0])
<< "xqd_c[0] does not match xqd_asm[0] with iter "
<< iter << " ep " << ep;
ASSERT_EQ(xqd_c[1], xqd_asm[1])
<< "xqd_c[1] does not match xqd_asm[1] with iter "
<< iter << " ep " << ep;
}
template <typename Sample>
class GetProjSubspaceTest
: public ::testing::TestWithParam<GetProjSubspaceFunc> {
public:
void SetUp() override {
test_impl_ = GetParam();
input_ = (Sample *)svt_aom_memalign(
32, stride * (height + 32) * sizeof(Sample));
output_ = (Sample *)svt_aom_memalign(
32, out_stride * (height + 32) * sizeof(Sample));
tmpbuf_ = (int32_t *)svt_aom_memalign(32, RESTORATION_TMPBUF_SIZE);
}
svt_aom_free(input_);
svt_aom_free(output_);
svt_aom_free(tmpbuf);
}
void TearDown() override {
svt_aom_free(input_);
svt_aom_free(output_);
svt_aom_free(tmpbuf_);
}
// test svt_get_proj_subspace
TEST(SelfGuidedToolsTest, GetProjSubspaceMatchTestHbd) {
const int32_t pu_width = RESTORATION_PROC_UNIT_SIZE;
const int32_t pu_height = RESTORATION_PROC_UNIT_SIZE;
const int32_t width = 270, height = 256, stride = 300, out_stride = 300;
const int NUM_ITERS = 2000;
int i, j, k;
uint16_t *input_ = (uint16_t *)svt_aom_memalign(
32, stride * (height + 32) * sizeof(uint16_t));
uint16_t *output_ = (uint16_t *)svt_aom_memalign(
32, out_stride * (height + 32) * sizeof(uint16_t));
int32_t *tmpbuf = (int32_t *)svt_aom_memalign(32, RESTORATION_TMPBUF_SIZE);
uint16_t *input = input_ + stride * 16 + 16;
uint16_t *output = output_ + out_stride * 16 + 16;
int32_t *flt0 = tmpbuf;
int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
int32_t flt_stride = ((width + 7) & ~7) + 8;
// check all the sg params
SVTRandom rnd(8, false);
for (int iter = 0; iter < NUM_ITERS; ++iter) {
if (iter == 0) {
// prepare src data and recon data
for (i = -16; i < height + 16; ++i) {
for (j = -16; j < width + 16; ++j) {
input[i * stride + j] = rnd.random();
if (iter == 0)
output[i * stride + j] = input[i * stride + j];
else
output[i * stride + j] = rnd.random();
}
}
} else {
// prepare src data and recon data
for (i = -16; i < height + 16; ++i) {
for (j = -16; j < width + 16; ++j) {
input[i * stride + j] = 0;
if (iter == 0)
output[i * stride + j] = input[i * stride + j];
else
output[i * stride + j] = 0;
void run_test() {
const int32_t pu_width = RESTORATION_PROC_UNIT_SIZE;
const int32_t pu_height = RESTORATION_PROC_UNIT_SIZE;
const int NUM_ITERS = 2000;
int i, j, k;
Sample *input = input_ + stride * 16 + 16;
Sample *output = output_ + out_stride * 16 + 16;
int32_t width_sample[] = {128, 192, 256, 270};
for (int w_index = 0;
w_index < (int)(sizeof(width_sample) / sizeof(width_sample[0]));
w_index++) {
int32_t width = width_sample[w_index];
int32_t *flt0 = tmpbuf_;
int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
int32_t flt_stride = ((width + 7) & ~7) + 8;
// check all the sg params
SVTRandom rnd(8, false);
for (int iter = 0; iter < NUM_ITERS; ++iter) {
// prepare src data and recon data
for (i = -16; i < height + 16; ++i) {
for (j = -16; j < width + 16; ++j) {
if (iter == 0)
output[i * stride + j] = input[i * stride + j] =
rnd.random();
else if (iter == 1)
output[i * stride + j] = input[i * stride + j] = 0;
else {
input[i * stride + j] = rnd.random();
output[i * stride + j] = rnd.random();
}
}
}
}
}
for (int32_t ep = 0; ep < SGRPROJ_PARAMS; ++ep) {
// apply selfguided filter to get A and b
for (k = 0; k < height; k += pu_height) {
for (j = 0; j < width; j += pu_width) {
int32_t w = AOMMIN(pu_width, width - j);
int32_t h = AOMMIN(pu_height, height - k);
uint16_t *output_p = output + k * out_stride + j;
int32_t *flt0_p = flt0 + k * flt_stride + j;
int32_t *flt1_p = flt1 + k * flt_stride + j;
assert(w * h <= RESTORATION_UNITPELS_MAX);
svt_av1_selfguided_restoration_avx2((uint8_t *)output_p,
w,
h,
out_stride,
flt0_p,
flt1_p,
flt_stride,
ep,
8,
0);
for (int32_t ep = 0; ep < SGRPROJ_PARAMS; ++ep) {
// apply selfguided filter to get A and b
for (k = 0; k < height; k += pu_height) {
for (j = 0; j < width; j += pu_width) {
int32_t w = AOMMIN(pu_width, width - j);
int32_t h = AOMMIN(pu_height, height - k);
Sample *output_p = output + k * out_stride + j;
int32_t *flt0_p = flt0 + k * flt_stride + j;
int32_t *flt1_p = flt1 + k * flt_stride + j;
assert(w * h <= RESTORATION_UNITPELS_MAX);
svt_av1_selfguided_restoration_c(
(uint8_t *)output_p,
w,
h,
out_stride,
flt0_p,
flt1_p,
flt_stride,
ep,
8,
0);
}
}
aom_clear_system_state();
int32_t xqd_c[2] = {0};
int32_t xqd_asm[2] = {0};
const SgrParamsType *const params =
&svt_aom_eb_sgr_params[ep];
uint8_t *input_p = sizeof(*input) == sizeof(uint16_t)
? (CONVERT_TO_BYTEPTR(input))
: (uint8_t *)input;
uint8_t *output_p = sizeof(*output) == sizeof(uint16_t)
? (CONVERT_TO_BYTEPTR(output))
: (uint8_t *)output;
int32_t use_highbitdepth =
sizeof(*input) == sizeof(uint16_t) ? 1 : 0;
svt_get_proj_subspace_c(input_p,
width,
height,
stride,
output_p,
out_stride,
use_highbitdepth,
flt0,
flt_stride,
flt1,
flt_stride,
xqd_c,
params);
test_impl_(input_p,
width,
height,
stride,
output_p,
out_stride,
use_highbitdepth,
flt0,
flt_stride,
flt1,
flt_stride,
xqd_asm,
params);
ASSERT_EQ(xqd_c[0], xqd_asm[0])
<< "xqd_c[0] does not match xqd_asm[0] with "
"iter "
<< iter << " ep " << ep;
ASSERT_EQ(xqd_c[1], xqd_asm[1])
<< "xqd_c[1] does not match xqd_asm[1] with "
"iter "
<< iter << " ep " << ep;
}
}
aom_clear_system_state();
int32_t xqd_c[2] = {0};
int32_t xqd_asm[2] = {0};
const SgrParamsType *const params = &svt_aom_eb_sgr_params[ep];
svt_get_proj_subspace_c(CONVERT_TO_BYTEPTR(input),
width,
height,
stride,
CONVERT_TO_BYTEPTR(output),
out_stride,
1,
flt0,
flt_stride,
flt1,
flt_stride,
xqd_c,
params);
svt_get_proj_subspace_avx2(CONVERT_TO_BYTEPTR(input),
width,
height,
stride,
CONVERT_TO_BYTEPTR(output),
out_stride,
1,
flt0,
flt_stride,
flt1,
flt_stride,
xqd_asm,
params);
ASSERT_EQ(xqd_c[0], xqd_asm[0])
<< "xqd_c[0] does not match xqd_asm[0] with iter "
<< iter << " ep " << ep;
ASSERT_EQ(xqd_c[1], xqd_asm[1])
<< "xqd_c[1] does not match xqd_asm[1] with iter "
<< iter << " ep " << ep;
}
}
svt_aom_free(input_);
svt_aom_free(output_);
svt_aom_free(tmpbuf);
private:
static const int32_t height = 256, stride = 300, out_stride = 300;
GetProjSubspaceFunc test_impl_;
Sample *input_;
Sample *output_;
int32_t *tmpbuf_;
};
using GetProjSubspaceTestLbd = GetProjSubspaceTest<uint8_t>;
using GetProjSubspaceTestHbd = GetProjSubspaceTest<uint16_t>;
TEST_P(GetProjSubspaceTestLbd, MatchTest) {
run_test();
}
TEST_P(GetProjSubspaceTestHbd, MatchTest) {
run_test();
}
#if ARCH_X86_64
INSTANTIATE_TEST_SUITE_P(AVX2, GetProjSubspaceTestLbd,
::testing::Values(svt_get_proj_subspace_avx2));
INSTANTIATE_TEST_SUITE_P(AVX2, GetProjSubspaceTestHbd,
::testing::Values(svt_get_proj_subspace_avx2));
#endif // ARCH_X86_64
#if ARCH_AARCH64
INSTANTIATE_TEST_SUITE_P(NEON, GetProjSubspaceTestLbd,
::testing::Values(svt_get_proj_subspace_neon));
INSTANTIATE_TEST_SUITE_P(NEON, GetProjSubspaceTestHbd,
::testing::Values(svt_get_proj_subspace_neon));
#endif // ARCH_AARCH64
} // namespace
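For reference, svt_get_proj_subspace fits the two self-guided projection coefficients by least squares: with s = src - dat, f0 = flt0 - dat and f1 = flt1 - dat (suitable precision shifts applied), it accumulates H = [[sum f0*f0, sum f0*f1], [sum f0*f1, sum f1*f1]] and C = [sum f0*s, sum f1*s], then solves H * xq = C. A hedged outline of the final 2x2 solve (the real code works in its own fixed-point precision; names here are illustrative):

/* Cramer's rule for the 2x2 normal equations H * xq = C. */
static int solve_2x2_sketch(const int64_t H[2][2], const int64_t C[2], int64_t xq[2]) {
    const int64_t det = H[0][0] * H[1][1] - H[0][1] * H[1][0];
    if (det == 0)
        return 0; /* degenerate system: caller falls back to defaults */
    xq[0] = (H[1][1] * C[0] - H[0][1] * C[1]) / det;
    xq[1] = (H[0][0] * C[1] - H[1][0] * C[0]) / det;
    return 1;
}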
......@@ -71,8 +71,11 @@ class CflPredTest : public ::testing::TestWithParam<FuncType> {
SVTRandom pred_rnd(bd_ + 3 + 1, true);
SVTRandom dst_rnd(8, false);
for (int tx = TX_4X4; tx < TX_SIZES_ALL; ++tx) {
const int c_w = tx_size_wide[tx] >> 1;
const int c_h = tx_size_high[tx] >> 1;
const int c_w = tx_size_wide[tx];
const int c_h = tx_size_high[tx];
if (c_w > 32 || c_h > 32) {
continue;
}
const int c_stride = CFL_BUF_LINE;
memset(pred_buf_q3, 0, sizeof(pred_buf_q3));
memset(dst_buf_ref_data_, 0, sizeof(dst_buf_ref_data_));
......@@ -177,7 +180,6 @@ class HbdCflPredTest : public CflPredTest<uint16_t, CFL_PRED_HBD> {
common_init();
}
};
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(HbdCflPredTest);
TEST_P(HbdCflPredTest, MatchTest) {
RunAllTest();
......@@ -188,6 +190,11 @@ INSTANTIATE_TEST_SUITE_P(AVX2, HbdCflPredTest,
::testing::Values(svt_cfl_predict_hbd_avx2));
#endif // ARCH_X86_64
#ifdef ARCH_AARCH64
INSTANTIATE_TEST_SUITE_P(NEON, HbdCflPredTest,
::testing::Values(svt_cfl_predict_hbd_neon));
#endif // ARCH_AARCH64
typedef void (*AomUpsampledPredFunc)(MacroBlockD *,
const struct AV1Common *const, int, int,
const MV *const, uint8_t *, int, int, int,
......@@ -320,12 +327,11 @@ class CflLumaSubsamplingLbdTest
const int block_size = TEST_GET_PARAM(0);
CflLumaSubsamplingLbdFunc test_impl = TEST_GET_PARAM(1);
const int width = block_size_wide[block_size];
// Output width is defined by CFL_BUF_LINE (32),
// which leads to the assumption that input width cannot be larger than
// 64, otherwise computation will overwrite line "n" with line "n+1"
if (width > 64)
return;
const int height = block_size_high[block_size];
// CFL prediction only operates on blocks where
// max(width, height) <= 32.
if (width > 32 || height > 32)
return;
DECLARE_ALIGNED(16, uint8_t, input[MAX_SB_SQUARE]);
DECLARE_ALIGNED(16, int16_t, output_q3_ref_[MAX_SB_SQUARE]);
DECLARE_ALIGNED(16, int16_t, output_q3_tst_[MAX_SB_SQUARE]);
......@@ -368,6 +374,14 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(svt_cfl_luma_subsampling_420_lbd_avx2)));
#endif
#ifdef ARCH_AARCH64
INSTANTIATE_TEST_SUITE_P(
NEON, CflLumaSubsamplingLbdTest,
::testing::Combine(
::testing::Range(BLOCK_4X4, BlockSizeS_ALL),
::testing::Values(svt_cfl_luma_subsampling_420_lbd_neon)));
#endif
typedef void (*CflLumaSubsamplingHbdFunc)(const uint16_t *, int32_t, int16_t *,
int32_t, int32_t);
typedef ::testing::tuple<BlockSize, CflLumaSubsamplingHbdFunc>
......@@ -388,12 +402,11 @@ class CflLumaSubsamplingHbdTest
const int block_size = TEST_GET_PARAM(0);
CflLumaSubsamplingHbdFunc test_impl = TEST_GET_PARAM(1);
const int width = block_size_wide[block_size];
// Output width is defined by CFL_BUF_LINE (32),
// which leads to the assumption that input width cannot be larger than
// 64, otherwise computation will overwrite line "n" with line "n+1"
if (width > 64)
return;
const int height = block_size_high[block_size];
// CFL prediction only operates on blocks where
// max(width, height) <= 32.
if (width > 32 || height > 32)
return;
DECLARE_ALIGNED(16, uint16_t, input[MAX_SB_SQUARE]);
DECLARE_ALIGNED(16, int16_t, output_q3_ref_[MAX_SB_SQUARE]);
DECLARE_ALIGNED(16, int16_t, output_q3_tst_[MAX_SB_SQUARE]);
......@@ -436,4 +449,12 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(svt_cfl_luma_subsampling_420_hbd_avx2)));
#endif // ARCH_X86_64
#ifdef ARCH_AARCH64
INSTANTIATE_TEST_SUITE_P(
NEON, CflLumaSubsamplingHbdTest,
::testing::Combine(
::testing::Range(BLOCK_4X4, BlockSizeS_ALL),
::testing::Values(svt_cfl_luma_subsampling_420_hbd_neon)));
#endif // ARCH_AARCH64
} // namespace
......@@ -117,7 +117,6 @@ class UpsampleTest : public ::testing::TestWithParam<UPSAMPLE_LBD> {
int numPx_;
int bd_;
};
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(UpsampleTest);
TEST_P(UpsampleTest, RunTest) {
RunTest();
......@@ -128,6 +127,11 @@ INSTANTIATE_TEST_SUITE_P(SSE4_1, UpsampleTest,
::testing::Values(svt_av1_upsample_intra_edge_sse4_1));
#endif // ARCH_X86_64
#if ARCH_AARCH64
INSTANTIATE_TEST_SUITE_P(NEON, UpsampleTest,
::testing::Values(svt_av1_upsample_intra_edge_neon));
#endif // ARCH_AARCH64
// -----------------------------------------------------------------------------
// Filter edge Tests
// Declare macros and functions requried
......