Commit 4009d63f authored by Sydney Hauke

SSE parallelization of absolute values, minimum and sign

parent 77ece07e
......@@ -44,6 +44,8 @@ inline __m128i CMOV_EPI16(__m128i dest, __m128i new_val, __m128i cmp0, __m128i c
return _mm_or_si128(tmp0, tmp1);
}
/*
 * MIN(x, y): branchless minimum of two integer expressions.
 * -((x) < (y)) is all ones when x < y and zero otherwise, so the mask keeps
 * either (x) ^ (y) or 0 before the final XOR with y, yielding x or y.
 * The whole expansion is parenthesized so that it composes correctly inside
 * larger expressions (e.g. MIN(a, b) + 1). Both arguments are evaluated more
 * than once: do not pass expressions with side effects.
 */
#define MIN(x, y) ((y) ^ (((x) ^ (y)) & -((x) < (y))))
void inline
CDecoder_MS_fixed_layered::cn_kernel7(
size_t cn_idx,
......@@ -54,6 +56,7 @@ CDecoder_MS_fixed_layered::cn_kernel7(
int16_t v_to_c_msgs[SSE_16BIT_ELEM];
int16_t abs_msgs[SSE_16BIT_ELEM];
int16_t new_msgs[SSE_16BIT_ELEM];
size_t cn_offset;
int16_t global_sign, sign;
int16_t min1_LLR;
......@@ -101,27 +104,65 @@ CDecoder_MS_fixed_layered::cn_kernel7(
abs_msgs[vn_idx] = abs_msg;
}*/
// For now just test abs and min1
__m128i vc_msgs = _mm_loadu_si128((__m128i*)v_to_c_msgs);
__m128i abs_vec = _mm_abs_epi16(vc_msgs);
_mm_storeu_si128((__m128i*)abs_msgs, abs_vec);
min1_LLR = _mm_extract_epi16(_mm_minpos_epu16(abs_vec), 0);
__m128i min1_vec = _mm_set1_epi16(min1_LLR);
__m128i min1_res = _mm_minpos_epu16(abs_vec);
min1_LLR = _mm_extract_epi16(min1_res, 0);
int8_t min1_pos = _mm_extract_epi8(min1_res, 2);
// Find min1 in abs_vec, replace by 0x7FFF, and store in abs2_vec
__m128i abs2_vec = CMOV_EPI16(abs_vec, _mm_set1_epi16(INT16_MAX), min1_vec, abs_vec);
int16_t min2_LLR_sse = _mm_extract_epi16(_mm_minpos_epu16(abs2_vec), 0);
/*
abs_msgs[min1_pos] = 0x7FFF;
if(cn_deg & 0x1) {
global_sign = ~global_sign;
}
/* Compute min2 sequentially */
/*int16_t min0, min1, min2, min3;
min0 = MIN(abs_msgs[0], abs_msgs[1]);
min1 = MIN(abs_msgs[2], abs_msgs[3]);
min2 = MIN(abs_msgs[4], abs_msgs[5]);
min3 = abs_msgs[6];
min0 = MIN(min0, min1);
min2 = MIN(min2, min3);
min2_LLR = MIN(min0, min2);*/
__m128i pos_mask = _mm_set_epi16(7,6,5,4,3,2,1,0);
__m128i pos_brdcst = _mm_set1_epi16(min1_pos);
pos_mask = _mm_cmpeq_epi16(pos_mask, pos_brdcst);
abs_vec = _mm_or_si128(abs_vec, pos_mask);
min2_LLR = _mm_extract_epi16(_mm_minpos_epu16(abs_vec), 0);
//abs_msgs[min1_pos] = min1_LLR;
/* Invert sign because CN degree is odd */
global_sign = ~global_sign;
/* Broadcast the first minimum and second minimum*/
__m128i min1_brdcst = _mm_set1_epi16(min1_LLR);
__m128i min2_brdcst = _mm_set1_epi16(min2_LLR);
/* Merge the min1 and min2 vectors so that min2 is placed at the position
 * where min1 was found */
min2_brdcst = _mm_and_si128(pos_mask, min2_brdcst);
min1_brdcst = _mm_andnot_si128(pos_mask, min1_brdcst);
__m128i final_min = _mm_or_si128(min1_brdcst, min2_brdcst);
__m128i sign_vec = _mm_xor_si128(vc_msgs, _mm_set1_epi16(global_sign));
sign_vec = _mm_srai_epi16(sign_vec, 15);
sign_vec = _mm_or_si128(sign_vec, _mm_set1_epi16(1));
__m128i new_msgs_vec = _mm_mullo_epi16(sign_vec, final_min);
_mm_storeu_si128((__m128i*)new_msgs, new_msgs_vec);
for(size_t vn_idx = 0; vn_idx < cn_deg; vn_idx++) {
min = abs_msgs[vn_idx] != min1_LLR ? min1_LLR:min2_LLR;
sign = (global_sign ^ v_to_c_msgs[vn_idx]);
sign = 1 | (sign >> (INT16_WIDTH-1)); // either -1 or 1
new_msg = sign * min;
//min = abs_msgs[vn_idx] != min1_LLR ? min1_LLR:min2_LLR;
//min = mins[vn_idx];
//sign = (global_sign ^ v_to_c_msgs[vn_idx]);
//sign = 1 | (sign >> (INT16_WIDTH-1)); // either -1 or 1
//new_msg = sign * min;
new_msg = new_msgs[vn_idx];
cv_msgs[cn_offset+vn_idx] = new_msg;
*p_vn_addr[cn_offset+vn_idx] = new_msg + v_to_c_msgs[vn_idx];
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment