Commit 4009d63f authored by Sydney Hauke

SSE parallelization of absolute values, minimum and sign

parent 77ece07e
...@@ -44,6 +44,8 @@ inline __m128i CMOV_EPI16(__m128i dest, __m128i new_val, __m128i cmp0, __m128i c ...@@ -44,6 +44,8 @@ inline __m128i CMOV_EPI16(__m128i dest, __m128i new_val, __m128i cmp0, __m128i c
return _mm_or_si128(tmp0, tmp1); return _mm_or_si128(tmp0, tmp1);
} }
/*
 * Branch-free minimum of two integer values.
 *
 * (x) < (y) evaluates to 1 or 0; negating it yields an all-ones or all-zeros
 * mask. XOR-ing (y) with the masked difference (x) ^ (y) therefore selects
 * x when x < y and y otherwise.
 *
 * The whole expansion is parenthesized so the macro composes correctly with
 * surrounding operators: '^' has lower precedence than arithmetic and
 * comparison operators, so an unparenthesized expansion would be re-associated
 * by the caller's expression.
 *
 * Both arguments are evaluated more than once: do not pass expressions with
 * side effects.
 */
#define MIN(x, y) ((y) ^ (((x) ^ (y)) & -((x) < (y))))
void inline void inline
CDecoder_MS_fixed_layered::cn_kernel7( CDecoder_MS_fixed_layered::cn_kernel7(
size_t cn_idx, size_t cn_idx,
...@@ -54,6 +56,7 @@ CDecoder_MS_fixed_layered::cn_kernel7( ...@@ -54,6 +56,7 @@ CDecoder_MS_fixed_layered::cn_kernel7(
int16_t v_to_c_msgs[SSE_16BIT_ELEM]; int16_t v_to_c_msgs[SSE_16BIT_ELEM];
int16_t abs_msgs[SSE_16BIT_ELEM]; int16_t abs_msgs[SSE_16BIT_ELEM];
int16_t new_msgs[SSE_16BIT_ELEM];
size_t cn_offset; size_t cn_offset;
int16_t global_sign, sign; int16_t global_sign, sign;
int16_t min1_LLR; int16_t min1_LLR;
...@@ -101,27 +104,65 @@ CDecoder_MS_fixed_layered::cn_kernel7( ...@@ -101,27 +104,65 @@ CDecoder_MS_fixed_layered::cn_kernel7(
abs_msgs[vn_idx] = abs_msg; abs_msgs[vn_idx] = abs_msg;
}*/ }*/
// For now just test abs and min1 // For now just test abs and min1
__m128i vc_msgs = _mm_loadu_si128((__m128i*)v_to_c_msgs); __m128i vc_msgs = _mm_loadu_si128((__m128i*)v_to_c_msgs);
__m128i abs_vec = _mm_abs_epi16(vc_msgs); __m128i abs_vec = _mm_abs_epi16(vc_msgs);
_mm_storeu_si128((__m128i*)abs_msgs, abs_vec); _mm_storeu_si128((__m128i*)abs_msgs, abs_vec);
min1_LLR = _mm_extract_epi16(_mm_minpos_epu16(abs_vec), 0); __m128i min1_res = _mm_minpos_epu16(abs_vec);
__m128i min1_vec = _mm_set1_epi16(min1_LLR); min1_LLR = _mm_extract_epi16(min1_res, 0);
int8_t min1_pos = _mm_extract_epi8(min1_res, 2);
// Find min1 in abs_vec, replace by 0x7FFF, and store in abs2_vec /*
__m128i abs2_vec = CMOV_EPI16(abs_vec, _mm_set1_epi16(INT16_MAX), min1_vec, abs_vec); abs_msgs[min1_pos] = 0x7FFF;
int16_t min2_LLR_sse = _mm_extract_epi16(_mm_minpos_epu16(abs2_vec), 0);
if(cn_deg & 0x1) { /* Compute min2 sequentially */
global_sign = ~global_sign; /*int16_t min0, min1, min2, min3;
}
min0 = MIN(abs_msgs[0], abs_msgs[1]);
min1 = MIN(abs_msgs[2], abs_msgs[3]);
min2 = MIN(abs_msgs[4], abs_msgs[5]);
min3 = abs_msgs[6];
min0 = MIN(min0, min1);
min2 = MIN(min2, min3);
min2_LLR = MIN(min0, min2);*/
__m128i pos_mask = _mm_set_epi16(7,6,5,4,3,2,1,0);
__m128i pos_brdcst = _mm_set1_epi16(min1_pos);
pos_mask = _mm_cmpeq_epi16(pos_mask, pos_brdcst);
abs_vec = _mm_or_si128(abs_vec, pos_mask);
min2_LLR = _mm_extract_epi16(_mm_minpos_epu16(abs_vec), 0);
//abs_msgs[min1_pos] = min1_LLR;
/* Invert sign because CN degree is odd */
global_sign = ~global_sign;
/* Broadcast the first minimum and second minimum*/
__m128i min1_brdcst = _mm_set1_epi16(min1_LLR);
__m128i min2_brdcst = _mm_set1_epi16(min2_LLR);
/* Merge the min1 and min2 vectors so that min2 is placed at the position
* where min1 was found, and min1 everywhere else */
min2_brdcst = _mm_and_si128(pos_mask, min2_brdcst);
min1_brdcst = _mm_andnot_si128(pos_mask, min1_brdcst);
__m128i final_min = _mm_or_si128(min1_brdcst, min2_brdcst);
__m128i sign_vec = _mm_xor_si128(vc_msgs, _mm_set1_epi16(global_sign));
sign_vec = _mm_srai_epi16(sign_vec, 15);
sign_vec = _mm_or_si128(sign_vec, _mm_set1_epi16(1));
__m128i new_msgs_vec = _mm_mullo_epi16(sign_vec, final_min);
_mm_storeu_si128((__m128i*)new_msgs, new_msgs_vec);
for(size_t vn_idx = 0; vn_idx < cn_deg; vn_idx++) { for(size_t vn_idx = 0; vn_idx < cn_deg; vn_idx++) {
min = abs_msgs[vn_idx] != min1_LLR ? min1_LLR:min2_LLR; //min = abs_msgs[vn_idx] != min1_LLR ? min1_LLR:min2_LLR;
sign = (global_sign ^ v_to_c_msgs[vn_idx]); //min = mins[vn_idx];
sign = 1 | (sign >> (INT16_WIDTH-1)); // either -1 or 1 //sign = (global_sign ^ v_to_c_msgs[vn_idx]);
new_msg = sign * min; //sign = 1 | (sign >> (INT16_WIDTH-1)); // either -1 or 1
//new_msg = sign * min;
new_msg = new_msgs[vn_idx];
cv_msgs[cn_offset+vn_idx] = new_msg; cv_msgs[cn_offset+vn_idx] = new_msg;
*p_vn_addr[cn_offset+vn_idx] = new_msg + v_to_c_msgs[vn_idx]; *p_vn_addr[cn_offset+vn_idx] = new_msg + v_to_c_msgs[vn_idx];
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment