[Performances] Improve dtype conversion
During CPAReverse analysis, the dtype conversion from the samples format (often int8), to float32 or float64, necessary to leverage fast matrix dot product, takes about 50% of the time.
This numpy operation is single core and can be improved using numba.
The following code snippet seems to be efficient:
@nb.njit(parallel=True)
def _fast_astype_core_F(data, out):
    """Parallel column-wise copy/cast of ``data`` into the F-ordered ``out``.

    Columns are contiguous in Fortran order, so each prange iteration
    copies one contiguous column; the cast happens on assignment.
    """
    n_cols = data.shape[1]
    for col in nb.prange(n_cols):
        out[:, col] = data[:, col]
@nb.njit(parallel=True)
def _fast_astype_core_C(data, out):
    """Parallel row-wise copy/cast of ``data`` into the C-ordered ``out``.

    Rows are contiguous in C order, so each prange iteration copies one
    contiguous row; the cast happens on assignment.
    """
    n_rows = data.shape[0]
    for row in nb.prange(n_rows):
        out[row] = data[row]
def _data_order(data):
if data.flags.c_contiguous and data.flags.f_contiguous:
return 'FC'
if data.flags.c_contiguous:
return 'C'
if data.flags.f_contiguous:
return 'F'
def fast_astype(data, dtype='float32', order='C'):
    """Cast *data* to *dtype* with the requested memory *order*.

    For 2-D arrays the copy/cast is done by the parallel numba kernels
    (one contiguous row or column per thread); other ranks fall back to
    ``ndarray.astype``. When *data* already has the requested dtype and a
    matching layout, it is returned unchanged (no copy).

    Parameters
    ----------
    data : numpy.ndarray
        Input array (any dtype; typically int8 samples).
    dtype : dtype-like, default 'float32'
        Target dtype.
    order : {'C', 'F'}, case-insensitive, default 'C'
        Target memory layout.

    Raises
    ------
    ValueError
        If *order* is not 'C' or 'F' (checked up front, before any
        allocation — the original only rejected it after allocating).
    """
    dtype = np.dtype(dtype)
    # Normalize once so the early-return check and the kernel dispatch
    # agree on case (the original accepted 'c' in the dispatch but not in
    # the no-copy fast path).
    order = order.upper()
    if order not in ('C', 'F'):
        raise ValueError(f'Invalid order {order}.')
    # Guard against a None layout from _data_order for non-contiguous
    # arrays, which would make the `in` test raise TypeError.
    if data.dtype == dtype and order in (_data_order(data) or ''):
        return data
    if data.ndim != 2:
        # The numba kernels only handle 2-D arrays.
        return data.astype(dtype=dtype, order=order)
    out = np.empty_like(data, order=order, dtype=dtype)
    if order == 'C':
        _fast_astype_core_C(data, out)
    else:
        _fast_astype_core_F(data, out)
    return out
aa = fast_astype(a)
%timeit fast_astype(a)
See a benchmark using 16 cores: