[Performances] Improve dtype conversion
During CPAReverse analysis, the dtype conversion from the samples format (often int8), to float32 or float64, necessary to leverage fast matrix dot product, takes about 50% of the time.
This numpy operation is single core and can be improved using numba.
The following code snippet seems to be efficient:
@nb.njit(parallel=True)
def _fast_astype_core_F(data, out):
    """Parallel column-wise copy/cast of ``data`` into the F-ordered ``out``.

    Columns are contiguous in Fortran order, so each prange iteration
    copies one contiguous column; the cast happens on assignment.
    """
    n_cols = data.shape[1]
    for col in nb.prange(n_cols):
        out[:, col] = data[:, col]
@nb.njit(parallel=True)
def _fast_astype_core_C(data, out):
    """Parallel row-wise copy/cast of ``data`` into the C-ordered ``out``.

    Rows are contiguous in C order, so each prange iteration copies one
    contiguous row; the cast happens on assignment.
    """
    n_rows = data.shape[0]
    for row in nb.prange(n_rows):
        out[row] = data[row]
def _data_order(data):
if data.flags.c_contiguous and data.flags.f_contiguous:
return 'FC'
if data.flags.c_contiguous:
return 'C'
if data.flags.f_contiguous:
return 'F'
def fast_astype(data, dtype='float32', order='C'):
    """Cast *data* to *dtype* with the requested memory *order*.

    For 2-D arrays the copy/cast is done by the parallel numba kernels
    (one contiguous row or column per thread); other ranks fall back to
    ``ndarray.astype``. When *data* already has the requested dtype and a
    matching layout, it is returned unchanged (no copy).

    Parameters
    ----------
    data : numpy.ndarray
        Input array (any dtype; typically int8 samples).
    dtype : dtype-like, default 'float32'
        Target dtype.
    order : {'C', 'F'}, case-insensitive, default 'C'
        Target memory layout.

    Raises
    ------
    ValueError
        If *order* is not 'C' or 'F' (checked up front, before any
        allocation — the original only rejected it after allocating).
    """
    dtype = np.dtype(dtype)
    # Normalize once so the early-return check and the kernel dispatch
    # agree on case (the original accepted 'c' in the dispatch but not in
    # the no-copy fast path).
    order = order.upper()
    if order not in ('C', 'F'):
        raise ValueError(f'Invalid order {order}.')
    # Guard against a None layout from _data_order for non-contiguous
    # arrays, which would make the `in` test raise TypeError.
    if data.dtype == dtype and order in (_data_order(data) or ''):
        return data
    if data.ndim != 2:
        # The numba kernels only handle 2-D arrays.
        return data.astype(dtype=dtype, order=order)
    out = np.empty_like(data, order=order, dtype=dtype)
    if order == 'C':
        _fast_astype_core_C(data, out)
    else:
        _fast_astype_core_F(data, out)
    return out
aa = fast_astype(a)
%timeit fast_astype(a)
See a benchmark using 16 cores: