Commit 43563102 by Christophe Gonzales

First step toward adding support for missing-value detection in databases

parent 736a3ee7
Pipeline #21468646 passed with stages
in 85 minutes 14 seconds
......@@ -254,6 +254,7 @@ namespace gum {
translatorType[i] = db.translator(i).getValType ();
}
DBRow<DBTranslatedValue> xrow ( __nbVars );
const auto xmiss = gum::learning::DatabaseTable<>::IsMissing::False;
for (const auto& row : __database) {
for (Idx i = 0; i < __nbVars; ++i) {
Idx j = __varOrder.at(i);
......@@ -264,7 +265,7 @@ namespace gum {
xrow[i].cont_val = float ( row.at(j) );
}
}
db.insertRow(xrow);
db.insertRow(xrow, xmiss);
}
return db;
......
......@@ -350,6 +350,9 @@ namespace gum {
/// returns the allocator used by the translator
allocator_type getAllocator () const;
/// indicates whether a translated value corresponds to a missing value
/** @param val the translated value to examine
 * @return true if val corresponds to a missing value, false otherwise */
bool isMissingValue ( const DBTranslatedValue& val ) const;
/// @}
......
......@@ -343,6 +343,31 @@ namespace gum {
std::string translateBackSafe ( const DBTranslatedValue translated_val,
const std::size_t k ) const;
/** @brief indicates whether the kth translator considers a translated_val
* as a missing value
*
* @param translated_val the value that we compare to the translation of
* a missing value
* @param k the index of the translator that performed the translation
* @return true if the kth translator considers translated_val as a missing
* value, false otherwise
* @warning this method assumes that the kth translator exists, i.e., that
* k is strictly smaller than the number of translators. It won't check
* it. If unsure, use method isMissingValueSafe that performs this check.
*/
bool isMissingValue ( const DBTranslatedValue translated_val,
const std::size_t k ) const;
/** @brief similar to method isMissingValue, except that it checks that
* the kth translator exists
*
* @param translated_val the value that we compare to the translation of
* a missing value
* @param k the index of the translator that performed the translation
* @return true if the kth translator considers translated_val as a missing
* value, false otherwise
* @throw UndefinedElement is raised if the kth translator does not exist,
* i.e., if k is greater than or equal to the number of translators in the
* translator set.
*/
bool isMissingValueSafe ( const DBTranslatedValue translated_val,
const std::size_t k ) const;
/// returns the domain size of the variable stored into the kth translator
/** @warning this method assumes that there are at least k translators.
* So, it won't check that the kth translator actually exists. If unsure,
......
......@@ -405,6 +405,29 @@ namespace gum {
return __translators[k]->translateBack ( translated_val );
}
/// asks the kth translator whether translated_val encodes a missing value
/** No check is performed on k: the caller must guarantee that the kth
 * translator exists. */
template <template<typename> class ALLOC>
INLINE bool
DBTranslatorSet<ALLOC>::isMissingValue( const DBTranslatedValue translated_val,
                                        const std::size_t k ) const {
  const auto& kth_translator = *( __translators[k] );
  return kth_translator.isMissingValue ( translated_val );
}
/// checked variant of isMissingValue
/** Raises UndefinedElement when k does not index an existing translator;
 * otherwise asks the kth translator whether translated_val is a missing
 * value. */
template <template<typename> class ALLOC>
INLINE bool
DBTranslatorSet<ALLOC>::isMissingValueSafe (
    const DBTranslatedValue translated_val,
    const std::size_t k ) const {
  // valid indices are 0 .. size-1
  const auto nb_translators = __translators.size ();
  if ( k >= nb_translators )
    GUM_ERROR ( UndefinedElement, "the translator could not be found" );

  return __translators[k]->isMissingValue ( translated_val );
}
/// returns the kth translator
template <template<typename> class ALLOC>
......
......@@ -235,7 +235,26 @@ namespace gum {
DBTranslator<ALLOC>::setVariableDescription ( const std::string& str ) const {
const_cast<Variable*> ( this->variable () )->setDescription ( str );
}
/// tells whether a translated value encodes a missing value
/** The field of value that is inspected depends on the translator's
 * _val_type:
 * - DISCRETE: discr_val is compared with the largest std::size_t;
 * - CONTINUOUS: cont_val is compared with the largest finite float.
 * For any other value type, a NotImplementedYet error is reported
 * through GUM_ERROR. */
template <template<typename> class ALLOC>
INLINE bool
DBTranslator<ALLOC>::isMissingValue ( const DBTranslatedValue& value ) const {
  if ( _val_type == DBTranslatedValueType::DISCRETE )
    return value.discr_val == std::numeric_limits<std::size_t>::max ();

  if ( _val_type == DBTranslatedValueType::CONTINUOUS )
    return value.cont_val == std::numeric_limits<float>::max ();

  // no other translated value type has a missing value encoding
  GUM_ERROR ( NotImplementedYet,
              "No missing value interpretation for this "
              "translated value type" );
}
} /* namespace learning */
......
......@@ -275,6 +275,8 @@ namespace gum {
/// a vector of std::string whose allocator is XALLOC<std::string>
// NOTE(review): judging by its name, this is presumably the type of the
// containers holding the strings that denote missing values -- confirm
// against the callers.
template <template<typename> class XALLOC>
using MissingValType = std::vector<std::string,XALLOC<std::string>>;
/// flag telling whether a row contains missing values
/** False is deliberately listed first: its underlying char value is 0 and
 * that of True is 1, so that a non-zero value means "has missing values",
 * as with a regular boolean. Always use the enumerator names rather than
 * their numeric values. */
enum IsMissing : char { False, True };
/** @class Handler
......@@ -997,7 +999,9 @@ namespace gum {
* as is into the database table.
* @throw SizeError is raised if the size of the new_row is not equal to
* the number of columns retained in the IDatabaseTable */
virtual void insertRow( Row<T_DATA>&& new_row );
virtual void
insertRow( Row<T_DATA>&& new_row,
const IsMissing contains_missing_data );
/// insert a new row at the end of the database
/** Unlike methods insertRow for data whose type is different from T_DATA,
......@@ -1006,7 +1010,9 @@ namespace gum {
* as is into the database table.
* @throw SizeError is raised if the size of the new_row is not equal to
* the number of columns retained in the IDatabaseTable */
virtual void insertRow( const Row<T_DATA>& new_row );
virtual void
insertRow( const Row<T_DATA>& new_row,
const IsMissing contains_missing_data );
using IDatabaseTableInsert4DBCell<ALLOC,
!std::is_same<T_DATA,DBCell>::value>::insertRows;
......@@ -1018,7 +1024,8 @@ namespace gum {
* copied as is into the database table.
* @throw SizeError is raised if the size of at least one row in new_rows
* is not equal to the number of columns retained in the IDatabaseTable */
virtual void insertRows( Matrix<T_DATA>&& new_rows );
virtual void insertRows( Matrix<T_DATA>&& new_rows,
const DBVector<IsMissing>& rows_have_missing_vals );
/// insert a set of new DBRows at the end of the database
/** Unlike methods insertRows for data whose type is different from T_DATA,
......@@ -1027,7 +1034,8 @@ namespace gum {
* copied as is into the database table.
* @throw SizeError is raised if the size of at least one row in new_rows
* is not equal to the number of columns retained in the IDatabaseTable */
virtual void insertRows( const Matrix<T_DATA>& new_rows );
virtual void insertRows( const Matrix<T_DATA>& new_rows,
const DBVector<IsMissing>& rows_have_missing_vals );
/// erase a given row specified by its index in the table
/** In the database, rows are indexed, starting from 0.
......@@ -1103,6 +1111,9 @@ namespace gum {
// the set of string corresponding to missing values
DBVector<std::string> __missing_symbols;
// a vector indicating, for each row, whether it contains missing values
// (IsMissing::True) or not (IsMissing::False)
DBVector<IsMissing> __has_row_missing_val;
// the list of handlers currently attached to the database
/* this is useful when the database is resized */
mutable DBVector<HandlerSafe*> __list_of_safe_handlers;
......
......@@ -516,6 +516,7 @@ namespace gum {
, _variable_names ( alloc )
, __data ( alloc )
, __missing_symbols ( alloc )
, __has_row_missing_val ( alloc )
, __list_of_safe_handlers ( alloc ) {
// copy the names
_variable_names.reserve ( var_names.size () );
......@@ -543,6 +544,7 @@ namespace gum {
, _variable_names( from._variable_names, alloc )
, __data( from.__data, alloc )
, __missing_symbols ( from.__missing_symbols, alloc )
, __has_row_missing_val ( from.__has_row_missing_val, alloc )
, __list_of_safe_handlers ( alloc ) {
// create the end iterators
__createEndIterators ();
......@@ -567,6 +569,7 @@ namespace gum {
, _variable_names( std::move( from._variable_names ), alloc )
, __data( std::move( from.__data ), alloc )
, __missing_symbols ( std::move ( from.__missing_symbols ), alloc )
, __has_row_missing_val ( std::move( from.__has_row_missing_val ), alloc )
, __list_of_safe_handlers ( alloc ) {
// create the end iterators
__createEndIterators ();
......@@ -627,6 +630,7 @@ namespace gum {
__data = from.__data;
_variable_names = from._variable_names;
__missing_symbols = from.__missing_symbols;
__has_row_missing_val = from.__has_row_missing_val;
// update the end iterators
const std::size_t db_size = __data.size();
......@@ -661,6 +665,7 @@ namespace gum {
__data = std::move( from.__data );
_variable_names = std::move( from._variable_names );
__missing_symbols = std::move ( from.__missing_symbols );
__has_row_missing_val = std::move ( from.__has_row_missing_val );
// update the end iterators
const std::size_t db_size = __data.size();
......@@ -881,31 +886,43 @@ namespace gum {
// insert a new DBRow at the end of the database
// The row is moved at the end of __data and contains_missing is appended to
// __has_row_missing_val, so that both vectors hold one entry per row.
// A SizeError is raised when _isRowSizeOK rejects the size of new_row.
template <typename T_DATA, template<typename> class ALLOC>
void IDatabaseTable<T_DATA,ALLOC>::insertRow (
typename IDatabaseTable<T_DATA,ALLOC>::template Row<T_DATA>&& new_row ) {
typename IDatabaseTable<T_DATA,ALLOC>::template Row<T_DATA>&& new_row,
const typename IDatabaseTable<T_DATA,ALLOC>::IsMissing contains_missing ) {
// check that the size of the row is the same as the rest of the database
if ( ! _isRowSizeOK ( new_row.size () ) )
GUM_ERROR( SizeError,
"the new row has not the same size as the "
"rest of the database" );
// the handlers are given the size the database will have once the row is
// stored (presumably to move their end position -- confirm in
// __updateHandlers)
__updateHandlers( __data.size() + 1 );
__data.push_back( std::move( new_row ) );
// keep __data and __has_row_missing_val the same length: if storing the
// flag fails, the row just added is removed and the exception is rethrown
try {
__has_row_missing_val.push_back( contains_missing );
}
catch ( ... ) {
// NOTE(review): the handlers were already updated to the new size above
// and are not reset here -- confirm whether __updateHandlers should be
// called again when rolling back.
__data.pop_back ();
throw;
}
}
// insert a new DBRow at the end of the database
// The row passed in argument is copied into a temporary Row<T_DATA>, which
// is then forwarded, together with contains_missing, to the insertRow
// overload that takes its row by rvalue reference.
template <typename T_DATA, template<typename> class ALLOC>
INLINE void IDatabaseTable<T_DATA,ALLOC>::insertRow
( const typename IDatabaseTable<T_DATA,ALLOC>::template Row<T_DATA>& row ) {
( const typename IDatabaseTable<T_DATA,ALLOC>::template Row<T_DATA>& row,
const typename IDatabaseTable<T_DATA,ALLOC>::IsMissing contains_missing ) {
this->insertRow ( typename IDatabaseTable<T_DATA,ALLOC>::template
Row<T_DATA> ( row ) );
Row<T_DATA> ( row ), contains_missing );
}
// insert a set of new DBRow at the end of the database
template <typename T_DATA, template<typename> class ALLOC>
void IDatabaseTable<T_DATA,ALLOC>::insertRows
( typename IDatabaseTable<T_DATA,ALLOC>::template Matrix<T_DATA>&& new_rows ) {
( typename IDatabaseTable<T_DATA,ALLOC>::template Matrix<T_DATA>&& new_rows,
const typename IDatabaseTable<T_DATA,ALLOC>::template
DBVector<typename IDatabaseTable<T_DATA,ALLOC>::IsMissing>&
rows_have_missing_vals ) {
if ( new_rows.empty() ) return;
// check that all the rows have the same size
......@@ -928,9 +945,18 @@ namespace gum {
"number of columns in the database" );
}
std::size_t nb_new_rows = new_rows.size ();
try {
for ( auto row : new_rows ) {
__data.push_back( std::move( row ) );
for ( std::size_t i = std::size_t(0); i < nb_new_rows; ++i ) {
__data.push_back( std::move( new_rows[i] ) );
try {
__has_row_missing_val.push_back( rows_have_missing_vals[i] );
}
catch ( ... ) {
__data.pop_back ();
throw;
}
++db_size;
}
}
......@@ -947,7 +973,10 @@ namespace gum {
template <typename T_DATA, template<typename> class ALLOC>
void IDatabaseTable<T_DATA,ALLOC>::insertRows
( const typename IDatabaseTable<T_DATA,ALLOC>::template
Matrix<T_DATA>& new_rows ) {
Matrix<T_DATA>& new_rows,
const typename IDatabaseTable<T_DATA,ALLOC>::template
DBVector<typename IDatabaseTable<T_DATA,ALLOC>::IsMissing>&
rows_have_missing_vals ) {
if ( new_rows.empty() ) return;
// check that all the rows have the same size
......@@ -971,9 +1000,18 @@ namespace gum {
"number of columns in the database" );
}
std::size_t nb_new_rows = new_rows.size ();
try {
for ( const auto& row : new_rows ) {
__data.push_back( row );
for ( std::size_t i = std::size_t(0); i < nb_new_rows; ++i ) {
__data.push_back( new_rows[i] );
try {
__has_row_missing_val.push_back( rows_have_missing_vals[i] );
}
catch ( ... ) {
__data.pop_back ();
throw;
}
++db_size;
}
}
......@@ -994,6 +1032,7 @@ namespace gum {
if ( index < db_size ) {
__updateHandlers( db_size - 1 );
__data.erase( __data.begin() + index );
__has_row_missing_val.erase( __has_row_missing_val.begin() + index );
}
}
......@@ -1006,6 +1045,7 @@ namespace gum {
if ( db_size ) {
__updateHandlers( db_size - 1 );
__data.pop_back();
__has_row_missing_val.pop_back();
}
}
......@@ -1018,6 +1058,7 @@ namespace gum {
if ( db_size ) {
__updateHandlers( db_size - 1 );
__data.erase( __data.begin() );
__has_row_missing_val.erase( __has_row_missing_val.begin() );
}
}
......@@ -1027,6 +1068,7 @@ namespace gum {
INLINE void IDatabaseTable<T_DATA,ALLOC>::eraseAllRows() {
__updateHandlers( 0 );
__data.clear();
__has_row_missing_val.clear();
}
......@@ -1041,6 +1083,8 @@ namespace gum {
} else {
__updateHandlers( db_size - nb_rows );
__data.erase( __data.begin(), __data.begin() + nb_rows );
__has_row_missing_val.erase( __has_row_missing_val.begin(),
__has_row_missing_val.begin() + nb_rows );
}
}
......@@ -1057,6 +1101,9 @@ namespace gum {
__updateHandlers( db_size - nb_rows );
__data.erase( __data.begin() + ( db_size - nb_rows ),
__data.begin() + db_size );
__has_row_missing_val.erase( __has_row_missing_val.begin() +
( db_size - nb_rows ),
__has_row_missing_val.begin() + db_size );
}
}
......@@ -1078,6 +1125,8 @@ namespace gum {
} else {
__updateHandlers( db_size - ( end - deb ) );
__data.erase( __data.begin() + deb, __data.begin() + end );
__has_row_missing_val.erase( __has_row_missing_val.begin() + deb,
__has_row_missing_val.begin() + end );
}
}
......@@ -1087,6 +1136,7 @@ namespace gum {
INLINE void IDatabaseTable<T_DATA,ALLOC>::clear() {
__updateHandlers( 0 );
__data.clear();
__has_row_missing_val.clear();
_variable_names.clear();
}
......
......@@ -201,6 +201,9 @@ namespace gum {
/// the safe handler type
using HandlerSafe =
typename IDatabaseTable<DBTranslatedValue,ALLOC>::HandlerSafe;
/// the type used to flag whether a row contains missing values
using IsMissing =
typename IDatabaseTable<DBTranslatedValue,ALLOC>::IsMissing;
/// Types for STL compliance.
/// @{
......@@ -554,7 +557,9 @@ namespace gum {
* basically, it could be copied as is into the database table.
* @throw SizeError is raised if the size of the new_row is not equal to
* the number of translators of the DatabaseTable */
virtual void insertRow( Row<DBTranslatedValue>&& new_row ) final;
virtual void
insertRow( Row<DBTranslatedValue>&& new_row,
const IsMissing contains_missing_data ) final;
/// insert a new row at the end of the database
/** Unlike methods insertRow for data whose type is different from
......@@ -563,7 +568,9 @@ namespace gum {
* basically, it could be copied as is into the database table.
* @throw SizeError is raised if the size of the new_row is not equal to
* the number of translators of the DatabaseTable */
virtual void insertRow( const Row<DBTranslatedValue>& new_row ) final;
virtual void
insertRow( const Row<DBTranslatedValue>& new_row,
const IsMissing contains_missing_data ) final;
/// insert a new DBRow of DBCells at the end of the database
/** The new_row passed in argument is supposed to come from an external
......@@ -590,7 +597,9 @@ namespace gum {
* these rows could be copied as is into the database table.
* @throw SizeError is raised if the size of at least one row in new_rows
* is not equal to the number of translators in the DatabaseTable */
virtual void insertRows( Matrix<DBTranslatedValue>&& new_rows ) final;
virtual void
insertRows( Matrix<DBTranslatedValue>&& new_rows,
const DBVector<IsMissing>& rows_have_missing_vals ) final;
/// insert a set of new DBRows at the end of the database
/** Unlike methods insertRows for data whose type is different from
......@@ -599,7 +608,9 @@ namespace gum {
* these rows could be copied as is into the database table.
* @throw SizeError is raised if the size of at least one row in new_rows
* is not equal to the number of translators in the DatabaseTable */
virtual void insertRows( const Matrix<DBTranslatedValue>& new_rows ) final;
virtual void
insertRows( const Matrix<DBTranslatedValue>& new_rows,
const DBVector<IsMissing>& rows_have_missing_vals ) final;
/// insert a set of new DBRows at the end of the database
/** The new rows passed in argument are supposed to come from an external
......
......@@ -616,11 +616,16 @@ namespace gum {
const std::size_t nb_trans = __translators.size ();
Row<DBTranslatedValue> dbrow;
dbrow.reserve ( nb_trans );
bool has_missing_val = false;
for ( std::size_t i = std::size_t (0); i < nb_trans; ++i ) {
dbrow.pushBack ( __translators.translate ( new_row, i ) );
const DBTranslatedValue new_val ( __translators.translate ( new_row, i ) );
if ( __translators.isMissingValue ( new_val, i ) )
has_missing_val = true;
dbrow.pushBack ( new_val );
}
this->insertRow ( std::move ( dbrow ) );
this->insertRow ( std::move ( dbrow ),
has_missing_val ? IsMissing::True : IsMissing::False );
}
......@@ -664,7 +669,8 @@ namespace gum {
/// insert a new DBRow at the end of the database
template <template<typename> class ALLOC>
INLINE void DatabaseTable<ALLOC>::insertRow(
typename DatabaseTable<ALLOC>::template Row<DBTranslatedValue>&& new_row ) {
typename DatabaseTable<ALLOC>::template Row<DBTranslatedValue>&& new_row,
const typename DatabaseTable<ALLOC>::IsMissing contains_missing_data ) {
// check that the new rows values are compatible with the values of
// the variables stored within the translators
if ( ! __isRowCompatible ( new_row ) ) {
......@@ -672,7 +678,8 @@ namespace gum {
"the new row is not compatible with the current translators" );
}
IDatabaseTable<DBTranslatedValue,ALLOC>::insertRow ( std::move ( new_row ) );
IDatabaseTable<DBTranslatedValue,ALLOC>::insertRow ( std::move ( new_row ),
contains_missing_data );
}
......@@ -680,7 +687,8 @@ namespace gum {
template <template<typename> class ALLOC>
INLINE void DatabaseTable<ALLOC>::insertRow(
const typename DatabaseTable<ALLOC>::template
Row<DBTranslatedValue>& new_row ) {
Row<DBTranslatedValue>& new_row,
const typename DatabaseTable<ALLOC>::IsMissing contains_missing_data ) {
// check that the new rows values are compatible with the values of
// the variables stored within the translators
if ( ! __isRowCompatible ( new_row ) ) {
......@@ -688,7 +696,8 @@ namespace gum {
"the new row is not compatible with the current translators" );
}
IDatabaseTable<DBTranslatedValue,ALLOC>::insertRow ( new_row );
IDatabaseTable<DBTranslatedValue,ALLOC>::insertRow ( new_row,
contains_missing_data );
}
......@@ -731,7 +740,9 @@ namespace gum {
/// insert a set of new DBRows at the end of the database
template <template<typename> class ALLOC>
void DatabaseTable<ALLOC>::insertRows(
typename DatabaseTable<ALLOC>::template Matrix<DBTranslatedValue>&& rows ) {
typename DatabaseTable<ALLOC>::template Matrix<DBTranslatedValue>&& rows,
const typename DatabaseTable<ALLOC>::template DBVector<IsMissing>&
rows_have_missing_vals ) {
// check that the new rows values are compatible with the values of
// the variables stored within the translators
for ( const auto& new_row : rows ) {
......@@ -741,7 +752,8 @@ namespace gum {
}
}
IDatabaseTable<DBTranslatedValue,ALLOC>::insertRows ( std::move ( rows ) );
IDatabaseTable<DBTranslatedValue,ALLOC>::insertRows(std::move ( rows ),
rows_have_missing_vals);
}
......@@ -749,7 +761,9 @@ namespace gum {
template <template<typename> class ALLOC>
void DatabaseTable<ALLOC>::insertRows(
const typename DatabaseTable<ALLOC>::template
Matrix<DBTranslatedValue>& new_rows ) {
Matrix<DBTranslatedValue>& new_rows,
const typename DatabaseTable<ALLOC>::template DBVector<IsMissing>&
rows_have_missing_vals ) {
// check that the new rows values are compatible with the values of
// the variables stored within the translators
for ( const auto& new_row : new_rows ) {
......@@ -759,7 +773,8 @@ namespace gum {
}
}
IDatabaseTable<DBTranslatedValue,ALLOC>::insertRows ( new_rows );
IDatabaseTable<DBTranslatedValue,ALLOC>::insertRows(new_rows,
rows_have_missing_vals);
}
......
......@@ -134,6 +134,8 @@ namespace gum {
/// the safe handler type
using HandlerSafe = typename IDatabaseTable<DBCell,ALLOC>::HandlerSafe;
/// the type used to flag whether a row contains missing values
using IsMissing = typename IDatabaseTable<DBCell,ALLOC>::IsMissing;
/// Types for STL compliance.
/// @{
using value_type = Row<DBCell>;
......
......@@ -365,29 +365,39 @@ namespace gum {
// create the dbrow that will contain the new data
Row<DBCell> dbrow;
dbrow.reserve ( row_size - ignored_size );
bool has_missing_val = false;
// translate the row into T_data and put them into the newly created dbrow
if ( ignored_size == 0 ) {
for ( const auto& elt : new_row )
dbrow.pushBack ( this->__convert ( elt ) );
for ( const auto& elt : new_row ) {
const DBCell new_cell ( this->__convert ( elt ) );
if ( new_cell.isMissing () ) has_missing_val = true;
dbrow.pushBack ( new_cell );
}
}
else {
for ( std::size_t i = std::size_t (0), j = std::size_t (0);
i < row_size; ++i ) {
if ( i != __ignored_cols[j] ) {
dbrow.pushBack ( this->__convert ( new_row[i] ) );
const DBCell new_cell ( this->__convert ( new_row[i] ) );
if ( new_cell.isMissing () ) has_missing_val = true;
dbrow.pushBack ( new_cell );
}
else {
if ( ++j == ignored_size ) {
for ( ++i; i < row_size; ++i ) {
dbrow.pushBack ( this->__convert ( new_row[i] ) );
const DBCell new_cell ( this->__convert ( new_row[i] ) );
if ( new_cell.isMissing () ) has_missing_val = true;
dbrow.pushBack ( new_cell );
}
}
}
}
}
IDatabaseTable<DBCell,ALLOC>::insertRow ( std::move ( dbrow ) );
IDatabaseTable<DBCell,ALLOC>::insertRow (
std::move ( dbrow ),
has_missing_val ? IsMissing::True : IsMissing::False );
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment