Commit d5bd278b authored by Mark Abraham's avatar Mark Abraham
Browse files

Removed unnecessary inter-simulation signalling

Generally, multi-simulation runs do not need to couple the simulations
(discussion at #692). Individual algorithms implemented with
multi-simulations might need to do so, but should take care of their
own details, and now do. Scaling should improve in the cases where
simulations are now decoupled.

It is unclear what the expected behaviour of a multi-simulation should
be if the user supplies any of the possible non-uniform distributions
of init_step and nsteps, sourced from any of .mdp, .cpt or command
line. Instead, we report on the non-uniformity and proceed. It's
always possible that the user knows what they are doing. In
particular, now that multi-simulations are no longer explicitly
coupled, any heterogeneity in the execution environment will lead to
checkpoints and -maxh acting at different time steps, unless a
user-selected algorithm requires that the simulations stay coordinated
(e.g. REMD or ensemble restraints).

In the implementation of signalling, we have stopped checking gs for
NULL as a proxy for whether we should be doing signalling at that
communication phase. Replaced with a helper object in which explicit
flags are set. Added unit tests of that functionality.

Improved documentation of check_nstglobalcomm. mdrun now reports the
number of steps between intra-simulation communication to the
log file.

Noted minor TODOs for future cleanup.

Added some trivial test cases for termination by maxh in normal-MD,
multi-sim and REMD cases. Refactored multi-sim tests to make this
possible without duplication. This is complicated by the way filenames
get changed by mdrun -multi by the former par_fn, so cleaned up the
way that is handled so it can work and be re-used better. Introduced
mdrun integration-test object library to make that build system work a
little better. Made some minor improvements to Doxygen setup for
integration tests.

Fixes #860, #692, #1857, #1942.

Change-Id: I5f7b98f331db801b058ae2b196d79716b5912b09
parent 628f3b1c
......@@ -351,10 +351,13 @@ cases.
the minimum of :mdp:`nstcalcenergy` and :mdp:`nstlist`.
``mdrun -gcom`` sets the number of steps that must elapse between
such communication phases, which can improve performance when
running on a lot of nodes. Note that this means that _e.g._
running on a lot of ranks. Note that this means that _e.g._
temperature coupling algorithms will
effectively remain at constant energy until the next global
communication phase.
effectively remain at constant energy until the next
communication phase. :ref:`gmx mdrun` will always honor the
setting of ``mdrun -gcom``, by changing :mdp:`nstcalcenergy`,
:mdp:`nstenergy`, :mdp:`nstlog`, :mdp:`nsttcouple` and/or
:mdp:`nstpcouple` if necessary.
Note that ``-tunepme`` has more effect when there is more than one
:term:`node`, because the cost of communication for the PP and PME
......
......@@ -61,6 +61,9 @@
#include "gromacs/utility/pleasecite.h"
#include "gromacs/utility/smalloc.h"
// TODO This implementation of ensemble orientation restraints is nasty because
// a user can't just do multi-sim with single-sim orientation restraints.
void init_orires(FILE *fplog, const gmx_mtop_t *mtop,
rvec xref[],
const t_inputrec *ir,
......
......@@ -44,6 +44,8 @@
#include <cstdlib>
#include <cstring>
#include <string>
#include "gromacs/commandline/filenm.h"
#include "gromacs/fileio/gmxfio.h"
#include "gromacs/gmxlib/network.h"
......@@ -54,52 +56,17 @@
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/futil.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/path.h"
#include "gromacs/utility/programcontext.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/snprintf.h"
#include "gromacs/utility/stringutil.h"
#include "gromacs/utility/sysinfo.h"
/* The source code in this file should be thread-safe.
Please keep it that way. */
#define BUFSIZE 1024
static void par_fn(char *base, int ftp, const t_commrec *cr,
gmx_bool bAppendSimId, gmx_bool bAppendNodeId,
char buf[], int bufsize)
{
if (static_cast<std::size_t>(bufsize) < (std::strlen(base)+10))
{
gmx_mem("Character buffer too small!");
}
/* Copy to buf, and strip extension */
std::strcpy(buf, base);
buf[strlen(base) - std::strlen(ftp2ext(fn2ftp(base))) - 1] = '\0';
if (bAppendSimId)
{
sprintf(buf+strlen(buf), "%d", cr->ms->sim);
}
if (bAppendNodeId)
{
std::strcat(buf, "_rank");
sprintf(buf+strlen(buf), "%d", cr->nodeid);
}
std::strcat(buf, ".");
/* Add extension again */
std::strcat(buf, (ftp == efTPR) ? "tpr" : (ftp == efEDR) ? "edr" : ftp2ext(ftp));
if (debug)
{
fprintf(debug, "rank %d par_fn '%s'\n", cr->nodeid, buf);
if (fn2ftp(buf) == efLOG)
{
fprintf(debug, "log\n");
}
}
}
// TODO move this to multi-sim module
void check_multi_int(FILE *log, const gmx_multisim_t *ms, int val,
const char *name,
gmx_bool bQuiet)
......@@ -151,6 +118,7 @@ void check_multi_int(FILE *log, const gmx_multisim_t *ms, int val,
sfree(ibuf);
}
// TODO move this to multi-sim module
void check_multi_int64(FILE *log, const gmx_multisim_t *ms,
gmx_int64_t val, const char *name,
gmx_bool bQuiet)
......@@ -189,6 +157,8 @@ void check_multi_int64(FILE *log, const gmx_multisim_t *ms,
}
else
{
// TODO Part of this error message would also be good to go to
// stderr (from one rank of one sim only)
if (NULL != log)
{
fprintf(log, "\n%s is not equal for all subsystems\n", name);
......@@ -267,12 +237,12 @@ void gmx_log_close(FILE *fp)
}
}
// TODO move this to multi-sim module
void init_multisystem(t_commrec *cr, int nsim, char **multidirs,
int nfile, const t_filenm fnm[], gmx_bool bParFn)
int nfile, const t_filenm fnm[])
{
gmx_multisim_t *ms;
int nnodes, nnodpersim, sim, i, ftp;
char buf[256];
int nnodes, nnodpersim, sim, i;
#if GMX_MPI
MPI_Group mpi_group_world;
int *rank;
......@@ -359,23 +329,30 @@ void init_multisystem(t_commrec *cr, int nsim, char **multidirs,
}
gmx_chdir(multidirs[cr->ms->sim]);
}
else if (bParFn)
else
{
/* Patch output and tpx, cpt and rerun input file names */
for (i = 0; (i < nfile); i++)
try
{
/* Because of possible multiple extensions per type we must look
* at the actual file name
*/
if (is_output(&fnm[i]) ||
fnm[i].ftp == efTPR || fnm[i].ftp == efCPT ||
strcmp(fnm[i].opt, "-rerun") == 0)
std::string rankString = gmx::formatString("%d", cr->ms->sim);
/* Patch output and tpx, cpt and rerun input file names */
for (i = 0; (i < nfile); i++)
{
ftp = fn2ftp(fnm[i].fns[0]);
par_fn(fnm[i].fns[0], ftp, cr, TRUE, FALSE, buf, 255);
sfree(fnm[i].fns[0]);
fnm[i].fns[0] = gmx_strdup(buf);
/* Because of possible multiple extensions per type we must look
* at the actual file name for rerun. */
if (is_output(&fnm[i]) ||
fnm[i].ftp == efTPR || fnm[i].ftp == efCPT ||
strcmp(fnm[i].opt, "-rerun") == 0)
{
std::string newFileName = gmx::Path::concatenateBeforeExtension(fnm[i].fns[0], rankString);
sfree(fnm[i].fns[0]);
fnm[i].fns[0] = gmx_strdup(newFileName.c_str());
}
}
}
catch (gmx::GromacsException &e)
{
e.prependContext(gmx::formatString("Failed to modify mdrun -multi filename to add per-simulation suffix. You could perhaps reorganize your files and try mdrun -multidir.\n"));
throw;
}
}
}
......@@ -3,7 +3,7 @@
*
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* Copyright (c) 2001-2004, The GROMACS development team.
* Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
* Copyright (c) 2013,2014,2015,2016, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
......@@ -67,11 +67,10 @@ void check_multi_int64(FILE *log, const gmx_multisim_t *ms,
*/
void init_multisystem(t_commrec *cr, int nsim, char **multidirs,
int nfile, const t_filenm fnm[], gmx_bool bParFn);
int nfile, const t_filenm fnm[]);
/* Splits the communication into nsim separate simulations
* and creates a communication structure between the master
* these simulations.
* If bParFn is set, the nodeid is appended to the tpx and each output file.
*/
#endif
......@@ -49,8 +49,8 @@
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/mdrun.h"
#include "gromacs/mdlib/mdrun_signalling.h"
#include "gromacs/mdlib/sim_util.h"
#include "gromacs/mdlib/simulationsignal.h"
#include "gromacs/mdlib/tgroup.h"
#include "gromacs/mdlib/update.h"
#include "gromacs/mdlib/vcm.h"
......@@ -68,48 +68,36 @@
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/snprintf.h"
/* check which of the multisim simulations has the shortest number of
steps and return that number of nsteps */
gmx_int64_t get_multisim_nsteps(const t_commrec *cr,
gmx_int64_t nsteps)
// TODO move this to multi-sim module
bool multisim_int_all_are_equal(const gmx_multisim_t *ms,
gmx_int64_t value)
{
gmx_int64_t steps_out;
bool allValuesAreEqual = true;
gmx_int64_t *buf;
if (MASTER(cr))
{
gmx_int64_t *buf;
int s;
snew(buf, cr->ms->nsim);
GMX_RELEASE_ASSERT(ms, "Invalid use of multi-simulation pointer");
buf[cr->ms->sim] = nsteps;
gmx_sumli_sim(cr->ms->nsim, buf, cr->ms);
steps_out = -1;
for (s = 0; s < cr->ms->nsim; s++)
{
/* find the smallest positive number */
if (buf[s] >= 0 && ((steps_out < 0) || (buf[s] < steps_out)) )
{
steps_out = buf[s];
}
}
sfree(buf);
snew(buf, ms->nsim);
/* send our value to all other master ranks, receive all of theirs */
buf[ms->sim] = value;
gmx_sumli_sim(ms->nsim, buf, ms);
/* if we're the limiting simulation, don't do anything */
if (steps_out >= 0 && steps_out < nsteps)
for (int s = 0; s < ms->nsim; s++)
{
if (buf[s] != value)
{
char strbuf[255];
snprintf(strbuf, 255, "Will stop simulation %%d after %s steps (another simulation will end then).\n", "%" GMX_PRId64);
fprintf(stderr, strbuf, cr->ms->sim, steps_out);
allValuesAreEqual = false;
break;
}
}
/* broadcast to non-masters */
gmx_bcast(sizeof(gmx_int64_t), &steps_out, cr);
return steps_out;
sfree(buf);
return allValuesAreEqual;
}
int multisim_min(const gmx_multisim_t *ms, int nmin, int n)
......@@ -158,33 +146,6 @@ int multisim_min(const gmx_multisim_t *ms, int nmin, int n)
return nmin;
}
int multisim_nstsimsync(const t_commrec *cr,
const t_inputrec *ir, int repl_ex_nst)
{
int nmin;
if (MASTER(cr))
{
nmin = INT_MAX;
nmin = multisim_min(cr->ms, nmin, ir->nstlist);
nmin = multisim_min(cr->ms, nmin, ir->nstcalcenergy);
nmin = multisim_min(cr->ms, nmin, repl_ex_nst);
if (nmin == INT_MAX)
{
gmx_fatal(FARGS, "Can not find an appropriate interval for inter-simulation communication, since nstlist, nstcalcenergy and -replex are all <= 0");
}
/* Avoid inter-simulation communication at every (second) step */
if (nmin <= 2)
{
nmin = 10;
}
}
gmx_bcast(sizeof(int), &nmin, cr);
return nmin;
}
void copy_coupling_state(t_state *statea, t_state *stateb,
gmx_ekindata_t *ekinda, gmx_ekindata_t *ekindb, t_grpopts* opts)
{
......@@ -280,7 +241,7 @@ void compute_globals(FILE *fplog, gmx_global_stat *gstat, t_commrec *cr, t_input
t_nrnb *nrnb, t_vcm *vcm, gmx_wallcycle_t wcycle,
gmx_enerdata_t *enerd, tensor force_vir, tensor shake_vir, tensor total_vir,
tensor pres, rvec mu_tot, gmx_constr_t constr,
struct gmx_signalling_t *gs, gmx_bool bInterSimGS,
gmx::SimulationSignaller *signalCoordinator,
matrix box, int *totalNumberOfBondedInteractions,
gmx_bool *bSumEkinhOld, int flags)
{
......@@ -345,7 +306,7 @@ void compute_globals(FILE *fplog, gmx_global_stat *gstat, t_commrec *cr, t_input
}
else
{
gmx::ArrayRef<real> signalBuffer = prepareSignalBuffer(gs);
gmx::ArrayRef<real> signalBuffer = signalCoordinator->getCommunicationBuffer();
if (PAR(cr))
{
wallcycle_start(wcycle, ewcMoveE);
......@@ -356,7 +317,7 @@ void compute_globals(FILE *fplog, gmx_global_stat *gstat, t_commrec *cr, t_input
*bSumEkinhOld, flags);
wallcycle_stop(wcycle, ewcMoveE);
}
handleSignals(gs, cr, bInterSimGS);
signalCoordinator->finalizeSignals();
*bSumEkinhOld = FALSE;
}
}
......@@ -561,7 +522,7 @@ static int lcd4(int i1, int i2, int i3, int i4)
min_zero(&nst, i4);
if (nst == 0)
{
gmx_incons("All 4 inputs for determininig nstglobalcomm are <= 0");
gmx_incons("All 4 inputs for determining nstglobalcomm are <= 0");
}
while (nst > 1 && ((i1 > 0 && i1 % nst != 0) ||
......@@ -585,11 +546,15 @@ int check_nstglobalcomm(FILE *fplog, t_commrec *cr,
if (nstglobalcomm == -1)
{
// Set up the default behaviour
if (!(ir->nstcalcenergy > 0 ||
ir->nstlist > 0 ||
ir->etc != etcNO ||
ir->epc != epcNO))
{
/* The user didn't choose the period for anything
important, so we just make sure we can send signals and
write output suitably. */
nstglobalcomm = 10;
if (ir->nstenergy > 0 && ir->nstenergy < nstglobalcomm)
{
......@@ -598,8 +563,14 @@ int check_nstglobalcomm(FILE *fplog, t_commrec *cr,
}
else
{
/* Ensure that we do timely global communication for
* (possibly) each of the four following options.
/* The user has made a choice (perhaps implicitly), so we
* ensure that we do timely intra-simulation communication
* for (possibly) each of the four parts that care.
*
* TODO Does the Verlet scheme (+ DD) need any
* communication at nstlist steps? Is the use of nstlist
* here a leftover of the twin-range scheme? Can we remove
* nstlist when we remove the group scheme?
*/
nstglobalcomm = lcd4(ir->nstcalcenergy,
ir->nstlist,
......@@ -609,6 +580,7 @@ int check_nstglobalcomm(FILE *fplog, t_commrec *cr,
}
else
{
// Check that the user's choice of mdrun -gcom will work
if (ir->nstlist > 0 &&
nstglobalcomm > ir->nstlist && nstglobalcomm % ir->nstlist != 0)
{
......@@ -645,6 +617,10 @@ int check_nstglobalcomm(FILE *fplog, t_commrec *cr,
ir->nstcomm = nstglobalcomm;
}
if (fplog)
{
fprintf(fplog, "Intra-simulation communication will occur every %d steps.\n", nstglobalcomm);
}
return nstglobalcomm;
}
......
......@@ -54,6 +54,13 @@ struct t_nrnb;
struct t_state;
struct t_trxframe;
namespace gmx
{
class SimulationSignaller;
}
/* Define a number of flags to better control the information
* passed to compute_globals in md.c and global_stat.
*/
......@@ -82,9 +89,13 @@ struct t_trxframe;
#define CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS (1<<12)
/* return the number of steps between global communcations */
int check_nstglobalcomm(FILE *fplog, t_commrec *cr,
int nstglobalcomm, t_inputrec *ir);
/*! \brief Return the number of steps that will take place between
* intra-simulation communications, given the constraints of the
* inputrec and the value of mdrun -gcom. */
int check_nstglobalcomm(FILE *fplog,
t_commrec *cr,
int nstglobalcomm,
t_inputrec *ir);
/* check whether an 'nst'-style parameter p is a multiple of nst, and
set it to be one if not, with a warning. */
......@@ -92,10 +103,11 @@ void check_nst_param(FILE *fplog, t_commrec *cr,
const char *desc_nst, int nst,
const char *desc_p, int *p);
/* check which of the multisim simulations has the shortest number of
steps and return that number of nsteps */
gmx_int64_t get_multisim_nsteps(const t_commrec *cr,
gmx_int64_t nsteps);
/*! \brief Return true if the \p value is equal across the set of multi-simulations
*
* \todo This duplicates some of check_multi_int. Consolidate. */
bool multisim_int_all_are_equal(const gmx_multisim_t *ms,
gmx_int64_t value);
void rerun_parallel_comm(t_commrec *cr, t_trxframe *fr,
gmx_bool *bNotLastFrame);
......@@ -111,10 +123,6 @@ void set_current_lambdas(gmx_int64_t step, t_lambda *fepvals, gmx_bool bRerunMD,
int multisim_min(const gmx_multisim_t *ms, int nmin, int n);
/* Set an appropriate value for n across the whole multi-simulation */
int multisim_nstsimsync(const t_commrec *cr,
const t_inputrec *ir, int repl_ex_nst);
/* Determine the interval for inter-simulation communication */
void copy_coupling_state(t_state *statea, t_state *stateb,
gmx_ekindata_t *ekinda, gmx_ekindata_t *ekindb, t_grpopts* opts);
/* Copy stuff from state A to state B */
......@@ -125,7 +133,7 @@ void compute_globals(FILE *fplog, gmx_global_stat *gstat, t_commrec *cr, t_input
t_nrnb *nrnb, t_vcm *vcm, gmx_wallcycle_t wcycle,
gmx_enerdata_t *enerd, tensor force_vir, tensor shake_vir, tensor total_vir,
tensor pres, rvec mu_tot, gmx_constr *constr,
gmx_signalling_t *gs, gmx_bool bInterSimGS,
gmx::SimulationSignaller *signalCoordinator,
matrix box, int *totalNumberOfBondedInteractions,
gmx_bool *bSumEkinhOld, int flags);
/* Compute global variables during integration */
......
......@@ -3,7 +3,7 @@
*
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* Copyright (c) 2001-2004, The GROMACS development team.
* Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
* Copyright (c) 2013,2014,2015,2016, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
......@@ -36,11 +36,15 @@
*/
/*! \internal \file
*
* \brief This file defines functions for inter-rank signalling by mdrun.
* \brief This file defines functions for inter- and intra-simulation
* signalling by mdrun.
*
* This handles details of responding to termination conditions,
* coordinating checkpoints, and coordinating multi-simulations.
*
* \todo Move this to mdrunutility module alongside gathering
* multi-simulation communication infrastructure there.
*
* \author Berk Hess <hess@kth.se>
* \author Mark Abraham <mark.j.abraham@gmail.com>
* \ingroup module_mdlib
......@@ -48,54 +52,38 @@
#include "gmxpre.h"
#include "mdrun_signalling.h"
#include "simulationsignal.h"
#include <algorithm>
#include "gromacs/gmxlib/network.h"
#include "gromacs/mdlib/md_support.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/real.h"
void init_global_signals(struct gmx_signalling_t *gs, const t_commrec *cr,
const t_inputrec *ir, int repl_ex_nst)
namespace gmx
{
int i;
if (MULTISIM(cr))
{
gs->nstms = multisim_nstsimsync(cr, ir, repl_ex_nst);
if (debug)
{
fprintf(debug, "Syncing simulations for checkpointing and termination every %d steps\n", gs->nstms);
}
}
else
{
gs->nstms = 1;
}
for (i = 0; i < eglsNR; i++)
{
gs->sig[i] = 0;
gs->set[i] = 0;
}
}
SimulationSignaller::SimulationSignaller(SimulationSignals *signals,
const t_commrec *cr,
bool doInterSim,
bool doIntraSim)
: signals_(signals), cr_(cr),
doInterSim_(doInterSim),
doIntraSim_(doInterSim || doIntraSim),
mpiBuffer_ {}
{}
gmx::ArrayRef<real>
prepareSignalBuffer(struct gmx_signalling_t *gs)
SimulationSignaller::getCommunicationBuffer()
{
if (gs)
if (doIntraSim_)
{
gmx::ArrayRef<signed char> sig(gs->sig);
gmx::ArrayRef<real> temp(gs->mpiBuffer);
std::copy(sig.begin(), sig.end(), temp.begin());
std::transform(std::begin(*signals_), std::end(*signals_), std::begin(mpiBuffer_),
[](const SimulationSignals::value_type &s) { return s.sig; });
return temp;
return gmx::ArrayRef<real>::fromArray(mpiBuffer_.data(), mpiBuffer_.size());
}
else
{
......@@ -104,44 +92,59 @@ prepareSignalBuffer(struct gmx_signalling_t *gs)
}
void
handleSignals(struct gmx_signalling_t *gs,
const t_commrec *cr,
bool bInterSimGS)
SimulationSignaller::signalInterSim()
{
/* Is the signal in one simulation independent of other simulations? */
bool bIsSignalLocal[eglsNR] = { false, false, true };
if (!gs)
if (!doInterSim_)
{
return;
}
// The situations that lead to doInterSim_ == true without a
// multi-simulation begin active should already have issued an
// error at mdrun time in release mode, so there's no need for a
// release-mode assertion.
GMX_ASSERT(MULTISIM(cr_) != nullptr, "Cannot do inter-simulation signalling without a multi-simulation");
if (MASTER(cr_))
{
// Communicate the signals between the simulations.
gmx_sum_sim(eglsNR, mpiBuffer_.data(), cr_->ms);
}
// Communicate the signals from the master to the others.
gmx_bcast(eglsNR*sizeof(mpiBuffer_[0]), mpiBuffer_.data(), cr_);
}
if (MULTISIM(cr) && bInterSimGS)
void SimulationSignaller::setSignals()
{
if (!doIntraSim_)
{
if (MASTER(cr))
{
/* Communicate the signals between the simulations */
gmx_sum_sim(eglsNR, gs->mpiBuffer, cr->ms);
}
/* Communicate the signals from the master to the others */
gmx_bcast(eglsNR*sizeof(gs->mpiBuffer[0]), gs->mpiBuffer, cr);
return;
}
for (int i = 0; i < eglsNR; i++)
SimulationSignals &s = *signals_;
for (size_t i = 0; i < s.size(); i++)
{
if (bInterSimGS || bIsSignalLocal[i])
if (doInterSim_ || s[i].isLocal)
{
// Floating-point reduction of integer values is always
// exact, so we can use a simple cast.
signed char gsi = static_cast<signed char>(mpiBuffer_[i]);
/* Set the communicated signal only when it is non-zero,
* since signals might not be processed at each MD step.
*/
signed char gsi = (gs->mpiBuffer[i] >= 0.0 ?