Commit 07d12095 authored by Erik Lindahl's avatar Erik Lindahl
Browse files

Prevent fragile use cases of checkpoint appending

There are way too many ways we allow runs to be continued
and extended. We still allow the checkpoint file to be
missing (so -cpi can be used for all command lines), but
we warn if it is not found. To avoid mistakes with file
appending when restarting from checkpoints, we now require
that all previous output files must be present
(unless -noappend is used), and that the file names must
match the ones used in the previous run.

Fixes #1777.

Change-Id: Id9e89773a4a9214be6dbb76676c526e98e12bd37
parent 14ed2ccd
/*
* This file is part of the GROMACS molecular simulation package.
*
* Copyright (c) 2015, by the GROMACS development team, led by
* Copyright (c) 2015,2016, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
......@@ -91,10 +91,10 @@ static gmx_bool exist_output_file(const char *fnm_cp, int nfile, const t_filenm
* This is is needed at the beginning of mdrun,
* to be able to rename the logfile correctly.
* When file appending is requested, checks which output files are present,
* and returns TRUE/FALSE in bDoAppendFiles if all or none are present.
* If only some output files are present, give a fatal error.
* When bDoAppendFiles is TRUE upon return, bAddPart will tell whether the simulation part
* needs to be added to the output file name.
* and issue a fatal error if some are not.
* Upon return, bAddPart will tell whether the simulation part
* needs to be added to the output file name, i.e. when we are doing checkpoint
* continuation without appending.
*
* This routine cannot print tons of data, since it is called before
* the log file is opened. */
......@@ -120,6 +120,7 @@ read_checkpoint_data(const char *filename, int *simulation_part,
if (!gmx_fexist(filename) || (!(fp = gmx_fio_open(filename, "r")) ))
{
*simulation_part = 0;
fprintf(stderr, "Warning: No checkpoint file found with -cpi option. Assuming this is a new run.\n\n");
}
else
{
......@@ -142,35 +143,48 @@ read_checkpoint_data(const char *filename, int *simulation_part,
{
*bDoAppendFiles = bTryToAppendFiles;
}
else if (nexist > 0)
else
{
// If we get here, the user requested restarting from a checkpoint file, that checkpoint
// file was found (so it is not the first part of a new run), but we are still missing
// some or all checkpoint files. In this case we issue a fatal error since there are
// so many special cases we cannot keep track of, and better safe than sorry.
fprintf(stderr,
"Output file appending has been requested,\n"
"but some output files listed in the checkpoint file %s\n"
"are not present or are named differently by the current program:\n",
"are not present or not named as the output files by the current program:\n",
filename);
fprintf(stderr, "output files present:");
fprintf(stderr, "Expect output files present:\n");
for (f = 0; f < nfiles; f++)
{
if (exist_output_file(outputfiles[f].filename,
nfile, fnm))
{
fprintf(stderr, " %s", outputfiles[f].filename);
fprintf(stderr, " %s\n", outputfiles[f].filename);
}
}
fprintf(stderr, "\n");
fprintf(stderr, "output files not present or named differently:");
fprintf(stderr, "Expected output files not present or named differently:\n");
for (f = 0; f < nfiles; f++)
{
if (!exist_output_file(outputfiles[f].filename,
nfile, fnm))
{
fprintf(stderr, " %s", outputfiles[f].filename);
fprintf(stderr, " %s\n", outputfiles[f].filename);
}
}
fprintf(stderr, "\n");
gmx_fatal(FARGS, "File appending requested, but %d of the %d output files are not present or are named differently", nfiles-nexist, nfiles);
gmx_fatal(FARGS,
"File appending requested, but %d of the %d output files are not present or are named differently. "
"For safety reasons, GROMACS-2016 and later only allows file appending to be used when all files "
"have the same names as they had in the original run. "
"Checkpointing is merely intended for plain continuation of runs. "
"For safety reasons you must specify all file names (e.g. with -deffnm), "
"and all these files must match the names used in the run prior to checkpointing "
"since we will append to them by default. If the files are not available, you "
"can add the -noappend flag to mdrun and write separate new parts. "
"For mere concatenation of files, you should use the gmx trjcat tool instead.",
nfiles-nexist, nfiles);
}
}
......@@ -200,6 +214,7 @@ read_checkpoint_data(const char *filename, int *simulation_part,
}
if (PAR(cr))
{
// Make sure all settings are in sync
gmx_bcast(sizeof(*simulation_part), simulation_part, cr);
if (*simulation_part > 0 && bTryToAppendFiles)
......@@ -224,18 +239,18 @@ handleRestart(t_commrec *cr,
const char *part_suffix = ".part";
FILE *fpmulti;
bAddPart = !bTryToAppendFiles;
/* Check if there is ANY checkpoint file available */
sim_part = 1;
sim_part_fn = sim_part;
if (opt2bSet("-cpi", NFILE, fnm))
{
bAddPart = !bTryToAppendFiles;
read_checkpoint_data(opt2fn_master("-cpi", NFILE, fnm, cr),
&sim_part_fn, cr,
bTryToAppendFiles, NFILE, fnm,
part_suffix, &bAddPart,
bDoAppendFiles);
part_suffix, &bAddPart, bDoAppendFiles);
if (sim_part_fn == 0 && MULTIMASTER(cr))
{
fprintf(stdout, "No previous checkpoint file present, assuming this is a new run.\n");
......@@ -262,8 +277,10 @@ handleRestart(t_commrec *cr,
}
else
{
*bDoAppendFiles = FALSE;
*bDoAppendFiles = false;
bAddPart = false;
}
*bStartFromCpt = sim_part > 1;
if (!*bDoAppendFiles)
......
/*
* This file is part of the GROMACS molecular simulation package.
*
* Copyright (c) 2015, by the GROMACS development team, led by
* Copyright (c) 2015,2016, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
......@@ -66,21 +66,29 @@ struct t_filenm;
/*! \brief Handle startup of mdrun, particularly regarding -cpi and -append
*
* If there is a checkpoint file, then prepare to start from that
* state. If there is also appending, then organize the file naming
* accordingly, if possible. Issues various fatal errors if the input
* conditions are inconsistent or broken. \p fnm is updated with
* suffix strings for part numbers if we are doing a restart from
* checkpoint and are not appending.
* state. If restarting from a checkpoint file and appending is requested with
* tryToAppendFiles, we will set doAppendFiles to true on return if all files
* were found correctly. If some files are not found when appending should be
* done, we will instead issue a fatal error to avoid unintentional problems.
*
* Does communication to coordinate behaviour between all ranks of a
* simulation, and/or simulations.
* If there is no checkpoint file, we assume it is the first part of a new run,
* and in this case we silently set doAppendFiles to false on return.
*
* On return, \p fnm is updated with suffix strings for part numbers if we are
* doing a restart from checkpoint and are not appending. The routine also does
* communication to coordinate behaviour between all ranks of a simulation,
* and/or simulations.
*
* \param[in] cr Communication structure
* \param[in] bTryToAppendFiles Whether mdrun -append was used
* \param[in] bTryToAppendFiles Whether appending is requested (from mdrun)
* \param[in] NFILE Size of fnm struct
* \param[inout] fnm Filename parameters to mdrun
* \param[out] bDoAppendFiles Whether mdrun will append to files
* \param[out] bStartFromCpt Whether mdrun will start from the -cpi file
* \param[out] bDoAppendFiles True on return if we will do appending.
* Note that the routine will generate a fatal
* error for some scenarios where appending is
* requested but the necessary files not found.
* \param[out] bStartFromCpt True on return if we found the checkpoint
* and will use it to restart.
*/
void handleRestart(t_commrec *cr,
gmx_bool bTryToAppendFiles,
......
......@@ -488,8 +488,23 @@ int gmx_mdrun(int argc, char *argv[])
#endif
}
handleRestart(cr, bTryToAppendFiles, NFILE, fnm,
&bDoAppendFiles, &bStartFromCpt);
if (!opt2bSet("-cpi", NFILE, fnm))
{
// If we are not starting from a checkpoint we never allow files to be appended
// to, since that has caused a ton of strange behaviour and bugs in the past.
if (opt2parg_bSet("-append", asize(pa), pa))
{
// If the user explicitly used the -append option, explain that it is not possible.
gmx_fatal(FARGS, "GROMACS can only append to files when restarting from a checkpoint.");
}
else
{
// If the user did not say anything explicit, just disable appending.
bTryToAppendFiles = FALSE;
}
}
handleRestart(cr, bTryToAppendFiles, NFILE, fnm, &bDoAppendFiles, &bStartFromCpt);
Flags = opt2bSet("-rerun", NFILE, fnm) ? MD_RERUN : 0;
Flags = Flags | (bDDBondCheck ? MD_DDBONDCHECK : 0);
......@@ -498,7 +513,7 @@ int gmx_mdrun(int argc, char *argv[])
Flags = Flags | (bConfout ? MD_CONFOUT : 0);
Flags = Flags | (bRerunVSite ? MD_RERUN_VSITE : 0);
Flags = Flags | (bReproducible ? MD_REPRODUCIBLE : 0);
Flags = Flags | (bDoAppendFiles ? MD_APPENDFILES : 0);
Flags = Flags | (bDoAppendFiles ? MD_APPENDFILES : 0);
Flags = Flags | (opt2parg_bSet("-append", asize(pa), pa) ? MD_APPENDFILESSET : 0);
Flags = Flags | (bKeepAndNumCPT ? MD_KEEPANDNUMCPT : 0);
Flags = Flags | (bStartFromCpt ? MD_STARTFROMCPT : 0);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment