Commit efd60b25 authored by David Hendriks's avatar David Hendriks
Browse files

updated code in grid and fixing bugs and commented out some code that isnt working yet

parent 581cd002
Loading
Loading
Loading
Loading
+3 −1
Original line number Diff line number Diff line
@@ -19,8 +19,10 @@ def autogen_C_logging_code(logging_dict: dict, verbose: int = 0) -> Optional[str
    Input is a dictionary where the key is the header of that logging line
    and items which are lists of parameters that will be put in that logging line

    The list elements are all appended to 'stardata->' in the autogenerated code.

    Example:
        input dictionary should look like this::
        Input dictionary should look like this::

            {'MY_STELLAR_DATA':
                [
+527 −474

File changed.

Preview size limit exceeded, changes collapsed.

+36 −34
Original line number Diff line number Diff line
@@ -129,43 +129,45 @@ grid_options_defaults_dict = {
    # Slurm stuff
    ########################################
    "slurm": 0,  # dont use the slurm by default. 1 = use slurm
    "slurm_ntasks": 1,  # CPUs required per array job: usually only need this
    "slurm_command": "",  # Command that slurm runs (e.g. evolve or join_datafiles)
    "slurm_dir": "",  # working directory containing scripts output logs etc.
    "slurm_njobs": 0,  # number of scripts; set to 0 as default
    "slurm_jobid": "",  # slurm job id (%A)
    "slurm_memory": 512,  # in MB, the memory use of the job
    "slurm_warn_max_memory": 1024,  # in MB : warn if mem req. > this
    "slurm_use_all_node_CPUs": 0,  # 1 = use all of a node's CPUs. 0 = use a given amount of CPUs
    "slurm_postpone_join": 0,  # if 1 do not join on slurm, join elsewhere. want to do it off the slurm grid (e.g. with more RAM)
    "slurm_jobarrayindex": "",  # slurm job array index (%a)
    "slurm_jobname": "binary_grid",  # default
    "slurm_partition": None,
    "slurm_time": 0,  # total time. 0 = infinite time
    "slurm_postpone_sbatch": 0,  # if 1: don't submit, just make the script
    "slurm_array": None,  # override for --array, useful for rerunning jobs
    "slurm_use_all_node_CPUs": 0,  # if given nodes, set to 1
    # if given CPUs, set to 0
    # you will want to use this if your Slurm SelectType is e.g. linear
    # which means it allocates all the CPUs in a node to the job
    "slurm_control_CPUs": 0,  # if so, leave this many for Pythons control (0)
    "slurm_array": None,  # override for --array, useful for rerunning jobs
    "slurm_partition": None,  # MUST be defined
    "slurm_extra_settings": {},  # Place to put extra configuration for the SLURM batch file. The key and value of the dict will become the key and value of the line in te slurm batch file. Will be put in after all the other settings (and before the command). Take care not to overwrite something without really meaning to do so.
    # "slurm_ntasks": 1,  # CPUs required per array job: usually only need this
    # "slurm_command": "",  # Command that slurm runs (e.g. evolve or join_datafiles)
    # "slurm_dir": "",  # working directory containing scripts output logs etc.
    # "slurm_njobs": 0,  # number of scripts; set to 0 as default
    # "slurm_jobid": "",  # slurm job id (%A)
    # "slurm_memory": 512,  # in MB, the memory use of the job
    # "slurm_warn_max_memory": 1024,  # in MB : warn if mem req. > this
    # "slurm_use_all_node_CPUs": 0,  # 1 = use all of a node's CPUs. 0 = use a given amount of CPUs
    # "slurm_postpone_join": 0,  # if 1 do not join on slurm, join elsewhere. want to do it off the slurm grid (e.g. with more RAM)
    # "slurm_jobarrayindex": "",  # slurm job array index (%a)
    # "slurm_jobname": "binary_grid",  # default
    # "slurm_partition": None,
    # "slurm_time": 0,  # total time. 0 = infinite time
    # "slurm_postpone_sbatch": 0,  # if 1: don't submit, just make the script
    # "slurm_array": None,  # override for --array, useful for rerunning jobs
    # "slurm_use_all_node_CPUs": 0,  # if given nodes, set to 1
    # # if given CPUs, set to 0
    # # you will want to use this if your Slurm SelectType is e.g. linear
    # # which means it allocates all the CPUs in a node to the job
    # "slurm_control_CPUs": 0,  # if so, leave this many for Pythons control (0)
    # "slurm_array": None,  # override for --array, useful for rerunning jobs
    # "slurm_partition": None,  # MUST be defined
    # "slurm_extra_settings": {},  # Place to put extra configuration for the SLURM batch file. The key and value of the dict will become the key and value of the line in the slurm batch file. Will be put in after all the other settings (and before the command). Take care not to overwrite something without really meaning to do so.
    ########################################
    # Condor stuff
    ########################################
    "condor": 0,  # 1 to use condor, 0 otherwise
    "condor_command": "",  # condor command e.g. "evolve", "join"
    "condor_dir": "",  # working directory containing e.g. scripts, output, logs (e.g. should be NFS available to all)
    "condor_njobs": "",  # number of scripts/jobs that CONDOR will run in total
    "condor_jobid": "",  # condor job id
    "condor_postpone_join": 0,  # if 1, data is not joined, e.g. if you want to do it off the condor grid (e.g. with more RAM)
    # "condor_join_machine": None, # if defined then this is the machine on which the join command should be launched (must be sshable and not postponed)
    "condor_join_pwd": "",  # directory the join should be in (defaults to $ENV{PWD} if undef)
    "condor_memory": 1024,  # in MB, the memory use (ImageSize) of the job
    "condor_universe": "vanilla",  # usually vanilla universe
    "condor_extra_settings": {},  # Place to put extra configuration for the CONDOR submit file. The key and value of the dict will become the key and value of the line in te slurm batch file. Will be put in after all the other settings (and before the command). Take care not to overwrite something without really meaning to do so.
    # "condor_command": "",  # condor command e.g. "evolve", "join"
    # "condor_dir": "",  # working directory containing e.g. scripts, output, logs (e.g. should be NFS available to all)
    # "condor_njobs": "",  # number of scripts/jobs that CONDOR will run in total
    # "condor_jobid": "",  # condor job id
    # "condor_postpone_join": 0,  # if 1, data is not joined, e.g. if you want to do it off the condor grid (e.g. with more RAM)
    # # "condor_join_machine": None, # if defined then this is the machine on which the join command should be launched (must be sshable and not postponed)
    # "condor_join_pwd": "",  # directory the join should be in (defaults to $ENV{PWD} if undef)
    # "condor_memory": 1024,  # in MB, the memory use (ImageSize) of the job
    # "condor_universe": "vanilla",  # usually vanilla universe
    # "condor_extra_settings": {},  # Place to put extra configuration for the CONDOR submit file. The key and value of the dict will become the key and value of the line in the condor submit file. Will be put in after all the other settings (and before the command). Take care not to overwrite something without really meaning to do so.


    # snapshots and checkpoints
    # condor_snapshot_on_kill=>0, # if 1 snapshot on SIGKILL before exit
    # condor_load_from_snapshot=>0, # if 1 check for snapshot .sv file and load it if found
@@ -456,7 +458,7 @@ grid_options_descriptions = {
    "_store_memaddr": "Memory adress of the store object for binary_c.",
    "failed_systems_threshold": "Variable storing the maximum amount of systems that are allowed to fail before logging their commandline arguments to failed_systems log files",
    "parse_function": "Function that the user can provide to handle the output the binary_c. This function has to take the arguments (self, output). Its best not to return anything in this function, and just store stuff in the grid_options['results'] dictionary, or just output results to a file",
    "condor": "Int flag whether to use a condor type population evolution.",  # TODO: describe this in more detail
    "condor": "Int flag whether to use a condor type population evolution. Not implemented yet.",  # TODO: describe this in more detail
    "slurm": "Int flag whether to use a slurm type population evolution.",  # TODO: describe this in more detail
    "weight": "Weight factor for each system. The calculated probability is mulitplied by this. If the user wants each system to be repeated several times, then this variable should not be changed, rather change the _repeat variable instead, as that handles the reduction in probability per system. This is useful for systems that have a process with some random element in it.",  # TODO: add more info here, regarding the evolution splitting.
    "repeat": "Factor of how many times a system should be repeated. Consider the evolution splitting binary_c argument for supernovae kick repeating.",  # TODO: make sure this is used.
+131 −131
Original line number Diff line number Diff line
"""
File containing functions for HPC computing, distributed tasks on clusters etc.

Functions that the slurm and condor subroutines of the population object use.

Mainly divided in 2 sections: Slurm and Condor
"""

import os
import sys
import time
import subprocess
from typing import Union
import __main__ as main


def get_slurm_version() -> Union[str, None]:
    """
    Look up the version of the locally available slurm installation.

    Runs ``sinfo -V`` and takes the second whitespace-separated field of its
    output as the version. Any failure is printed to stdout and results in
    None being returned instead of an exception being raised.

    Only tested this with slurm v17+

    Returns:
        slurm version, or None
    """

    try:
        completed = subprocess.run(
            ["sinfo", "-V"], stdout=subprocess.PIPE, check=True
        )
        output_fields = completed.stdout.decode("utf-8").split()
        return output_fields[1]
    except FileNotFoundError as err:
        # The sinfo executable could not be found
        print(err)
        print(err.args)
        print("Slurm is not installed or not loaded")
    except Exception as err:
        # Anything else, e.g. a non-zero exit status (check=True) or output
        # that does not contain a second field
        print(err)
        print(err.args)
        print("Unknown error, contact me about this")

    # Only reached when one of the handlers above ran
    return None


# """
# File containing functions for HPC computing, distributed tasks on clusters etc.

# Functions that the slurm and condor subroutines of the population object use.

# Mainly divided in 2 sections: Slurm and Condor
# """

# import os
# import sys
# import time
# import subprocess
# from typing import Union
# import __main__ as main


# def get_slurm_version() -> Union[str, None]:
#     """
#     Function that checks whether slurm is installed and returns the version if its installed.

#     Only tested this with slurm v17+

#     Returns:
#         slurm version, or None
#     """

#     slurm_version = None

#     try:
#         slurm_version = (
#             subprocess.run(["sinfo", "-V"], stdout=subprocess.PIPE, check=True)
#             .stdout.decode("utf-8")
#             .split()
#         )[1]
#     except FileNotFoundError as err:
#         print(err)
#         print(err.args)
#         print("Slurm is not installed or not loaded")
#     except Exception as err:
#         print(err)
#         print(err.args)
#         print("Unknown error, contact me about this")

#     return slurm_version


def get_condor_version() -> Union[str, None]:
    """
    Function that checks whether condor is installed and returns the version if it's installed.
# def get_condor_version() -> Union[str, None]:
#     """
#     Function that checks whether condor is installed and returns the version if it's installed.

    otherwise returns None
#     otherwise returns None

    Result has to be condor v8 or higher
#     Result has to be condor v8 or higher

    Returns:
        condor version, or None
    """
#     Returns:
#         condor version, or None
#     """

    condor_version = None
#     condor_version = None

    try:
        condor_version = (
            subprocess.run(
                ["condor_q", "--version"], stdout=subprocess.PIPE, check=True
            )
            .stdout.decode("utf-8")
            .split()
        )[1]
    except FileNotFoundError as err:
        print("Slurm is not installed or not loaded: ")
        print(err)
        print(err.args)
    except Exception as err:
        print("Unknown error, contact me about this: ")
        print(err)
        print(err.args)
#     try:
#         condor_version = (
#             subprocess.run(
#                 ["condor_q", "--version"], stdout=subprocess.PIPE, check=True
#             )
#             .stdout.decode("utf-8")
#             .split()
#         )[1]
#     except FileNotFoundError as err:
#         print("Slurm is not installed or not loaded: ")
#         print(err)
#         print(err.args)
#     except Exception as err:
#         print("Unknown error, contact me about this: ")
#         print(err)
#         print(err.args)

    return condor_version
#     return condor_version


def create_directories_hpc(working_dir: str) -> None:
    """
    Function to create a set of directories, given a root directory
# def create_directories_hpc(working_dir: str) -> None:
#     """
#     Function to create a set of directories, given a root directory

    These directories will contain stuff for the HPC runs
#     These directories will contain stuff for the HPC runs

    Args:
        working_dir: main working directory of the run. Under this directory all the dirs will be created
    """
#     Args:
#         working_dir: main working directory of the run. Under this directory all the dirs will be created
#     """

    # Check if working_dir exists
    if not os.path.isdir(working_dir):
        print("Error. Working directory {} does not exist! Aborting")
        raise ValueError
#     # Check if working_dir exists
#     if not os.path.isdir(working_dir):
#         print("Error. Working directory {} does not exist! Aborting")
#         raise ValueError

    directories_list = [
        "scripts",
        "stdout",
        "stderr",
        "results",
        "logs",
        "status",
        "joining",
    ]
#     directories_list = [
#         "scripts",
#         "stdout",
#         "stderr",
#         "results",
#         "logs",
#         "status",
#         "joining",
#     ]

    # Make directories.
    for subdir in directories_list:
        full_path = os.path.join(working_dir, subdir)
        os.makedirs(full_path, exist_ok=True)

    # Since the directories are probably made on some mount which has to go over NFS
    # we should explicitly check if they are created
    print("Checking if creating the directories has finished...")
    directories_exist = False
    while directories_exist:
        directories_exist = True

        for subdir in directories_list:
            full_path = os.path.join(working_dir, subdir)

            if not os.path.isdir(full_path):
                time.sleep(1)
                directories_exist = False
    print("..Finished! Directories exist.")


def path_of_calling_script() -> str:
    """
    Return the file path of the script that was started as ``__main__``.

    The value is read directly from the ``__file__`` attribute of the
    ``__main__`` module.

    TODO: fix this function. seems not to work properly.
    NOTE(review): ``__main__`` has no ``__file__`` attribute in an interactive
    session, so this raises AttributeError there; presumably that is what the
    TODO refers to. Confirm before changing the behaviour.

    Returns:
        the ``__file__`` attribute of the ``__main__`` module
    """

    script_path = main.__file__
    return script_path


def get_python_details() -> dict:
    """
    Function to get some info about the used python version and virtualenv etc
#     # Make directories.
#     for subdir in directories_list:
#         full_path = os.path.join(working_dir, subdir)
#         os.makedirs(full_path, exist_ok=True)

#     # Since the directories are probably made on some mount which has to go over NFS
#     # we should explicitly check if they are created
#     print("Checking if creating the directories has finished...")
#     directories_exist = False
#     while directories_exist:
#         directories_exist = True

#         for subdir in directories_list:
#             full_path = os.path.join(working_dir, subdir)

#             if not os.path.isdir(full_path):
#                 time.sleep(1)
#                 directories_exist = False
#     print("..Finished! Directories exist.")


# def path_of_calling_script() -> str:
#     """
#     Function to get the name of the script the user executes.
#     TODO: fix this function. seems not to work properly.
#     """

#     return main.__file__


# def get_python_details() -> dict:
#     """
#     Function to get some info about the used python version and virtualenv etc

    Returns:
        dictionary with python executable, virtual environment and version information.
    """
#     Returns:
#         dictionary with python executable, virtual environment and version information.
#     """

    python_info_dict = {}
#     python_info_dict = {}

    #
    python_info_dict["virtualenv"] = os.getenv("VIRTUAL_ENV")
    python_info_dict["executable"] = sys.executable
    python_info_dict["version"] = sys.version
#     #
#     python_info_dict["virtualenv"] = os.getenv("VIRTUAL_ENV")
#     python_info_dict["executable"] = sys.executable
#     python_info_dict["version"] = sys.version

    return python_info_dict
#     return python_info_dict