Commit 078252e0 by Mathieu

BUSCO 3.0.1

parent 2f3dd7b4
No preview for this file type
3.0.1
- Add the environment variable BUSCO_CONFIG_FILE
- Add the --blast_single_core option
- Incompatibilities between plotting tool and config fixed
3.0.0
- Major refactoring of the code
- Introduce setup.py
......
**BUSCO - Benchmarking sets of Universal Single-Copy Orthologs.**
To install, ``sudo python setup.py install`` or ``python setup.py install --user``
To get help, ``python scripts/run_BUSCO.py -h`` and ``python scripts/generate_plot.py -h``
Do not forget to create a ``config.ini`` file in the ``config/`` subfolder. You can set the ``BUSCO_CONFIG_FILE``
environment variable to define a custom path (including the filename) to the ``config.ini`` file,
which is useful for switching between configurations or in a multi-user environment.
See also the user guide: BUSCO_v3_userguide.pdf
You can download BUSCO datasets from http://busco.ezlab.org
......
## BUSCO specific configuration
## It overrides default values in code and dataset cfg, and is overridden by arguments in command line
## Uncomment lines with single # when appropriate
# BUSCO specific configuration
# It overrides default values in code and dataset cfg, and is overridden by arguments in command line
# Uncomment lines when appropriate
[busco]
## Input file
# in = ./sample_data/target.fa
## Run name, used in output files and folder
# out = SAMPLE
## Where to store the output directory
# out_path = ./sample_data
## Path to the BUSCO dataset
# lineage_path = ./sample_data/example
## Which mode to run (genome / protein / transcriptome)
# mode = genome
## How many threads to use for multithreaded steps
# cpu = 1
## Domain for augustus retraining, eukaryota or prokaryota
# domain = eukaryota # do not change this unless you know exaclty why !!!
## Force rewrite if files already exist (True/False)
# force = False
## Restart mode (True/False)
# restart = False
## Blast e-value
# evalue = 1e-3
## Species to use with augustus, for old datasets only
# species = fly
## Augustus extra parameters
# augustus_parameters = '' # nothing here, use single quotes, like this: '--param1=1 --param2=2'
## Tmp folder
# tmp_path = ./tmp/
## How many candidate regions (contigs, scaffolds) to consider for each BUSCO
# limit = 3
## Augustus long mode for retraining (True/False)
# long = False
## Quiet mode (True/False)
# quiet = False
## Debug logs (True/False), it needs Quiet to be False
# debug = True
## tar gzip output files (True/False)
# gzip = False
# Input file
;in = ./sample_data/target.fa
# Run name, used in output files and folder
;out = SAMPLE
# Where to store the output directory
;out_path = ./sample_data
# Path to the BUSCO dataset
;lineage_path = ./sample_data/example
# Which mode to run (genome / protein / transcriptome)
;mode = genome
# How many threads to use for multithreaded steps
;cpu = 1
# Domain for augustus retraining, eukaryota or prokaryota
# Do not change this unless you know exactly why !!!
;domain = eukaryota
# Force rewrite if files already exist (True/False)
;force = False
# Restart mode (True/False)
;restart = False
# Blast e-value
;evalue = 1e-3
# Species to use with augustus, for old datasets only
;species = fly
# Augustus extra parameters
# Use single quotes, like this: '--param1=1 --param2=2'
;augustus_parameters = ''
# Tmp folder
;tmp_path = ./tmp/
# How many candidate regions (contigs, scaffolds) to consider for each BUSCO
;limit = 3
# Augustus long mode for retraining (True/False)
;long = False
# Quiet mode (True/False)
;quiet = False
# Debug logs (True/False), it needs Quiet to be False
;debug = True
# tar gzip output files (True/False)
;gzip = False
# Force single core for the tblastn step
;blast_single_core = True
[tblastn]
## path to tblastn
# path to tblastn
path = /usr/bin/
[makeblastdb]
## path to makeblastdb
# path to makeblastdb
path = /usr/bin/
[augustus]
## path to augustus
# path to augustus
path = /home/osboxes/BUSCOVM/augustus/augustus-3.2.2/bin/
[etraining]
## path to augustus etraining
# path to augustus etraining
path = /home/osboxes/BUSCOVM/augustus/augustus-3.2.2/bin/
## path to augustus perl scripts, redeclare it for each new script
# path to augustus perl scripts, redeclare it for each new script
[gff2gbSmallDNA.pl]
path = /home/osboxes/BUSCOVM/augustus/augustus-3.2.2/scripts/
[new_species.pl]
......@@ -64,9 +68,9 @@ path = /home/osboxes/BUSCOVM/augustus/augustus-3.2.2/scripts/
path = /home/osboxes/BUSCOVM/augustus/augustus-3.2.2/scripts/
[hmmsearch]
## path to HMMsearch executable
# path to HMMsearch executable
path = /home/osboxes/BUSCOVM/hmmer/hmmer-3.1b2-linux-intel-ia32/binaries/
[Rscript]
## path to Rscript, if you wish to use the plot tool
# path to Rscript, if you wish to use the plot tool
path = /usr/bin/
......@@ -5,7 +5,7 @@
.. module:: generate_plot
:synopsis: This module produces a graphic summary for BUSCO runs based on short summary files
.. versionadded:: 2.0.0
.. versionchanged:: 3.0.0
.. versionchanged:: 3.0.1
This module produces a graphic summary for BUSCO runs based on short summary files
......@@ -211,7 +211,10 @@ def _run_r_code():
return None # do not run the code, but no need to stop the execution
# run R
config = BuscoConfig('%s/../config/config.ini' % os.path.dirname(os.path.realpath(__file__)), {}, False)
if os.environ.get('BUSCO_CONFIG_FILE') and os.access(os.environ.get('BUSCO_CONFIG_FILE'), os.R_OK):
config = BuscoConfig(os.environ.get('BUSCO_CONFIG_FILE'), {}, False)
else:
config = BuscoConfig('%s/../config/config.ini' % os.path.dirname(os.path.realpath(__file__)), {}, False)
try:
if Tool.check_tool_available('Rscript', config):
r_script = Tool('Rscript', config)
......@@ -299,7 +302,7 @@ def _load_data():
frag_pc = round(frag/float(total)*100, 1)
miss_pc = round(100 - comp_pc - dupl_pc - frag_pc, 1)
data['percentages'] += [comp_pc, dupl_pc, frag_pc, miss_pc]
_logger.info('Loaded %s sucessfully' % f)
_logger.info('Loaded %s successfully' % f)
except IOError:
_logger.warning('Impossible to use the file %s' % f)
if len(data['species']) == 0:
......
......@@ -4,7 +4,7 @@
.. module:: run_BUSCO
:synopsis: BUSCO - Benchmarking Universal Single-Copy Orthologs.
.. versionadded:: 3.0.0
.. versionchanged:: 3.0.0
.. versionchanged:: 3.0.1
This is the BUSCO main script.
......@@ -119,6 +119,12 @@ def _parse_args():
'contain thousands of files',
action="store_true")
optional.add_argument(
'--blast_single_core', dest='blast_single_core', required=False,
help='Force tblastn to run on a single core and ignore the --cpu argument for this step only. '
'Useful if inconsistencies when using multiple threads are noticed',
action="store_true")
optional.add_argument('-v', '--version', action='version', help="Show this version and exit",
version='BUSCO %s' % BuscoConfig.VERSION)
......@@ -137,7 +143,11 @@ def main():
start_time = time.time()
# 1) Load a busco config file that will figure out all the params from all sources
# i.e. provided config file, dataset cfg, and user args
config = BuscoConfig('%s/../config/config.ini' % os.path.dirname(os.path.realpath(__file__)), _parse_args())
if os.environ.get('BUSCO_CONFIG_FILE') and os.access(os.environ.get('BUSCO_CONFIG_FILE'), os.R_OK):
config_file = os.environ.get('BUSCO_CONFIG_FILE')
else:
config_file = '%s/../config/config.ini' % os.path.dirname(os.path.realpath(__file__))
config = BuscoConfig(config_file, _parse_args())
# Define a logger, the config is passed to tell the logger if you required the quiet mode
logger = PipeLogger.get_logger(__name__, config)
......@@ -151,7 +161,7 @@ def main():
logger.info(
'****************** Start a BUSCO %s analysis, current time: %s **'
'****************' % (BuscoConfig.VERSION, time.strftime('%m/%d/%Y %H:%M:%S')))
logger.info('Configuration loaded from %s' % config_file)
# 2) Load the analysis, this will check the dependencies and return the appropriate analysis object
analysis = BuscoAnalysis.get_analysis(config)
......
......@@ -4,7 +4,7 @@
.. module:: BuscoAnalysis
:synopsis: BuscoAnalysis implements general BUSCO analysis specifics
.. versionadded:: 3.0.0
.. versionchanged:: 3.0.0
.. versionchanged:: 3.0.1
Copyright (c) 2016-2017, Evgeny Zdobnov (ez@ezlab.org)
Licensed under the MIT license. See LICENSE.md file.
......@@ -83,6 +83,7 @@ class BuscoAnalysis(Analysis):
self._long = config.getboolean('busco', 'long')
self._restart = config.getboolean('busco', 'restart')
self._cpus = config.getint('busco', 'cpu')
self._blast_single_core = config.getboolean('busco', 'blast_single_core')
self._sequences = config.get('busco', 'in')
self._lineage_path = config.get('busco', 'lineage_path')
self._lineage_name = config.get('busco', 'clade_name')
......@@ -180,7 +181,7 @@ class BuscoAnalysis(Analysis):
elif mode == 'proteins' or mode == 'prot':
return GeneSetAnalysis(config)
else:
BuscoAnalysis._logger.error('Unknown mode, use genome, transcriptome, or proteins')
BuscoAnalysis._logger.error('Unknown mode %s, use genome, transcriptome, or proteins', mode)
raise SystemExit
#
......@@ -864,7 +865,10 @@ class BuscoAnalysis(Analysis):
tblastn_job.add_parameter('-evalue')
tblastn_job.add_parameter(str(self._ev_cutoff))
tblastn_job.add_parameter('-num_threads')
tblastn_job.add_parameter(str(self._cpus))
if not self._blast_single_core:
tblastn_job.add_parameter(str(self._cpus))
else:
tblastn_job.add_parameter('1')
tblastn_job.add_parameter('-query')
tblastn_job.add_parameter(query_file)
tblastn_job.add_parameter('-db')
......
......@@ -4,7 +4,7 @@
.. module:: BuscoConfig
:synopsis: Load and combine all parameters provided to BUSCO through config file, dataset and command line
.. versionadded:: 3.0.0
.. versionchanged:: 3.0.0
.. versionchanged:: 3.0.1
Copyright (c) 2016-2017, Evgeny Zdobnov (ez@ezlab.org)
Licensed under the MIT license. See LICENSE.md file.
......@@ -53,20 +53,21 @@ class BuscoConfig(PipeConfig):
'dataset_creation_date': 'N/A',
'dataset_nb_buscos': 'N/A', 'dataset_nb_species': 'N/A', 'augustus_parameters': '',
'long': False, 'restart': False, 'quiet': False, 'debug': False, 'force': False,
'tarzip': False}
'tarzip': False, 'blast_single_core': False}
MANDATORY_USER_PROVIDED_PARAMS = ['in', 'out', 'lineage_path', 'mode']
_logger = PipeLogger.get_logger(__name__)
def __init__(self, conf_file, args, check_mandatory=True):
def __init__(self, conf_file, args, checks=True):
"""
:param conf_file: a path to a config.ini file
:type conf_file: str
:param args: key and values matching BUSCO parameters to override config.ini values
:type args: dict
:param check_mandatory: whether to require the mandatory parameters used in a main BUSCO analysis. Default True
:type check_mandatory: bool
:param checks: whether to proceed to the mandatory parameters + file dependencies checks,
used in a main BUSCO analysis. Default True
:type checks: bool
"""
try:
super(BuscoConfig, self).__init__(conf_file)
......@@ -96,7 +97,7 @@ class BuscoConfig(PipeConfig):
self.set('busco', key, 'True')
# Validate that all keys that are mandatory are there
if check_mandatory:
if checks:
for param in BuscoConfig.MANDATORY_USER_PROVIDED_PARAMS:
try:
self.get('busco', param)
......@@ -122,17 +123,19 @@ class BuscoConfig(PipeConfig):
elif l.split("=")[0] == "species":
try:
self.get('busco', 'species')
BuscoConfig._logger.warning('An augustus species is mentionned in the config file, '
'dataset default species (%s) will be ignored'
% l.strip().split("=")[1])
if checks:
BuscoConfig._logger.warning('An augustus species is mentioned in the config file, '
'dataset default species (%s) will be ignored'
% l.strip().split("=")[1])
except NoOptionError:
self.set('busco', 'species', l.strip().split("=")[1])
elif l.split("=")[0] == "domain":
try:
self.get('busco', 'domain')
BuscoConfig._logger.warning('A domain for augustus training is mentionned in the config '
'file, dataset default domain (%s) will be ignored'
% l.strip().split("=")[1])
if checks:
BuscoConfig._logger.warning('A domain for augustus training is mentioned in the config '
'file, dataset default domain (%s) will be ignored'
% l.strip().split("=")[1])
except NoOptionError:
self.set('busco', 'domain', l.strip().split("=")[1])
domain = l.strip().split("=")[1]
......@@ -142,15 +145,17 @@ class BuscoConfig(PipeConfig):
self.set('busco', 'dataset_nb_buscos', l.strip().split("=")[1])
elif l.split("=")[0] == "number_of_species":
self.set('busco', 'dataset_nb_species', l.strip().split("=")[1])
if domain != 'prokaryota' and domain != 'eukaryota':
if checks and domain != 'prokaryota' and domain != 'eukaryota':
BuscoConfig._logger.error(
'Corrupted dataset.cfg file: domain is %s, should be eukaryota or prokaryota' % domain)
raise SystemExit
except IOError:
BuscoConfig._logger.warning("The dataset you provided does not contain the file dataset.cfg, "
"likely because it is an old version. Default species (%s, %s) will be "
"used as augustus species" % (BuscoConfig.DEFAULT_ARGS_VALUES['species'],
BuscoConfig.DEFAULT_ARGS_VALUES['domain']))
if checks:
BuscoConfig._logger.warning("The dataset you provided does not contain the file dataset.cfg, "
"likely because it is an old version. Default species (%s, %s) will be "
"used as augustus species"
% (BuscoConfig.DEFAULT_ARGS_VALUES['species'],
BuscoConfig.DEFAULT_ARGS_VALUES['domain']))
# Fill the other with default values if not present
for param in list(BuscoConfig.DEFAULT_ARGS_VALUES.keys()):
......@@ -165,34 +170,39 @@ class BuscoConfig(PipeConfig):
self.set('busco', item[0], BuscoConfig.nice_path(item[1]))
# And check that in and lineage path and file actually exists
for item in self.items('busco'):
if item[0] == 'lineage_path' or item[0] == 'in':
BuscoConfig.check_path_exist(item[1])
if checks:
for item in self.items('busco'):
if item[0] == 'lineage_path' or item[0] == 'in':
BuscoConfig.check_path_exist(item[1])
# Prevent the user from using "~" as home
for item in self.items('busco'):
if item[0].endswith('_path'):
if item[1].startswith('~'):
BuscoConfig._logger.error('Do not use the \'~\' character as home in the config file, '
'use the full path instead')
raise SystemExit
if checks:
for item in self.items('busco'):
if item[0].endswith('_path'):
if item[1].startswith('~'):
BuscoConfig._logger.error('Do not use the \'~\' character as home in the config file, '
'use the full path instead')
raise SystemExit
# Prevent the user from using "/" in out name
if '/' in self.get('busco', 'out'):
BuscoConfig._logger.error('Please do not provide a full path in --out parameter, no slash.'
' Use out_path in the config.ini file to specify the full path.')
raise SystemExit
if checks:
if '/' in self.get('busco', 'out'):
BuscoConfig._logger.error('Please do not provide a full path in --out parameter, no slash.'
' Use out_path in the config.ini file to specify the full path.')
raise SystemExit
# Check the value of limit
if self.getint('busco', 'limit') == 0 or self.getint('busco', 'limit') > 20:
BuscoConfig._logger.error('Limit must be an integer between 1 and 20 (you have used: %s). '
'Note that this parameter is not needed by the protein mode.'
% self.getint('busco', 'limit'))
raise SystemExit
if checks:
if self.getint('busco', 'limit') == 0 or self.getint('busco', 'limit') > 20:
BuscoConfig._logger.error('Limit must be an integer between 1 and 20 (you have used: %s). '
'Note that this parameter is not needed by the protein mode.'
% self.getint('busco', 'limit'))
raise SystemExit
# Warn if custom evalue
if self.getfloat('busco', 'evalue') != BuscoConfig.DEFAULT_ARGS_VALUES['evalue']:
BuscoConfig._logger.warning('You are using a custom e-value cutoff')
if checks:
if self.getfloat('busco', 'evalue') != BuscoConfig.DEFAULT_ARGS_VALUES['evalue']:
BuscoConfig._logger.warning('You are using a custom e-value cutoff')
except NoSectionError:
BuscoConfig._logger.error('No section [busco] found in %s. Please make sure both the file and this section '
......
......@@ -6,4 +6,4 @@ Copyright (c) 2016-2017, Evgeny Zdobnov (ez@ezlab.org)
Licensed under the MIT license. See LICENSE.md file.
"""
__version__ = "3.0.0"
__version__ = "3.0.1"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment