Commit 0cdf3a05 authored by Jasper Koehorst's avatar Jasper Koehorst

Initial commit

parents
binaries
Databases
This document walks through a workflow example from beginning to end.
### Obtain data from irods
```
export irodsHost=xxx
export irodsPort=xxx
export irodsUserName=xxx
export irodsZone=xxx
export irodsAuthScheme=password
export irodsHome=xxx
export irodsCwd=xxx
export irodsPassword=xxx
```
### Run workflow
The `--preserve-entire-environment` option is needed so that the workflow has access to the exported variables.
```
cwltool --preserve-entire-environment --tmp-outdir-prefix=./BLA/ --provenance PROVENANCE workflow_1.cwl --files /tempZone/Projects/P_IRODSRUNNER/I_Test_investigation/S_BABY_X/Amplicon/A_amp1bx/Unprocessed/G76494_R1_001.fastq.gz --files /tempZone/Projects/P_IRODSRUNNER/I_Test_investigation/S_BABY_X/Amplicon/A_amp1bx/Unprocessed/G76494_R2_001.fastq.gz --destination /tempZone/Projects/P_IRODSRUNNER/I_Test_investigation/S_BABY_X/Amplicon/A_amp1bx/Processed/HASH_MY2
```
### Run the workflow inside a Docker container on Kubernetes or any other cloud instance that runs Docker images
#### Step 1
Transmit arguments to the docker instance
- Path to yaml file + which cwl file to run according to metadata
- **/tempZone/Projects/P_IRODSRUNNER/I_Test_investigation/S_BABY_X/Amplicon/A_amp1bx/Unprocessed/testYAML.yml**
- **Metadata values:**
attribute: cwl
value: NGTAX_Workflow.cwl
units: waiting
#### Step 2
- Docker instance is started
- ``` docker run -it --entrypoint "" -v `pwd`:/test wurssb/unlock_base /bin/bash```
- A Bash or Python script is executed, which downloads the YAML file
- Idea: python start.py -yml .... -cwl .... -args .... .... ....
- ```java -jar binaries/IRODSTransfer.jar -pull --files /tempZone/Projects/P_IRODSRUNNER/I_Test_investigation/S_BABY_X/Amplicon/A_amp1bx/Unprocessed/testYAML.yml```
- The script then executes the workflow
- ```cwltool --preserve-entire-environment --provenance PROVENANCE cwls/workflow_ngtax.cwl testYAML.yml```
Remove the data folder from the provenance?
- ```rm -r PROVENANCE/data/```
- Finally, the script uploads the provenance
- ```java -jar binaries/IRODSTransfer.jar --push --files PROVENANCE/ --irods /tempZone/Projects/P_IRODSRUNNER/I_Test_investigation/S_BABY_X/Amplicon/A_amp1bx/Processed/HASHSAH```
# Job parameters for the NGTAX amplicon workflow.
# The keys match the workflow inputs: files, destination, forward_primer,
# reverse_primer, reference_db, rev_read_len, for_read_len and sample.
!workflow.workflow_ngtax
# Sample being analysed.
sample: L10D5_amplicon

# Paths of the unprocessed read files to download.
files:
  - /tempZone/Projects/P_Transgenerational_nutrition_-_NWO/I_Broiler_vs_Layer/S_L10D5/Amplicon/A_L10D5_amplicon/Unprocessed/L10D5_amplicon_r.fastq.gz
  - /tempZone/Projects/P_Transgenerational_nutrition_-_NWO/I_Broiler_vs_Layer/S_L10D5/Amplicon/A_L10D5_amplicon/Unprocessed/L10D5_amplicon_f.fastq.gz

# Final destination for the processed results.
destination: /tempZone/Projects/P_Transgenerational_nutrition_-_NWO/I_Broiler_vs_Layer/S_L10D5/Amplicon/A_L10D5_amplicon/Processed/AmpliconAnalysis_NGTAX_700

# Reference database (FASTA).
reference_db: /Databases/Silva/SILVA_132_SSURef_tax_silva.fasta.gz

# Primers; quoted so the bracketed character classes stay plain strings.
forward_primer: '[AG]GGATTAGATACCC'
reverse_primer: 'CGAC[AG][AG]CCATGCA[ACGT]CACCT'

# Read lengths.
for_read_len: 70
rev_read_len: 70
#!/usr/bin/env cwl-runner
# CWL tool description: runs /binaries/Conversion.jar with -embl2hdt on an
# EMBL genome file and collects the per-step output directory.
cwlVersion: v1.0
class: CommandLineTool

label: "Genome conversion"
doc: |
  Runs Genome conversion tool from SAPP

requirements:
  - class: InlineJavascriptRequirement
  # Stage an empty, writable directory named after the workflow step; the
  # converted file is written into it and the directory is the tool output.
  - class: InitialWorkDirRequirement
    listing:
      - entry: "$({class: 'Directory', listing: []})"
        entryname: Step_$(inputs.step)_Conversion
        writable: true

hints:
  DockerRequirement:
    dockerPull: wurssb/unlock_base

$namespaces:
  s: https://schema.org/
  edam: http://edamontology.org/

# stdout: ngtax.log
# stderr: ngtax.error

inputs:
  # Only used to build the output directory name; has no inputBinding.
  step:
    type: int
    doc: workflow step number
    label: The step number in the workflow
  embl:
    type: File
    doc: Reference genome file used in EMBL format
    label: Reference genome
    inputBinding:
      prefix: -input
  identifier:
    type: string
    doc: Name of the sample being analysed
    label: Sample name
    inputBinding:
      prefix: -id

# The output file name is derived from the `embl` input. The original
# expression referenced `inputs.embl_file`, which is not a declared input.
arguments:
  - java
  - "-Xmx1g"
  - "-jar"
  - /binaries/Conversion.jar
  - "-embl2hdt"
  - "-output"
  - Step_$(inputs.step)_Conversion/$(inputs.embl.basename).hdt

outputs:
  # info:
  #   type: stdout
  # error:
  #   type: stderr
  upload:
    type: Directory
    outputBinding:
      glob: Step_$(inputs.step)_Conversion
#!/usr/bin/env cwl-runner
# CWL tool description: runs FastQC on a list of FASTQ files and returns the
# per-step output directory together with the captured stdout/stderr.
cwlVersion: v1.0
class: CommandLineTool

label: "FASTQC run"
doc: |
  Performs quality control on FASTQ file

requirements:
  - class: InlineJavascriptRequirement
  # Stage an empty, writable directory that FastQC receives as --outdir.
  - class: InitialWorkDirRequirement
    listing:
      - entry: "$({class: 'Directory', listing: []})"
        entryname: Step_$(inputs.step)_QualityControl
        writable: true

# hints:
#   DockerRequirement:
#     dockerPull: wurssb/unlock_base

$namespaces:
  s: https://schema.org/
  edam: http://edamontology.org/

# Log files captured from the FastQC process.
stdout: fastqc.log
stderr: fastqc.error

arguments:
  - /binaries/linux/FastQC-0.11.8/fastqc
  - "--outdir"
  - Step_$(inputs.step)_QualityControl

inputs:
  # Only used to build the output directory name; has no inputBinding.
  step:
    type: int
    doc: workflow step number
    label: The step number in the workflow
  # Bound at position 100, while the arguments above carry no explicit
  # position.
  fastqs:
    type: File[]
    doc: FastQ file directory
    label: FASTQ file directory
    inputBinding:
      position: 100

outputs:
  info:
    type: stdout
  error:
    type: stderr
  upload:
    type: Directory
    outputBinding:
      glob: Step_$(inputs.step)_QualityControl
\ No newline at end of file
#!/usr/bin/env cwl-runner
# CWL tool description: runs /binaries/InterProScan.jar on an HDT genome file
# and collects the per-step output directory.
cwlVersion: v1.0
class: CommandLineTool

label: "Genome annotation with InterPro"
doc: |
  Runs Genome interpro annotation tool from SAPP

requirements:
  - class: InlineJavascriptRequirement
  # Stage an empty, writable directory named after the workflow step; the
  # annotated file is written into it and the directory is the tool output.
  - class: InitialWorkDirRequirement
    listing:
      - entry: "$({class: 'Directory', listing: []})"
        entryname: Step_$(inputs.step)_InterPro
        writable: true

hints:
  DockerRequirement:
    dockerPull: wurssb/unlock_base

$namespaces:
  s: https://schema.org/
  edam: http://edamontology.org/

# stdout: ngtax.log
# stderr: ngtax.error

inputs:
  # Only used to build the output path; has no inputBinding.
  step:
    type: int
    doc: workflow step number
    label: The step number in the workflow
  hdt:
    type: File
    doc: Reference genome file used in HDT format
    label: Reference genome
    inputBinding:
      prefix: -input
  # Comma-separated list passed as a single string after -applications.
  applications:
    type: string
    doc: InterPro tools to be used
    label: Interpro tools
    default: "TIGRFAM,PFAM,SUPERFAMILY"
    inputBinding:
      prefix: -applications

# The output file name is derived from the basename of the `hdt` input.
arguments:
  - java
  - "-Xmx1g"
  - "-jar"
  - /binaries/InterProScan.jar
  - "-output"
  - Step_$(inputs.step)_InterPro/$(inputs.hdt.basename).interproscan.hdt

outputs:
  # info:
  #   type: stdout
  # error:
  #   type: stderr
  upload:
    type: Directory
    outputBinding:
      glob: Step_$(inputs.step)_InterPro
#!/usr/bin/env cwl-runner
# CWL tool description: calls /binaries/IRODSTransfer.jar with --pull to fetch
# the listed paths into a local "Unprocessed" directory.
cwlVersion: v1.0
class: CommandLineTool

label: "Obtain data from irods"
doc: |
  Obtain the files needed for the workflow from irods

requirements:
  - class: InlineJavascriptRequirement
  # Stage an empty, writable "Unprocessed" directory as the download target.
  - class: InitialWorkDirRequirement
    listing:
      - entry: "$({class: 'Directory', listing: []})"
        entryname: "Unprocessed"
        writable: true

# hints:
#   DockerRequirement:
#     dockerPull: wurssb/unlock_base

$namespaces:
  s: https://schema.org/
  edam: http://edamontology.org/

# Log files captured from the transfer process.
stdout: irods.log
stderr: irods.error

arguments:
  - java
  - "-jar"
  - /binaries/IRODSTransfer.jar
  - "--local"
  - Unprocessed

inputs:
  # Optional flag; emitted as --pull when set to true.
  pull:
    type: boolean?
    label: "--pull for downloading data"
    inputBinding:
      position: 1
      prefix: "--pull"
  # Paths to download, passed after --values.
  files:
    type: string[]
    doc: files to download
    label: file paths
    inputBinding:
      position: 2
      prefix: "--values"

outputs:
  info:
    type: stdout
  error:
    type: stderr
  # Everything found directly inside the "Unprocessed" directory.
  files:
    type: File[]
    outputBinding:
      glob: "Unprocessed/*"
  # The "Unprocessed" directory itself.
  folder:
    type: Directory
    outputBinding:
      glob: "Unprocessed"
\ No newline at end of file
#!/usr/bin/env cwl-runner
# CWL tool description: calls /binaries/IRODSTransfer.jar with --push to send
# the given directories to a destination passed via --irods.
# The label and docs previously described downloading; they now describe the
# upload this tool performs.
cwlVersion: v1.0
class: CommandLineTool

label: "Upload data to irods"
doc: |
  Upload the results of the workflow to irods

requirements:
  - class: MultipleInputFeatureRequirement
  - class: InlineJavascriptRequirement
  # Stages an empty, writable "Unprocessed" directory.
  # NOTE(review): nothing visible in this tool uses it - confirm it is needed.
  - class: InitialWorkDirRequirement
    listing:
      - entry: "$({class: 'Directory', listing: []})"
        entryname: "Unprocessed"
        writable: true

# hints:
#   DockerRequirement:
#     dockerPull: wurssb/unlock_base

$namespaces:
  s: https://schema.org/
  edam: http://edamontology.org/

# stdout: irods.log
# stderr: irods.error

inputs:
  # Optional flag; emitted as --push when set to true.
  push:
    type: boolean?
    label: "--push for uploading data"
    inputBinding:
      position: 1
      prefix: "--push"
  # Directories to upload, passed after --values.
  folders:
    type: Directory[]
    doc: files/folders to upload
    label: paths to upload
    inputBinding:
      position: 2
      prefix: "--values"
  # Target location, passed after --irods.
  destination:
    type: string
    doc: Final destination on iRODS
    label: IRODS final destination
    inputBinding:
      position: 3
      prefix: "--irods"

arguments:
  - java
  - "-jar"
  - /binaries/IRODSTransfer.jar

# The tool declares no outputs; the stdout/stderr captures are disabled.
outputs: []
# outputs:
#   info:
#     type: stdout
#   error:
#     type: stderr
\ No newline at end of file
#!/usr/bin/env cwl-runner
# CWL tool description: runs /binaries/NGTax-2.0.95.jar in -ngtax mode and
# collects the per-step classification directory holding the .biom and .ttl
# results.
cwlVersion: v1.0
class: CommandLineTool

label: "NGTax run"
doc: |
  Runs NGTAX amplicon analysis

requirements:
  - class: InlineJavascriptRequirement
  # Stage an empty, writable directory named after the workflow step; the
  # result files are written into it and the directory is the tool output.
  - class: InitialWorkDirRequirement
    listing:
      - entry: "$({class: 'Directory', listing: []})"
        entryname: Step_$(inputs.step)_Classification
        writable: true

# hints:
#   DockerRequirement:
#     dockerPull: wurssb/unlock_base

$namespaces:
  s: https://schema.org/
  edam: http://edamontology.org/

# stdout: ngtax.log
# stderr: ngtax.error

inputs:
  # Only used to build the output paths; has no inputBinding.
  step:
    type: int
    doc: workflow step number
    label: The step number in the workflow
  # Example: "[AG]GGATTAGATACCC"
  forward_primer:
    type: string
    doc: Forward primer used
    label: The forward primer used
    inputBinding:
      prefix: -for_p
  # Example: "CGAC[AG][AG]CCATGCA[ACGT]CACCT"
  reverse_primer:
    type: string
    doc: Reverse primer used
    label: The reverse primer used
    inputBinding:
      prefix: -rev_p
  # Declared as a string, not a File, so the path is passed through as-is.
  reference_db:
    type: string
    doc: Reference database used in FASTA format
    label: Reference database
    inputBinding:
      prefix: -refdb
  folder:
    type: Directory
    doc: Demultiplexed folder
    label: Demultiplexed folder
    inputBinding:
      prefix: -folder
  rev_read_len:
    type: int
    doc: Read length of the reverse read
    label: Reverse read length
    inputBinding:
      prefix: -rev_read_len
  # The doc and label previously repeated the reverse-read text.
  for_read_len:
    type: int
    doc: Read length of the forward read
    label: Forward read length
    inputBinding:
      prefix: -for_read_len
  # No inputBinding: the value is only used to name the .biom/.ttl outputs.
  sample:
    type: string
    doc: Name of the sample being analysed
    label: Sample name

# NOTE(review): cwl_mapping_file.txt is not staged by anything visible in this
# tool - confirm where the file comes from.
arguments:
  - java
  - "-Xmx2g"
  - "-jar"
  - /binaries/NGTax-2.0.95.jar
  - "-ngtax"
  - "-primersRemoved"
  - "-mapFile"
  - cwl_mapping_file.txt
  - "-b"
  - Step_$(inputs.step)_Classification/$(inputs.sample).biom
  - "-t"
  - Step_$(inputs.step)_Classification/$(inputs.sample).ttl

outputs:
  # info:
  #   type: stdout
  # error:
  #   type: stderr
  upload:
    type: Directory
    outputBinding:
      glob: Step_$(inputs.step)_Classification
#!/usr/bin/env cwl-runner
# CWL tool description: runs /binaries/genecaller.jar with -single -prodigal
# on an HDT genome file and collects the per-step output directory.
# The label and doc previously read "Genome conversion", which belongs to the
# conversion tool; they now describe the gene prediction this tool runs.
cwlVersion: v1.0
class: CommandLineTool

label: "Gene prediction"
doc: |
  Runs the gene caller tool from SAPP using Prodigal

requirements:
  - class: InlineJavascriptRequirement
  # Stage an empty, writable directory named after the workflow step; the
  # predicted file is written into it and the directory is the tool output.
  - class: InitialWorkDirRequirement
    listing:
      - entry: "$({class: 'Directory', listing: []})"
        entryname: Step_$(inputs.step)_GenePrediction
        writable: true

hints:
  DockerRequirement:
    dockerPull: wurssb/unlock_base

$namespaces:
  s: https://schema.org/
  edam: http://edamontology.org/

# stdout: ngtax.log
# stderr: ngtax.error

inputs:
  # Only used to build the output path; has no inputBinding.
  step:
    type: int
    doc: workflow step number
    label: The step number in the workflow
  hdt:
    type: File
    doc: Reference genome file used in HDT format
    label: Reference genome
    inputBinding:
      prefix: -input
  codon:
    type: int
    doc: Codon table to be used
    label: Codon table
    inputBinding:
      prefix: -codon

# The output file name is derived from the basename of the `hdt` input.
arguments:
  - java
  - "-Xmx1g"
  - "-jar"
  - /binaries/genecaller.jar
  - "-single"
  - "-prodigal"
  - "-output"
  - Step_$(inputs.step)_GenePrediction/$(inputs.hdt.basename).prodigal.hdt

outputs:
  # info:
  #   type: stdout
  # error:
  #   type: stderr
  upload:
    type: Directory
    outputBinding:
      glob: Step_$(inputs.step)_GenePrediction
#!/usr/bin/env cwl-runner
# CWL workflow with four steps: download the input files (irods.cwl), run
# FastQC on them (fastqc.cwl), run NGTax on the download folder (ngtax.cwl),
# and upload both result directories (irods_upload.cwl).
cwlVersion: v1.0
class: Workflow

requirements:
  - class: StepInputExpressionRequirement
  - class: InlineJavascriptRequirement
  # The upload step's `folders` input takes two sources.
  - class: MultipleInputFeatureRequirement

inputs:
  files:
    type: string[]
    doc: files to download
    label: file paths
  destination:
    type: string
    doc: Final destination on iRODS
    label: IRODS final destination
  forward_primer:
    type: string
    doc: Forward primer used
    label: Forward primer
    default: "[AG]GGATTAGATACCC"
  reverse_primer:
    type: string
    doc: Reverse primer used
    label: Reverse primer
    default: "CGAC[AG][AG]CCATGCA[ACGT]CACCT"
  reference_db:
    type: string
    doc: Reference database used in FASTA format
    label: Reference database
    default: "/Databases/Silva/SILVA_132_SSURef_tax_silva.fasta.gz"
  rev_read_len:
    type: int
    doc: Read length of the reverse read
    label: Reverse read length
  # The doc and label previously repeated the reverse-read text.
  for_read_len:
    type: int
    doc: Read length of the forward read
    label: Forward read length
  sample:
    type: string
    doc: Name of the sample being analysed
    label: Sample name

# NOTE(review): neither output declares an outputSource, so no step result is
# wired to them - confirm whether they should be connected or removed.
outputs:
  info:
    type: string
  error:
    type: string

steps:
  ############################
  # Step: download the input files.
  irods_download:
    run: irods.cwl
    in:
      files: files
      pull:
        default: true
    out: [files, folder]
  ############################
  # Step 1: quality control on the downloaded files.
  fastqc:
    run: fastqc.cwl
    in:
      step:
        default: 1
      fastqs: irods_download/files
    out: [upload]
  ############################
  # Step 2: NGTax classification on the downloaded folder.
  ngtax:
    run: ngtax.cwl
    in:
      step:
        default: 2
      forward_primer: forward_primer
      reverse_primer: reverse_primer
      reference_db: reference_db
      folder: irods_download/folder
      rev_read_len: rev_read_len
      for_read_len: for_read_len
      sample: sample
    out: [upload]
  ############################
  # Step: upload the FastQC and NGTax result directories to `destination`.
  irods_upload:
    run: irods_upload.cwl
    in:
      folders: [fastqc/upload, ngtax/upload]
      destination: destination
      push:
        default: true
    out: []
  ############################
\ No newline at end of file