snakemake_DNAseq_pipeline

Still under active development; do not use for production.

• Added the mutect caller following GATK best practice. Added the freebayes caller. Will add the vardict caller.
• sample2json.py can deal with single-end and paired-end fastqs and matches tumor-normal pairs. Support for tumor samples without a matching normal still needs to be added.
• Added copy number/tumor purity analysis to the pipeline via sequenza.

workflow of the pipeline

    Dependencies

All the dependencies need to be on your PATH.

    echo $PATH

    R packages

    How to distribute workflows

read the docs

    ssh shark.mdanderson.org
    
    # start a screen session
    screen
    
    # make a folder, name it yourself, I named it workdir
    mkdir /rsch2/genomic_med/krai/workdir/
    
    cd /rsch2/genomic_med/krai/workdir/
    
    git clone https://gitlab.com/tangming2005/snakemake_DNAseq_pipeline
    
    cd snakemake_DNAseq_pipeline
    
    ## go to mutect branch
    git checkout mutect
    
## edit the config.yaml file as needed, e.g. set mouse or human for the reference genome, or the number of reads you want to downsample to
    nano config.yaml
    
    ## skip this if on Shark, samir has py351 set up for you. see below STEPS
    conda create -n snakemake python=3 snakemake
    source activate snakemake

    STEPS

    on Shark

There is a python3 environment set up. Just do:

    source activate py351

Create the samples.json file by feeding the script a fastq folder. This folder should contain all the samples.

    create a json file for samples.

The sample2json.py script can be used to generate a sample json file.

    It can deal with single-end and paired-end fastqs or aligned bam files.

To do: it also matters whether the tumor sample has a matching normal control or not.

An example json file:

    
    ## for paired end fastqs
    
    "sample_name":
    {"normal":
        {"R1":["normal1_R1.fastq", "normal2_R1.fastq"],
        "R2":["normal1_R2.fastq", "normal2_R2.fastq"]},
      "tumor":
        {"R1":["tumor1_R1.fastq", "tumor_2_R1.fastq"],
        "R2":["tumor1_R2.fastq", "tumor_2_R2.fastq"] }
    }
    
    ## for single end fastqs:
    "sample_name":
    {"normal":
        {"R1":["normal1_R1.fastq", "normal2_R1.fastq"]},
      "tumor":
        {"R1":["tumor1_R1.fastq", "tumor_2_R1.fastq"]}
    }
    
    # for bam files:
    
    "sample_name":
    {"normal": ["normal1.bam"],
      "tumor":  ["tumor1.bam"]}
    }

Please use the full path for the folder that contains your fastq files, and provide a tab-delimited txt file describing the sample information.

    e.g.

    For bam files:

sample_name is the name of the subject (patient). file_name is the name you submitted for sequencing; this name should be contained in the fastq or bam file name. sample_type is the type of sample sequenced for the same subject (patient).

    sample_name_sample_type will be used to construct the final output file name. e.g. patient1_leukocyte.sorted.bam, patient1_leukocyte_rln_recal.bam.

    so sample_name_sample_type should be unique.

In the config.yaml file, one needs to specify which sample_type is used as the control for calling mutations. In this example, leukocyte is used, so the vcf files patient1_pre-treatment_vs_patient1_leukocyte_mutect.vcf and patient1_recurrence_vs_patient1_leukocyte_mutect.vcf will be generated.

Avoid using underscore _ in the sample_type column (- is allowed). sample2json.py will check that.

    cat sample_info.txt
    sample_name file_name sample_type
    patient1  B17042956985-KY290-2  leukocyte
    patient1  P17042956986-KY290-2  ctDNA
    patient1  F17042956987-KY290-2  pre-treatment
    patient1  F17042956988-KY290-2  recurrence
    patient2  B17050457295-KY290-2  leukocyte
    patient2  P17050457296-KY290-2  ctDNA
    patient2  F17042956989-KY290-2  pre-treatment
    patient2  F17042956990-KY290-2  recurrence
    patient3  B17051358726-KY290-2  leukocyte
    patient3  P17051358727-KY290-2  ctDNA
    patient3  F17051358728-KY290-2  pre-treatment
    patient3  F17051358729-KY290-2  metastasis
    patient3  F17051859286-KY290-2  pre-treatment2
    patient4  B17031650918-KY290  leukocyte
    patient4  P17031650914-CLN-KY290  ctDNA
    patient4  F17031650907-KY290  pre-treatment
    patient4  F17031650908-KY290  recurrence
    patient4  F17031650909-KY290  metastasis
    patient5  B17031650917-KY290  leukocyte
    patient5  P17031650913-CLN-KY290  ctDNA
    patient5  F17031650905-KY290  pre-treatment
    patient5  F17031650906-KY290  recurrence
    

    python3 sample2json.py --fastq_dir /rsch2/genomic_med/krai/rawfastqs/ --meta sample_info.txt

    some information will be printed out and a samples.json file will be generated.

    check the file information in the json file:

    less -S samples.json

    dry run to test

    ## dry run
    snakemake -np

If there are no errors, proceed as below.

    Using DRMAA

    job control through drmaa

    DRMAA is only supported on Shark.

module load drmaa
./pyflow-drmaa-DNAseq.sh

When using drmaa, you can press Ctrl+C to stop the current run.

Dependent jobs are submitted one by one; if some jobs fail, the pipeline will stop. Good for initial testing.

    submit all jobs to the cluster

./pyflow-DNAseq.sh

All jobs will be submitted to the cluster queue. This is useful if you expect most of your jobs to succeed and want them queued to gain priority.

    job control

    To kill all of your pending jobs you can use the command:

    bkill `bjobs -u krai |grep PEND |cut -f1 -d" "`
    bjobs -pl
     Display detailed information of all pending jobs of the invoker.
    
    bjobs -ps
     Display only pending and suspended jobs.
    
    bjobs -u all -a
     Display all jobs of all users.
    
    bjobs -d -q short -m apple -u mtang1
 Display all the recently finished jobs submitted by user mtang1 to the queue short and executed on the host apple.

    rerun some of the jobs

    
# specify the name of a rule; all files associated with that rule will be rerun. e.g. rerun the freebayes calling rule:
    ./pyflow-DNAseq.sh -R freebayes
    
    ## rerun one sample, just specify the name of the target file
    
    ./pyflow-DNAseq.sh 08vcfanno/SKCM-M409-P011_tumor.vep.anno.vcf
    
## rerun only the vcfanno rule
    ./pyflow-DNAseq.sh -f vcfanno
    
    ## check snakemake -h
## -R, -f, -F and --until are useful

checking results after the run finishes

    
    snakemake --summary | sort -k1,1 | less -S
    
# or use the detailed summary, which will give you the commands used to generate the output and what input was used
    snakemake --detailed-summary | sort -k1,1 > snakemake_run_summary.txt

    clean the folders

I use echo first to see what will be removed; then you can remove everything for real (see below).

    find . -maxdepth 1 -type d -name "[0-9]*" | xargs echo rm -rf
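Once the echoed list looks right, drop the echo to actually remove the numbered result folders:

find . -maxdepth 1 -type d -name "[0-9]*" | xargs rm -rf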

    Snakemake does not trigger re-runs if I add additional input files. What can I do?

Snakemake has a kind of "lazy" policy about added input files if their modification date is older than that of the output files. One reason is that the information about what to do cannot be inferred just from the input and output files; additional information about the last run would need to be stored. Since behaviour would be inconsistent between cases where that information is available and where it is not, this functionality has been encoded as an extra switch. To trigger updates for jobs with changed input files, you can use the command line argument --list-input-changes in the following way:

    snakemake -n -R `snakemake --list-input-changes`
    

    How do I trigger re-runs for rules with updated code or parameters?

    snakemake -n -R `snakemake --list-params-changes`

    and

snakemake -n -R `snakemake --list-code-changes`

Interval files for chromosome-wise processing of GATK steps

    Whole exome and target panel sequencing

For the preprocessing steps of GATK, all steps are parallelized by chromosome by specifying --intervals.

If your genome reference does not use the chr prefix, the interval files need to have chr removed as well. In the human_g1k_v37_decoy.fasta file, chromosome M is denoted as MT.

    from illumina nextera protocol

    
    # download the bed file into the files folder in the repo
    wget https://support.illumina.com/content/dam/illumina-support/documents/documentation/chemistry_documentation/samplepreps_nextera/nexterarapidcapture/nexterarapidcapture_exome_targetedregions_v1.2.bed
    
    # create bed files by chromosome
    cat nexterarapidcapture_exome_targetedregions_v1.2.bed | sort -k1,1 -k2,2n | sed 's/^chr//' | sed 's/^M/MT/' | awk '{close(f);f=$1}{print > f".bed"}'
    
    # 1.bed, 2.bed, 3.bed ...X.bed, Y.bed, MT.bed will be created.
    

from xGen

    https://www.idtdna.com/pages/products/nextgen/target-capture/xgen-lockdown-panels/xgen-exome-panel

    
    wget https://www.idtdna.com/pages/docs/default-source/xgen-libraries/xGen-Lockdown-Panels/xgen-exome-research-panel-targets.bed?sfvrsn=6
    
    mv xgen-exome-research-panel-targets.bed\?sfvrsn\=6 xgen-exome-research-panel-targets.bed
    
## chrM is not in the captured regions. Change CHR in the config.yaml file accordingly.
    
    cat xgen-exome-research-panel-targets.bed | cut -f1-3 | sort -k1,1 -k2,2n | sed 's/^chr//' | awk '{close(f);f=$1}{print > f".bed"}'
    

    If you are using reference files from UCSC (keep chr):

    cat nexterarapidcapture_exome_targetedregions_v1.2.bed | sort -k1,1 -k2,2n | awk '{close(f);f=$1}{print > f".bed"}'

should do the job.

PLEASE place the resulting *.bed files in the files folder.

    For freebayes:

    cat xgen-exome-research-panel-targets.bed |cut -f1-3 | sort -k1,1 -k2,2n |  sed 's/^chr//' |  awk '{print $1":"$2"-"$3}' > xgen-exome-research-panel-targets_no_chr_freebayes.txt

    For lancet:

lancet does local assembly, so a larger window size is needed to capture all the reads; see the issue here: https://github.com/nygenome/lancet/issues/12

Create bed files extended by 300 bp upstream and downstream:

    cat xgen-exome-research-panel-targets.bed | cut -f1-3 | sort -k1,1 -k2,2n | awk '{print$1"\t"$2-300"\t"$3+300}' | sed 's/^chr//' | awk '{close(f);f=$1}{print > f"_extend.bed"}'

    whole genome sequencing

You do not need to provide --intervals to GATK for WGS. However, some regions of the genome are hard to sequence and the data there are not reliable. Including those regions can slow down the process and produce messy data.

    see a post on the GATK forum.

    A file ceph18.b37.include.2014-01-15.bed I borrowed from speedseq is also in the files folder.

    wget https://raw.githubusercontent.com/hall-lab/speedseq/master/annotations/ceph18.b37.include.2014-01-15.bed
    
    cat ceph18.b37.include.2014-01-15.bed | sort -k1,1 -k2,2n | awk '{close(f);f=$1}{print > f".bed"}'

freebayes accepts target regions in the format 1:10468-11803:

    
    cat ceph18.b37.include.2014-01-15.bed | cut -f1-3 | sort -k1,1 -k2,2n | awk '{print $1":"$2"-"$3}' > ceph18.b37.include.2014-01-15_for_freebayes.txt
    

    speed up GATK

    In addition to the scatter-gather pattern by chromosome mentioned above, some GATK tools can run in multiple threads.

    * `-nt / --num_threads`
    controls the number of data threads sent to the processor
    
    * `-nct / --num_cpu_threads_per_data_thread`
    controls the number of CPU threads allocated to each data thread

    NT is data multithreading, NCT is CPU multithreading and SG is scatter-gather:

Tool  Full name               Type of traversal   NT  NCT  SG
RTC   RealignerTargetCreator  RodWalker           +   -    -
IR    IndelRealigner          ReadWalker          -   -    +
BR    BaseRecalibrator        LocusWalker         -   +    +
PR    PrintReads              ReadWalker          -   +    -
RR    ReduceReads             ReadWalker          -   -    +
HC    HaplotypeCaller         ActiveRegionWalker  -   (+)  +
UG    UnifiedGenotyper        LocusWalker         +   +    +

    Recommended settings:

Tool                 RTC  IR  BR      PR   RR  HC      UG
Available modes      NT   SG  NCT,SG  NCT  SG  NCT,SG  NT,NCT,SG
Cluster nodes        1    4   4       1    4   4       4 / 4 / 4
CPU threads (-nct)   1    1   8       4-8  1   4       3 / 6 / 24
Data threads (-nt)   24   1   1       1    1   1       8 / 4 / 1
Memory (Gb)          48   4   4       4    4   16      32 / 16 / 4
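As a concrete example of the settings above, a single-node BaseRecalibrator run with 8 CPU threads might look like the sketch below (file names are placeholders, not the pipeline's exact rule):

java -Xmx4g -jar GenomeAnalysisTK.jar \
    -T BaseRecalibrator \
    -R human_g1k_v37_decoy.fasta \
    -I patient1_leukocyte.sorted.bam \
    -knownSites dbsnp.vcf \
    -L files/1.bed \
    -nct 8 \
    -o patient1_leukocyte_1_recal.table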

Parallel processing by chromosome is implemented in this pipeline.

    Further readings: VQSR

    https://gatkforums.broadinstitute.org/gatk/discussion/8019/which-gatk-parameters-to-consider-in-whole-exome-sequencing-data

for exomes you need at least 30 samples to run VQSR

    For BQSR: it is best to have greater than 100 million bases per read group

    https://gatkforums.broadinstitute.org/gatk/discussion/comment/14269#Comment_14269

    "Small datasets" from the -L FAQ refers to datasets with less than 100M bases.

    If you have 150M bases (sequenced reads) for a small targeted experiment, you should be fine to run BQSR. 1B is even better. Pretty much, anything above 100M bases is fine, but the bigger the dataset, the more accurate BQSR will be.

freebayes parameters

    borrowed from bcbio http://bcb.io/2015/03/05/cancerval/ https://github.com/chapmanb/bcbio-nextgen/blob/8c3206f131f9b97715d33141e558e1e38a07ac12/bcbio/variation/freebayes.py#L154

    freebayes -f hg19.fa --genotype-qualities --strict-vcf --ploidy 2 --targets regions.bed --min-repeat-entropy 1 --no-partial-observations --min-alternate-fraction 0.05 --pooled-discrete --pooled-continuous --report-genotype-likelihood-max --allele-balance-priors-off tumor.bam normal.bam | bcftools filter -i 'ALT="<*>" || QUAL > 5'

    GATK best practice

This is the most up-to-date practice for mutect2, which can skip the indel-realignment step. For mutect1, indel realignment is still required.

    mutect2 can detect both SNVs and indels.
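For reference, a GATK3-style MuTect2 call on one chromosome interval might look roughly like this (a sketch; the bam, dbsnp and cosmic files are placeholders, and the pipeline's actual rule may differ):

java -Xmx8g -jar GenomeAnalysisTK.jar \
    -T MuTect2 \
    -R human_g1k_v37_decoy.fasta \
    -I:tumor patient1_pre-treatment_rln_recal.bam \
    -I:normal patient1_leukocyte_rln_recal.bam \
    --dbsnp dbsnp.vcf \
    --cosmic cosmic.vcf \
    -L files/1.bed \
    -o patient1_pre-treatment_vs_patient1_leukocyte_1_mutect.vcf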

    Note on read group

    read this post in the GATK forum.

    example:

    @RG ID:H0164.2  PL:illumina PU:H0164ALXX140820.2    LB:Solexa-272222    PI:0    DT:2014-08-20T00:00:00-0400 SM:NA12878  CN:BI

    Meaning of the read group fields required by GATK

    * ID = Read group identifier
    This tag identifies which read group each read belongs to, so each read group's ID must be unique. It is referenced both in the read group definition line in the file header (starting with @RG) and in the RG:Z tag for each read record. Note that some Picard tools have the ability to modify IDs when merging SAM files in order to avoid collisions. In Illumina data, read group IDs are composed using the flowcell + lane name and number, making them a globally unique identifier across all sequencing data in the world.
    Use for BQSR: ID is the lowest denominator that differentiates factors contributing to technical batch effects: therefore, a read group is effectively treated as a separate run of the instrument in data processing steps such as base quality score recalibration, since they are assumed to share the same error model.
    
    * PU = Platform Unit
The PU holds three types of information: the {FLOWCELL_BARCODE}.{LANE}.{SAMPLE_BARCODE}. The {FLOWCELL_BARCODE} refers to the unique identifier for a particular flow cell, {LANE} indicates the lane of the flow cell, and {SAMPLE_BARCODE} is a sample/library-specific identifier. The PU is not required by GATK, but if present it takes precedence over ID for base recalibration. In the example shown earlier, the ID and PU fields appropriately differentiate the flow cell lane, marked by .2, a factor that contributes to batch effects.
    
    * SM = Sample
    The name of the sample sequenced in this read group. GATK tools treat all read groups with the same SM value as containing sequencing data for the same sample, and this is also the name that will be used for the sample column in the VCF file. Therefore it's critical that the SM field be specified correctly. When sequencing pools of samples, use a pool name instead of an individual sample name.
    
    * PL = Platform/technology used to produce the read
    This constitutes the only way to know what sequencing technology was used to generate the sequencing data. Valid values: ILLUMINA, SOLID, LS454, HELICOS and PACBIO.
    
    * LB = DNA preparation library identifier
    MarkDuplicates uses the LB field to determine which read groups might contain molecular duplicates, in case the same DNA library was sequenced on multiple lanes.

    If your sample collection's BAM files lack required fields or do not differentiate pertinent factors within the fields, use Picard's AddOrReplaceReadGroups to add or appropriately rename the read group fields as outlined here.

Use Picard's AddOrReplaceReadGroups to appropriately label read group (@RG) fields, coordinate-sort, and index a BAM file. Only the five required @RG fields are included in the command shown. Consider the other optional @RG fields for better record keeping.

# be sure to change RGID from the default of 1
java -jar picard.jar AddOrReplaceReadGroups \
    INPUT=reads.bam \
    OUTPUT=reads_addRG.bam \
    SORT_ORDER=coordinate \
    CREATE_INDEX=true \
    RGID=H0164.2 \
    RGLB=library1 \
    RGPL=illumina \
    RGPU=H0164ALXX140820.2 \
    RGSM=sample1

    Deriving ID and PU fields from read names

    H0164ALXX140820:2:1101:10003:23460
    H0164ALXX140820:2:1101:15118:25288

    Breaking down the common portion of the query names:

    H0164____________ #portion of @RG ID and PU fields indicating Illumina flow cell
    _____ALXX140820__ #portion of @RG PU field indicating barcode or index in a multiplexed run
    _______________:2 #portion of @RG ID and PU fields indicating flow cell lane

    Multi-sample and multiplexed example

    Suppose I have a trio of samples: MOM, DAD, and KID. Each has two DNA libraries prepared, one with 400 bp inserts and another with 200 bp inserts. Each of these libraries is run on two lanes of an Illumina HiSeq, requiring 3 x 2 x 2 = 12 lanes of data. When the data come off the sequencer, I would create 12 bam files, with the following @rg fields in the header:

    Dad's data:
    @RG     ID:FLOWCELL1.LANE1      PL:ILLUMINA     LB:LIB-DAD-1 SM:DAD      PI:200
    @RG     ID:FLOWCELL1.LANE2      PL:ILLUMINA     LB:LIB-DAD-1 SM:DAD      PI:200
    @RG     ID:FLOWCELL1.LANE3      PL:ILLUMINA     LB:LIB-DAD-2 SM:DAD      PI:400
    @RG     ID:FLOWCELL1.LANE4      PL:ILLUMINA     LB:LIB-DAD-2 SM:DAD      PI:400
    
    Mom's data:
    @RG     ID:FLOWCELL1.LANE5      PL:ILLUMINA     LB:LIB-MOM-1 SM:MOM      PI:200
    @RG     ID:FLOWCELL1.LANE6      PL:ILLUMINA     LB:LIB-MOM-1 SM:MOM      PI:200
    @RG     ID:FLOWCELL1.LANE7      PL:ILLUMINA     LB:LIB-MOM-2 SM:MOM      PI:400
    @RG     ID:FLOWCELL1.LANE8      PL:ILLUMINA     LB:LIB-MOM-2 SM:MOM      PI:400
    
    Kid's data:
    @RG     ID:FLOWCELL2.LANE1      PL:ILLUMINA     LB:LIB-KID-1 SM:KID      PI:200
    @RG     ID:FLOWCELL2.LANE2      PL:ILLUMINA     LB:LIB-KID-1 SM:KID      PI:200
    @RG     ID:FLOWCELL2.LANE3      PL:ILLUMINA     LB:LIB-KID-2 SM:KID      PI:400
    @RG     ID:FLOWCELL2.LANE4      PL:ILLUMINA     LB:LIB-KID-2 SM:KID      PI:400

Add the read group when aligning with bwa via the -R option; see this post: https://www.biostars.org/p/78400/

Samples from different lanes need a different ID in the @RG tag.
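For example, lane 1 of Dad's 200 bp library from the trio above could be aligned with its read group set on the command line (fastq and reference names here are placeholders):

bwa mem -t 8 \
    -R '@RG\tID:FLOWCELL1.LANE1\tPU:FLOWCELL1.LANE1\tPL:ILLUMINA\tLB:LIB-DAD-1\tSM:DAD' \
    human_g1k_v37_decoy.fasta \
    DAD_lane1_R1.fastq DAD_lane1_R2.fastq > DAD_lane1.sam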

    Install gemini and annotation data

    Gemini is a tool from Aaron Quinlan's group

    conda install -c bioconda gemini
    
    cd /scratch/genomic_med/apps/gemini
    
    gemini update --dataonly

    The annotation data were downloaded to /scratch/genomic_med/apps/gemini/gemini/data

To annotate the vcf files, use vcfanno. An example configuration file:

    https://github.com/brentp/vcfanno/blob/master/example/gem.conf

    SNPdb129 hg19 version

dbSNP 129 is probably the latest version of dbSNP that is not contaminated by the recent large-scale cancer sequencing projects. It was only released for hg18, but annovar provides a lifted-over hg19 version.

    Download that by:

    
perl annotate_variation.pl -buildver hg19 -downdb -webfrom annovar snp129  humandb/

## change to bed format for vcfanno (sort so that tabix can index it)
less -S hg19_snp129.txt | cut -f2-7 | sed 's/^chr//' | sort -k1,1 -k2,2n > hg19_snp129.bed

## compress and index for vcfanno
bgzip hg19_snp129.bed
tabix -p bed hg19_snp129.bed.gz

## copy them to the vcfanno_db folder: /scratch/genomic_med/apps/gemini/gemini/data/
cp hg19_snp129.bed.gz hg19_snp129.bed.gz.tbi /scratch/genomic_med/apps/gemini/gemini/data/

    add it to the gem.conf file:

    ## dbsnp129 hg19 version
    [[annotation]]
    file="hg19_snp129.bed"
    columns=[4]
    names=["dbsnp129_rs_ids"]
    ops=["concat"]

    filter freebayes calls

    from bcbio http://bcb.io/2015/03/05/cancerval/

    bcftools filter --soft-filter 'FBQualDepth' -e '(AF[0] <= 0.5 && (DP < 4 || (DP < 13 && %QUAL < 10))) || (AF[0] > 0.5 && (DP < 4 && %QUAL < 50))' -m '+'
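Applied to a freebayes VCF from this pipeline, the call might look like this (file names are placeholders):

bcftools filter --soft-filter 'FBQualDepth' \
    -e '(AF[0] <= 0.5 && (DP < 4 || (DP < 13 && %QUAL < 10))) || (AF[0] > 0.5 && (DP < 4 && %QUAL < 50))' \
    -m '+' -O z -o patient1_pre-treatment_vs_patient1_leukocyte_freebayes.filtered.vcf.gz \
    patient1_pre-treatment_vs_patient1_leukocyte_freebayes.vcf.gz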
    

    Somatic Score (SSC) filtering from speedseq.

    I checked the source code:

    function somatic_filter() {
        awk -v MINQUAL="$1" -v SSC_THRES="$2" -v ONLY_SOMATIC="$3" 'BEGIN {NORMAL=10; TUMOR=11; GL_IDX=0;}
        {
            if ($0~"^#") { print ; next; }
            if (! GL_IDX) {
                split($9,fmt,":")
                for (i=1;i<=length(fmt);++i) { if (fmt[i]=="GL") GL_IDX=i }
            }
            split($NORMAL,N,":");
            split(N[GL_IDX],NGL,",");
            split($TUMOR,T,":");
            split(T[GL_IDX],TGL,",");
            LOD_NORM=NGL[1]-NGL[2];
            LOD_TUMOR_HET=TGL[2]-TGL[1];
            LOD_TUMOR_HOM=TGL[3]-TGL[1];
            if (LOD_TUMOR_HET > LOD_TUMOR_HOM) { LOD_TUMOR=LOD_TUMOR_HET }
            else { LOD_TUMOR=LOD_TUMOR_HOM }
            DQUAL=LOD_TUMOR+LOD_NORM;
            if (DQUAL>=SSC_THRES && $NORMAL~"^0/0") {
                $7="PASS"
                $8="SSC="DQUAL";"$8
                print
            }
            else if (!ONLY_SOMATIC && $6>=MINQUAL && $10~"^0/0" && ! match($11,"^0/0")) {
                $8="SSC="DQUAL";"$8
                print
            }
        }' OFS="\t"
    }
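The function reads a VCF on stdin, so applying it looks roughly like this (the MINQUAL, SSC_THRES and ONLY_SOMATIC values below are only illustrative; check speedseq for the defaults it actually uses):

cat patient1_pre-treatment_vs_patient1_leukocyte_freebayes.vcf \
    | somatic_filter 1 18 0 \
    > patient1_pre-treatment_vs_patient1_leukocyte_freebayes.ssc.vcf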
    

    Install VEP

    The latest version of vep is on github http://www.ensembl.org/info/docs/tools/vep/script/vep_download.html#installer

It was version 89 when this was written (bioinformatics tools evolve too fast!).

    check this gist as well https://gist.github.com/ckandoth/f265ea7c59a880e28b1e533a6e935697

    cd /scratch/genomic_med/apps
    git clone https://github.com/Ensembl/ensembl-vep.git
    cd ensembl-vep
    git status
    # the Ensembl API will be installed
    perl INSTALL.pl
    export VEP_DATA="/scratch/genomic_med/apps/ensembl-vep-data"
    export VEP_PATH="/scratch/genomic_med/apps/ensembl-vep"
    
    rsync -avhP rsync://ftp.ensembl.org/ensembl/pub/release-89/variation/VEP/homo_sapiens_vep_89_GRCh37.tar.gz $VEP_DATA
    tar -xvzf $VEP_DATA/homo_sapiens_vep_89_GRCh37.tar.gz -C $VEP_DATA

    install the reference FASTAs for GRCh37:

    a fasta file $VEP_DATA/homo_sapiens/89_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz will be downloaded.

    perl INSTALL.pl --AUTO f --SPECIES homo_sapiens --ASSEMBLY GRCh37 --DESTDIR $VEP_PATH --CACHEDIR $VEP_DATA
    

Convert the offline cache for use with tabix, which significantly speeds up the lookup of known variants:

    perl convert_cache.pl --species homo_sapiens --version 89_GRCh37 --dir $VEP_DATA

    Annotate

    vep --species homo_sapiens --assembly GRCh37 --offline --no_stats --sift b --ccds --uniprot --hgvs --symbol --numbers --domains --gene_phenotype --canonical --protein --biotype --uniprot --tsl --pubmed --variant_class --shift_hgvs 1 --check_existing --total_length --allele_number --no_escape --xref_refseq --failed 1 --vcf --minimal --flag_pick_allele --pick_order canonical,tsl,biotype,rank,ccds,length --dir $VEP_DATA --fasta $VEP_DATA/homo_sapiens/89_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz --input_file example_GRCh37.vcf --output_file example_GRCh37.vep.vcf --polyphen b --af_1kg --af_esp --regulatory

    If you got an error message:

    -------------------- EXCEPTION --------------------
    MSG: ERROR: Cannot index bgzipped FASTA file with Bio::DB::Fasta

Most likely, your perl version is too old. See an issue here.

    options:

1. You can update your system's perl to the suggested version (which was not suitable in my case)
2. Or one can install a local copy of the correct perl version. see here
3. As the error is caused by reading a gzipped file, one can simply unzip the reference (see the command after this list).
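For option 3, unzipping the reference is simply the command below; afterwards point --fasta at the uncompressed .fa file:

gunzip $VEP_DATA/homo_sapiens/89_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz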

    Make sure you have write access to the folder where the fasta file resides. I am placing the fasta in our department shared folder.

    from http://www.ensembl.org/info/docs/tools/vep/script/vep_options.html

    The first time you run the script with this parameter an index will be built which can take a few minutes. This is required if fetching HGVS annotations (--hgvs) or checking reference sequences (--check_ref) in offline mode (--offline).

    Prioritize the tumor-only variants

    from bcbio: http://bcb.io/2015/03/05/cancerval/

bcbio then extracts these to prioritize variants with high or medium predicted impact, not present in 1000 genomes or ExAC at more than 1% in any subpopulation, or identified as pathogenic in COSMIC or ClinVar.
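With the annotated VCF loaded into a gemini database, that kind of prioritization roughly corresponds to a query like the one below (a sketch: the database name is a placeholder, and the column names and 1% cutoffs should be checked against your gemini schema and the bcbio code):

gemini query --header \
    -q "SELECT chrom, start, end, ref, alt, gene, impact, impact_severity FROM variants \
        WHERE impact_severity IN ('HIGH', 'MED') \
        AND (aaf_1kg_all IS NULL OR aaf_1kg_all < 0.01) \
        AND (aaf_exac_all IS NULL OR aaf_exac_all < 0.01)" \
    patient1_tumor_only.db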

    How to determine LOH from Sequenza output

    see a post from the sequenza google group https://groups.google.com/forum/#!topic/sequenza-user-group/k-TgMaYlE6Q

LOH can generally be determined when B is equal to 0. If you are interested in copy-neutral LOH, you should also look at the A value: e.g. A == 2 and B == 0 means a genotype of AA (copy-neutral LOH in a diploid chromosome, where the normal genotype would be AB); A == 1 and B == 0 means a genotype of A, indicating a copy number of 1 (not copy-neutral for a diploid chromosome) and also a loss of heterozygosity in that segment.

    So in short you need to evaluate the B parameter to get all the LOH regions, and the A parameter if you want to further characterise the LOH.
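For example, LOH segments (B == 0) can be pulled out of a sequenza *_segments.txt file with something like this (a sketch; it assumes a tab-delimited file with a header column named B):

awk -F'\t' 'NR == 1 {for (i = 1; i <= NF; i++) {h = $i; gsub(/"/, "", h); col[h] = i} print; next}
            $(col["B"]) != "NA" && $(col["B"]) == 0' patient1_segments.txt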

    Questions on BAF and LogR

To simplify the model fitting, we identify the B-allele frequency, defining the B allele as the minor allele. So the minor allele can only have a frequency between 0 and 0.5, while the major allele has a range between 0.5 and 1. The seqz file is produced by comparing, position by position, the normal and tumor mpileups. We define het positions based on the normal sample genotype, and annotate the A and B allele frequencies of those positions only from the tumor sample in the seqz file.

    So the A and B frequencies in the seqz file are not to be considered to define zygosity (if the sample is high cellularity, eg cell lines, in LOH regions the A/B frequencies would be 1/0 respectively).

Regarding the depth.ratio, the numbers in the seqz file are merely the ratio between the depths of the two pileups. The GC normalization is applied during the processing in R; otherwise the average ratio depends on the two library sizes.

The normalization step in R is very simple: in practice we calculate the mean ratio value for each GC "window", and then use the results to normalize the depth ratio based on the respective GC content. This way the average ratio is 1, solving library normalization and GC normalization.

You can obtain the logR by taking log2 of the normalized depth ratio (the depth.ratio column of the segments.txt file).
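So a per-segment logR can be computed directly from the depth.ratio column, e.g. (a sketch; it assumes a tab-delimited *_segments.txt whose first three columns are chromosome, start and end, with a header column named depth.ratio):

awk -F'\t' 'NR == 1 {for (i = 1; i <= NF; i++) {h = $i; gsub(/"/, "", h); col[h] = i}; next}
            $(col["depth.ratio"]) != "NA" {print $1, $2, $3, log($(col["depth.ratio"])) / log(2)}' OFS="\t" patient1_segments.txt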

    alternative solutions of ploidy and purity:

e.g. the contents of sample_alternative_solutions.txt are:

    "cellularity"	"ploidy"	"SLPP"
    1	3.8	0.795405007607475
    1	1.9	6.67188954220472e-12
    1	5.7	3.24694972360125e-14

The default solution is cellularity = 1, ploidy = 3.8. To use one of the other alternative solutions:

sequenza.results(..., cellularity = 1, ploidy = 1.9)