variantfilter.py adds flags to variants that are near inversions, and limits...

variantfilter.py adds flags to variants that are near inversions, and limits the raw bases shown in each variant
parent 60dbf8cd
......@@ -12,8 +12,6 @@ unset tmpdir
tmpdir=$(mktemp -p /tmp -d biodiff.XXXXXX)
refdir="$tmpdir/ref"
qrydir="$tmpdir/qry"
#refdir=refdirect
#qrydir=querydirect
cleanup () {
......@@ -57,8 +55,6 @@ fi
# call dnadiff to find location of inversions, in terms of querry and reference position
isolatenpath=${query%.fasta} # get isolate ID by stripping suffix
isolate=${isolatenpath##*/} # and stripping path
#dnadiff -p $isolate $reference $query
#../../inversionfix.py -r $isolate.rdiff -q $isolate.qdiff -v $isolate-temp.vcf -f $query -t $isolate-temp.fasta
dnadiff -p $tmpdir/$isolate $reference $query
../../inversionfix.py -r $tmpdir/$isolate.rdiff -q $tmpdir/$isolate.qdiff -v $tmpdir/$isolate-temp.vcf -f $query -p $reference -t $tmpdir/$isolate-temp.fasta
# inversionfix.py will need to be made an exacutable in the path eventually
......@@ -85,10 +81,15 @@ fi
cat <<EOF
##fileformat=VCFv4.2
##source=biodiffV${VERSION}
##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Length of Structural Variation"
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Lable of Structural Variation Type. Key: INV=Inversion"
##INFO=<ID=Ambiguity,Number=1,Type=String,Description="Reason for Ambiguous Variant Call. Key: Inside_Inversion=Biodiff assumes this variant occured after the surrounding inversion event however the true phylogeny is uncertain, Near_Inversion=Biodiff still produces erroneous SNP and INDEL calls near the edges of inversions"
#CHROM POS ID REF ALT QUAL FILTER INFO
EOF
diff -irdb -U1 "$refdir" "$qrydir" \
| udiff2vcf >> $tmpdir/$isolate-temp.vcf # append output of diff to temp.vcf file, which has the inversion calls
sort -k2 -n $tmpdir/$isolate-temp.vcf # sort by position, and output to stdout
python ../../variantfilter.py -i $tmpdir/$isolate-temp.vcf -o $tmpdir/$isolate-temp-filtered.vcf
sort -k2 -n $tmpdir/$isolate-temp-filtered.vcf # sort by position, and output to stdout
......@@ -78,8 +78,9 @@ def main():
rdiff = cleanline.split('\t')
# rdiff[2] is genome position, rdiff[1] is variation type (GAP, INV, etc)
if rdiff[1] == 'INV' and not line2inv:
invstart = refineposition(args.qfasta, rseq, int(rdiff[2])) # blast position in reference to query sequence
vcfstring = chromname + '\t' + str(invstart) + '\t.\tN\tINV\t.\tPASS\t'
invstart = refineposition(args.qfasta, rseq, int(rdiff[2]))
# refinepositition() will blast region around INV start in reference to query sequence
vcfstring = chromname + '\t' + str(invstart) + '\t.\tN\t<INV>\t.\tPASS\t'
line2inv = True
elif rdiff[1] == 'INV':
invend = refineposition(args.qfasta, rseq, int(rdiff[2]))
......
#!/usr/bin/env python
import argparse
def main():
parser = argparse.ArgumentParser(description='Adjusts output of udiff2vcf',
epilog='Report only position and length for variants over 10bp long.')
parser.add_argument('-i', '--invcf', help='input vcf file from udiff2vcf')
parser.add_argument('-o', '--ovcf', help='output vcf file (no header)')
args = parser.parse_args()
outlist = list() # list to store modified lines for output as new vcf file
inversionlist = list() # list to store inversion positions, to check if variants are inside or near them
with open(args.invcf) as infile:
for line in infile:
cleanline = line.strip('\n')
vcfbits = cleanline.split('\t')
if vcfbits[4] == '<INV>': # parse inversion entries to create list of inversion positions
infobits = vcfbits[7].split(';')
invlength = infobits[0].split('=')
inversiondict = dict(Start=int(vcfbits[1]), End=int(vcfbits[1])+int(invlength[1]))
inversionlist.append(inversiondict)
outlist.append(line)
# Note: All SV entrees are at the start of the input file
else: # parse other entries and adjust based on length of REF/ALT and position relative to inversions
if len(vcfbits[3]) > 10 or len(vcfbits[4]) > 10:
vcfbits[3] = '.'
vcfbits[4] = '.'
vcfbits[7] = 'SVLEN=' + str(len(vcfbits[4]))
intrainv = False # flag to track if variant is inside an inversion
nearinv = False # flag to track if variant is near the edge of an inversion
for inv in inversionlist:
if inv['Start'] < int(vcfbits[1]) < inv['End']:
intrainv = True
if inv['Start'] - 10 < int(vcfbits[1]) < inv['Start'] + 1:
nearinv = True
if inv['End'] - 1 < int(vcfbits[1]) < inv['End'] + 10:
nearinv = True
if intrainv and vcfbits[7] == '.': # replace the . if the INFO column is empty, append otherwise
vcfbits[7] = 'Ambiguity=Inside_Inversion'
elif intrainv:
vcfbits[7] += ';Ambiguity=Inside_Inversion'
if nearinv and vcfbits[7] == '.': # replace the . if the INFO column is empty, append otherwise
vcfbits[7] = 'Ambiguity=Near_Inversion'
elif nearinv:
vcfbits[7] += ';Ambiguity=Near_Inversion'
newline = '\t'.join(vcfbits) + '\n' # convert entry back to string for writing
outlist.append(newline)
# write new temp vcf
with open(args.ovcf, 'w') as vfile:
for oline in outlist:
vfile.write(oline)
if __name__ == '__main__':
main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment