biodiff now integrates record of inversions into vcf format output

inversionfix.py now in addition to creating a temp.fasta file
with the inversions undone, which the rest of biodiff then uses,
but now also outputs a temp.vcf file which records the inversions,
in terms of the reference positions, in vcf format.

biodiff now append its udiff2vcf to temp.vcf, which is then
sorted by postion (2nd column) and output to standard out.

Tested with test/lambda-phage/inversion.fasta

Currently the CHROM value of the inversion entries do not match
the other entries in the vcf output. Next change to inversionfix.py
will have it read in the querry fasta file to get the correct
chromosome name, to match the rest of biodiff.

Eventually inversionfix.py should use a blast of a small (100 bp)
section of the querry centered around the inversion starts and ends
against the reference, to more precisely find the breakpoints.
The positions provided by dnadiff (nucmer) are often off by a few
bases, resulting in several incorrect biodiff calls around the edges
of inversions (though not nearly as many as before).
parent 95805f8e
......@@ -54,12 +54,11 @@ then
usage
fi
# here we can put the dnadiff call etc
# call dnadiff to find location of inversions, in terms of querry and reference position
isolatenpath=${query%.fasta} # get isolate ID by stripping suffix
isolate=${isolatenpath##*/} # and stripping path
dnadiff -p $isolate $reference $query
echo "dnadiff finished"
./inversionfix.py -r $isolate.rdiff -q $isolate.qdiff -v $isolate-temp.vcf -f $query -t $isolate-temp.fasta
../../inversionfix.py -r $isolate.rdiff -q $isolate.qdiff -v $isolate-temp.vcf -f $query -t $isolate-temp.fasta
# inversionfix.py will need to be made an exacutable in the path eventually
# call $isolate-temp.fasta instead of $query from now on below
......@@ -89,4 +88,6 @@ cat <<EOF
EOF
diff -irdb -U1 "$refdir" "$qrydir" \
| udiff2vcf # | python inversionsertion.py
| udiff2vcf >> $isolate-temp.vcf # append output of diff to temp.vcf file, which has the inversion calls
sort -k2 -n $isolate-temp.vcf # sort by position, and output to stdout
......@@ -17,20 +17,26 @@ def main():
parser.add_argument('-t', '--tfasta', help='temp fasta file', default='out-temp.fasta')
args = parser.parse_args()
print(args)
chromname = 'snps_indels|NC_001416.1|'
#chromname = 'snps_indels|NC_001416.1|'
chromname = '1'
# parse args.rdiff and make args.vcf with just the inversions, translocations, etc. in vcf format (no header)
# which biodiff will eventually combine with its own vcf from the other differences it finds
vcflines = []
with open(args.rdiff) as rfile:
line2inv = False # dnadiff records inversions in 2 lines. first occurrence is start, 2nd is end.
invstart = 0
for line in rfile:
cleanline = line.strip('\n')
rdiff = cleanline.split('\t')
#rdiff[2] is genome position, rdiff[1] is variation type (GAP, INV, etc), rdiff[4] is length (in reference)
if rdiff[1] == 'INV' or rdiff[1] == 'JMP' or rdiff[1] == 'BRK':
vcfstring = chromname + '\t' + rdiff[2] + '\t.\t.\t.\t.\tPASS\t' \
+ rdiff[1] + '|' + 'Length:' + rdiff[4] + '\n'
# rdiff[2] is genome position, rdiff[1] is variation type (GAP, INV, etc)
if rdiff[1] == 'INV' and not line2inv:
vcfstring = chromname + '\t' + rdiff[2] + '\t.\tN\tINV\t.\tPASS\t'
line2inv = True
invstart = int(rdiff[2])
elif rdiff[1] == 'INV':
vcfstring += 'SVLEN=' + str(int(rdiff[2])-invstart) + ';SVTYPE=INV\n'
vcflines.append(vcfstring)
line2inv = False
with open(args.vcf, 'w') as vfile:
for line in vcflines:
vfile.write(line)
......@@ -43,33 +49,18 @@ def main():
translocations = []
breaks = []
with open(args.qdiff) as qfile:
line2inv = 0
line2inv = False # dnadiff records inversions in 2 lines. first occurrence is start, 2nd is end.
for line in qfile:
cleanline = line.strip('\n')
qdiff = cleanline.split('\t')
if qdiff[1] == 'INV' and line2inv == 0: # INV ifo in 2 lines. first line has Start, 2nd has End
if qdiff[1] == 'INV' and not line2inv: # INV ifo in 2 lines. first line has Start, 2nd has End
vardict = dict(Start=int(qdiff[2]))
inversions.append(vardict)
line2inv = 1
line2inv = True
elif qdiff[1] == 'INV':
inversions[-1]['End'] = int(qdiff[2])
line2inv = 0
line2inv = False
# with open(args.qdiff) as qfile:
# for line in qfile:
# cleanline = line.strip('\n')
# qdiff = cleanline.split('\t')
# if int(qdiff[4]) < 0: # negative inversions have earlier genome position as END position. fixing
# vardict = dict(Start=int(qdiff[3]), End=int(qdiff[2]), Length=int(qdiff[4]))
# else:
# vardict = dict(Start=int(qdiff[2]), End=int(qdiff[3]), Length=int(qdiff[4]))
# if qdiff[1] == 'INV' and abs(vardict['Length']) > 2:
# inversions.append(vardict)
# print(vardict)
# if qdiff[1] == 'JMP':
# translocations.append(vardict)
# if qdiff[1] == 'BRK':
# breaks.append(vardict)
qrecord = SeqIO.parse(args.qfasta, 'fasta').next() # read in the fasta file, should read only the first contig
qseq = qrecord.seq # get the sequence of the querry genome
fixedrecord = qrecord
......@@ -78,6 +69,7 @@ def main():
fixedinv = qseq[invdict['Start']:invdict['End']].reverse_complement()
after = qseq[invdict['End']:]
qseq = before + fixedinv + after
# next fix translocations
# write out temp fasta
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment