...
 
Commits (2)
......@@ -57,13 +57,14 @@ fi
# call dnadiff to find location of inversions, in terms of querry and reference position
isolatenpath=${query%.fasta} # get isolate ID by stripping suffix
isolate=${isolatenpath##*/} # and stripping path
dnadiff -p $isolate $reference $query
../../inversionfix.py -r $isolate.rdiff -q $isolate.qdiff -v $isolate-temp.vcf -f $query -t $isolate-temp.fasta
#dnadiff -p $isolate $reference $query
#../../inversionfix.py -r $isolate.rdiff -q $isolate.qdiff -v $isolate-temp.vcf -f $query -t $isolate-temp.fasta
dnadiff -p $tmpdir/$isolate $reference $query
../../inversionfix.py -r $tmpdir/$isolate.rdiff -q $tmpdir/$isolate.qdiff -v $tmpdir/$isolate-temp.vcf -f $query -t $tmpdir/$isolate-temp.fasta
# inversionfix.py will need to be made an exacutable in the path eventually
# call $isolate-temp.fasta instead of $query from now on below
read_fasta "$refdir" $reference
read_fasta "$qrydir" $isolate-temp.fasta
read_fasta "$qrydir" $tmpdir/$isolate-temp.fasta
# If we have one reference and one query, don't require matching identifiers
if [ $(ls "$refdir" | wc -l) = 1 ] && [ $(ls "$qrydir" | wc -l) = 1 ]
......@@ -88,6 +89,6 @@ cat <<EOF
EOF
diff -irdb -U1 "$refdir" "$qrydir" \
| udiff2vcf >> $isolate-temp.vcf # append output of diff to temp.vcf file, which has the inversion calls
| udiff2vcf >> $tmpdir/$isolate-temp.vcf # append output of diff to temp.vcf file, which has the inversion calls
sort -k2 -n $isolate-temp.vcf # sort by position, and output to stdout
sort -k2 -n $tmpdir/$isolate-temp.vcf # sort by position, and output to stdout
......@@ -17,37 +17,12 @@ def main():
parser.add_argument('-t', '--tfasta', help='temp fasta file', default='out-temp.fasta')
args = parser.parse_args()
#chromname = 'snps_indels|NC_001416.1|'
chromname = '1'
# parse args.rdiff and make args.vcf with just the inversions, translocations, etc. in vcf format (no header)
# which biodiff will eventually combine with its own vcf from the other differences it finds
vcflines = []
with open(args.rdiff) as rfile:
line2inv = False # dnadiff records inversions in 2 lines. first occurrence is start, 2nd is end.
invstart = 0
for line in rfile:
cleanline = line.strip('\n')
rdiff = cleanline.split('\t')
# rdiff[2] is genome position, rdiff[1] is variation type (GAP, INV, etc)
if rdiff[1] == 'INV' and not line2inv:
vcfstring = chromname + '\t' + rdiff[2] + '\t.\tN\tINV\t.\tPASS\t'
line2inv = True
invstart = int(rdiff[2])
elif rdiff[1] == 'INV':
vcfstring += 'SVLEN=' + str(int(rdiff[2])-invstart) + ';SVTYPE=INV\n'
vcflines.append(vcfstring)
line2inv = False
with open(args.vcf, 'w') as vfile:
for line in vcflines:
vfile.write(line)
# parse args.qdiff to get inversions in querry coordinates
# parse args.qfasta to get fasta of querry sequence,
# Find inversions in terms of Query position, and output temp.fasta with those inversions undone
# parse args.qdiff to get inversions in query coordinates
# parse args.qfasta to get fasta of query sequence,
# undo the inversions found in args.qdiff
# output noninverted fasta to args.tfasta, which biodiff will then use
inversions = [] # stores locations of inversions in querry sequence
translocations = []
breaks = []
inversions = [] # stores locations of inversions in query sequence
with open(args.qdiff) as qfile:
line2inv = False # dnadiff records inversions in 2 lines. first occurrence is start, 2nd is end.
for line in qfile:
......@@ -70,12 +45,34 @@ def main():
after = qseq[invdict['End']:]
qseq = before + fixedinv + after
# next fix translocations
# write out temp fasta
fixedrecord.seq = qseq
SeqIO.write(fixedrecord, args.tfasta, 'fasta')
# Find inversions in terms of Reference position, and output them in VCF format
chromname = qrecord.id
# parse args.rdiff and make args.vcf with just the inversions, translocations, etc. in vcf format (no header)
# which biodiff will eventually combine with its own vcf from the other differences it finds
vcflines = []
with open(args.rdiff) as rfile:
line2inv = False # dnadiff records inversions in 2 lines. first occurrence is start, 2nd is end.
invstart = 0
for line in rfile:
cleanline = line.strip('\n')
rdiff = cleanline.split('\t')
# rdiff[2] is genome position, rdiff[1] is variation type (GAP, INV, etc)
if rdiff[1] == 'INV' and not line2inv:
vcfstring = chromname + '\t' + rdiff[2] + '\t.\tN\tINV\t.\tPASS\t'
line2inv = True
invstart = int(rdiff[2])
elif rdiff[1] == 'INV':
vcfstring += 'SVLEN=' + str(int(rdiff[2]) - invstart) + ';SVTYPE=INV\n'
vcflines.append(vcfstring)
line2inv = False
with open(args.vcf, 'w') as vfile:
for line in vcflines:
vfile.write(line)
if __name__ == '__main__':
main()