Commit 0c419679 authored by Mathieu Courcelles's avatar Mathieu Courcelles

Alpha version of FASTA database import.

parent 1f9f25e2
......@@ -15,3 +15,9 @@ CLMSpipeline/settings/local.py
CLMSpipeline/settings/production.py
CLMSpipeline/settings/staging.py
CLMSpipeline/settings/test.py
CLMSpipeline_20131015.db
testdata.json
build
site_media
site_media/FastaDB
site_media/RawDataset
......@@ -17,7 +17,7 @@ from django.dispatch import receiver
# Import project libraries
from .models import CLPeptideFilterParam, RawDataset, ProcessedDataset,\
CrossLinker, FastaDB, Instrument, CLPeptide,\
CrossLinker, FastaDB, FastaDb_Sequence, Instrument, CLPeptide,\
SearchAlgorithm, Project, CLPeptideFilter
from .parser.parser_generic import parser_generic
from .DatasetProcessing import DatasetProcessing
......@@ -257,7 +257,7 @@ class RawDatasetAdmin(admin.ModelAdmin):
"""
if instance.parsing_status == False and created == False:
instance.parsing_log = parser_generic.parseResults(instance)
parser_generic.parseResults(instance)
......@@ -395,11 +395,78 @@ class ProcessedDatasetAdmin(admin.ModelAdmin):
class FastaDBAdmin(admin.ModelAdmin):
    """
    Admin panel to upload FASTA database.
    """

    date_hierarchy = 'creation_date'
    list_display = ('pk', 'name', 'file', 'sequence_count', 'parsing_status', 'creation_date')
    list_filter = ('parsing_status',)
    readonly_fields = ('parsing_log', 'parsing_status', 'sequence_count')
    search_fields = ('name', 'file')

    def get_readonly_fields(self, request, obj=None):
        """
        This method sets some fields read only after FASTA file upload.

        Returns the class-level read-only fields, plus 'file' when an
        existing object is being edited (obj is not None).
        """
        readonly_fields = list(self.readonly_fields)
        if obj is not None:
            readonly_fields.append('file')
        return readonly_fields

    @staticmethod
    @receiver(post_save, sender=FastaDB)
    def process_file(sender, instance, created, **kwargs):
        """
        This method initiates the parsing of uploaded FASTA file.

        Parsing is only started on a save that did not create the object
        and while parsing_status is still False.
        """
        if created or instance.parsing_status:
            return

        # The parser stores every record it reads, so running it again on
        # a later save (parsing_status stays False when a regexp did not
        # match) would insert all sequences a second time.
        if instance.fastadb_sequence_set.exists():
            return

        parser_generic.parseFASTA(instance)

    def save_model(self, request, obj, form, change):
        """
        Re-apply the regexps when the 'update' box is checked, then save.

        The flag is reset before saving so it does not stay checked.
        """
        if obj.update:
            obj.refreshParsing()
            obj.update = False

        super(FastaDBAdmin, self).save_model(request, obj, form, change)
class FastaDb_SequenceAdmin(admin.ModelAdmin):
    """
    Admin panel listing the sequences imported from a FASTA database.
    """

    # Columns of the change list.
    list_display = (
        'pk',
        'fastadb',
        'identifier',
        'gene_name',
        'description',
        'species',
    )

    # Sidebar filters.
    list_filter = (
        'fastadb',
        'species',
    )

    # Fields covered by the search box.
    search_fields = (
        'identifier',
        'gene_name',
        'description',
        'species',
    )
## Register admin panels
admin.site.register(RawDataset, RawDatasetAdmin)
admin.site.register(ProcessedDataset, ProcessedDatasetAdmin)
admin.site.register(CrossLinker)
admin.site.register(FastaDB)
admin.site.register(FastaDB, FastaDBAdmin)
admin.site.register(FastaDb_Sequence, FastaDb_SequenceAdmin)
admin.site.register(Instrument)
admin.site.register(CLPeptide, CLPeptideAdmin)
admin.site.register(SearchAlgorithm)
......
......@@ -57,7 +57,7 @@ def compareRunIds_csv(self, request, queryset, tableName):
uniquePep_dict[unique_key][dataset_run_name] = True
print uniquePep_dict
#print uniquePep_dict
# Write header
writer.writerow(header_list.keys())
......
......@@ -7,6 +7,7 @@ Data model module for the CLMSpipeline_app.
# Import standard libraries
import re
# Import Django related libraries
from django.contrib.contenttypes.models import ContentType
......@@ -25,7 +26,8 @@ def upload_path_handler(instance, filename):
"""
Path handler for dataset file upload.
"""
return "dataset/{id}-{filename}".format(id=instance.pk, filename=filename)
return "{classname}/{id}-{filename}".format(classname=instance.__class__.__name__,
id=instance.pk, filename=filename)
......@@ -73,16 +75,126 @@ class CrossLinker(models.Model):
class FastaDB(models.Model):
    """
    This class holds the name of all the FASTA database
    and parsing configuration.
    """

    creation_date = models.DateTimeField(auto_now_add=True)
    name = models.CharField(max_length=200, unique=True)
    file = models.FileField(upload_to=upload_path_handler,
                            help_text='Select FASTA file.')

    # Regular expressions used by FastaDb_Sequence.extractFeatures to pull
    # each feature out of a sequence's raw description. Only the identifier
    # regexp is mandatory; a blank regexp leaves the feature untouched.
    identifier_regexp = models.CharField(max_length=50)
    gene_name_regexp = models.CharField(max_length=50, blank=True)
    description_regexp = models.CharField(max_length=50, blank=True)
    species_regexp = models.CharField(max_length=50, blank=True)

    # Outcome of the last parsing / feature extraction.
    parsing_status = models.BooleanField(default=False)
    parsing_log = models.CharField(max_length=1000, blank=True, null=True)
    sequence_count = models.IntegerField(default=0)

    update = models.BooleanField(help_text='Trigger update for fields regexp. Will not stay checked after save.',
                                 default=True)

    def refreshParsing(self):
        """
        Update features with new regexp.

        Re-runs feature extraction on every sequence of this database,
        saves each sequence, then records the outcome in parsing_log and
        parsing_status and saves this object. Does nothing for an object
        that has no primary key yet.
        """
        if not self.pk:
            return

        # Collect each feature that failed for at least one sequence. Only
        # distinct feature names are kept, so the log stays short no matter
        # how many sequences there are.
        failed_features = []
        for fs in self.fastadb_sequence_set.all():
            for feature in fs.extractFeatures(self).split(','):
                if feature and feature not in failed_features:
                    failed_features.append(feature)
            fs.save()

        if failed_features:
            self.parsing_log = 'Error in: ' + ','.join(failed_features)
            self.parsing_status = False
        else:
            self.parsing_log = 'Ok'
            self.parsing_status = True

        self.save()

    def __unicode__(self):
        return self.name

    class Meta:
        ordering = ['-creation_date']
class FastaDb_Sequence(models.Model):
    """
    This class holds FASTA database protein sequences
    and extracted features.
    """

    fastadb = models.ForeignKey(FastaDB)
    identifier = models.CharField(max_length=50, blank=True)
    gene_name = models.CharField(max_length=50, blank=True)
    raw_description = models.CharField(max_length=250)
    description = models.CharField(max_length=250, blank=True)
    species = models.CharField(max_length=100, blank=True)
    sequence = models.TextField()

    def extractFeatures(self, fastadb_instance):
        """
        Applies regexp to extract features from description.

        For each feature, the '<feature>_regexp' attribute of
        fastadb_instance is searched in raw_description and the first
        capture group is stored on this object. The object is not saved.

        Returns a comma-separated string of the features whose regexp
        could not be applied (no match, invalid pattern, or no usable
        first capture group); the string is empty when all succeeded.
        Features with a blank regexp are skipped and never reported.
        """
        features = ['identifier',
                    'gene_name',
                    'description',
                    'species'
                    ]

        no_match = []

        for feature in features:
            regexp = getattr(fastadb_instance, feature + '_regexp')
            if not regexp:
                continue

            try:
                matchObj = re.search(regexp, self.raw_description)
                value = matchObj.group(1) if matchObj else None
            except (re.error, IndexError):
                # re.error: the pattern does not compile.
                # IndexError: the pattern has no capture group 1.
                value = None

            if value is None:
                no_match.append(feature)
            else:
                setattr(self, feature, value)

        return ','.join(no_match)

    def __unicode__(self):
        return self.identifier
......@@ -120,11 +232,16 @@ class Project(AdminURLMixin, models.Model):
This class groups datasets.
"""
creation_date = models.DateTimeField(auto_now_add=True)
name = models.CharField(max_length=250, unique=True)
def __unicode__(self):
return ('[%s] %s') % (self.pk, self.name)
class Meta:
ordering = ['-creation_date']
......@@ -210,33 +327,37 @@ class RawDataset(Dataset):
_UNSAVED_FILEFIELD = 'unsaved_filefield'
_UNSAVED_FILEFIELD_EXTRA = 'unsaved_filefield_extra'
@receiver(pre_save, sender = RawDataset)
@receiver(pre_save)
def skip_saving_file(sender, instance, **kwargs):
"""
Sets the uploaded file(s) aside until the object has been saved once.

For an instance without a primary key, the file fields are stashed on
the instance under the _UNSAVED_FILEFIELD / _UNSAVED_FILEFIELD_EXTRA
attribute names and reset to None, so the first save happens without them.
"""
# NOTE(review): this listing appears to interleave the removed and the
# added variant of the decorator and of the body - confirm which lines
# are current before editing.
if not instance.pk and not hasattr(instance, _UNSAVED_FILEFIELD):
setattr(instance, _UNSAVED_FILEFIELD, instance.file)
instance.file = None
if sender in [RawDataset, FastaDB]:
if not instance.pk and not hasattr(instance, _UNSAVED_FILEFIELD_EXTRA) \
and instance.extra_file:
setattr(instance, _UNSAVED_FILEFIELD_EXTRA, instance.extra_file)
instance.extra_file = None
if not instance.pk and not hasattr(instance, _UNSAVED_FILEFIELD):
setattr(instance, _UNSAVED_FILEFIELD, instance.file)
instance.file = None
# extra_file is only looked at when the instance actually has that attribute.
if not instance.pk and not hasattr(instance, _UNSAVED_FILEFIELD_EXTRA) \
and hasattr(instance, 'extra_file') and instance.extra_file:
setattr(instance, _UNSAVED_FILEFIELD_EXTRA, instance.extra_file)
instance.extra_file = None
@receiver(post_save, sender = RawDataset)
@receiver(post_save)
def save_file(sender, instance, created, **kwargs):
"""
Saves the files now that the dataset object has been saved.

On creation, the file(s) stashed under the _UNSAVED_FILEFIELD /
_UNSAVED_FILEFIELD_EXTRA attribute names are put back on the instance,
which is then saved again.
"""
# NOTE(review): this listing appears to interleave the removed and the
# added variant of the decorator and of the body - confirm which lines
# are current before editing.
if created and hasattr(instance, _UNSAVED_FILEFIELD):
instance.file = getattr(instance, _UNSAVED_FILEFIELD)
if created and hasattr(instance, _UNSAVED_FILEFIELD_EXTRA):
instance.extra_file = getattr(instance, _UNSAVED_FILEFIELD_EXTRA)
instance.save()
if sender in [RawDataset, FastaDB]:
if created and hasattr(instance, _UNSAVED_FILEFIELD):
instance.file = getattr(instance, _UNSAVED_FILEFIELD)
if created and hasattr(instance, _UNSAVED_FILEFIELD_EXTRA):
instance.extra_file = getattr(instance, _UNSAVED_FILEFIELD_EXTRA)
instance.save()
@receiver(pre_delete, sender = RawDataset)
......
# # Copyright 2013 Mathieu Courcelles
# # Mike Tyers's lab / IRIC / Universite de Montreal
# Import standard libraries
# Import Django related libraries
# Import project libraries
from ..models import FastaDB
from ..models import FastaDb_Sequence
from parser_generic import InvalidFileFormatException
# Import third-party libraries
from Bio import SeqIO
class FastaParser:
    """
    Parse FASTA file and stores entries in database.
    """

    @staticmethod
    def parseFASTA(fastadb_instance):
        """
        Parse FASTA file and stores entries in database.

        Every record of the uploaded file is stored as a FastaDb_Sequence
        with its features extracted. The parsing log, parsing status and
        sequence count of the FastaDB row are then written with a queryset
        update; fastadb_instance itself is not modified or saved.
        """
        # Distinct features that could not be extracted for at least one
        # record; failures of every record count, not only the last one.
        failed_features = []
        sequence_count = 0

        # "with" closes the file even when a record fails to be stored.
        with open(fastadb_instance.file.path, "rU") as handle:

            # Iterate through sequences
            for record in SeqIO.parse(handle, "fasta"):
                fs_instance = FastaDb_Sequence(fastadb=fastadb_instance,
                                               raw_description=record.description,
                                               sequence=str(record.seq))

                for feature in fs_instance.extractFeatures(fastadb_instance).split(','):
                    if feature and feature not in failed_features:
                        failed_features.append(feature)

                fs_instance.save()
                sequence_count += 1

        parsing_log = ','.join(failed_features)

        if parsing_log == '' and sequence_count != 0:
            FastaDB.objects.filter(pk=fastadb_instance.id).update(
                parsing_log='Ok',
                parsing_status=True,
                sequence_count=sequence_count
            )
        else:
            if sequence_count == 0:
                parsing_log += ' No protein sequence found!'

            FastaDB.objects.filter(pk=fastadb_instance.id).update(
                parsing_log='Error in: ' + parsing_log,
                parsing_status=False,
                sequence_count=sequence_count
            )
\ No newline at end of file
......@@ -11,6 +11,7 @@
# Import project libraries
class parser_generic:
"""
Parser generic module.
......@@ -32,6 +33,11 @@ class parser_generic:
from pLinkParser import pLinkParser as pLinkParser
pLinkParser.parseResults(instance)
@staticmethod
def parseFASTA(instance):
    """
    Hand the given instance over to FastaParser.parseFASTA.
    """
    # Imported inside the method, like the other parser imports of this
    # class (presumably to avoid a circular import, as the FastaParser
    # module itself imports from parser_generic - confirm).
    from FastaParser import FastaParser as fasta_parser
    fasta_parser.parseFASTA(instance)
......
......@@ -113,7 +113,7 @@ class ModelTestCase(TestCase):
d1 = RawDataset.objects.all()[0]
self.assertEqual(upload_path_handler(d1, d1.file), 'dataset/%s-%s' % (d1.pk, d1.file))
self.assertEqual(upload_path_handler(d1, d1.file), 'RawDataset/%s-%s' % (d1.pk, d1.file))
self.assertEqual(d1.get_admin_url(), '/admin/CLMSpipeline_app/rawdataset/%s/' % d1.pk)
self.assertEqual(d1.formated_url(), '<a href="/admin/CLMSpipeline_app/rawdataset/%s/">%s</a><br />' % (d1.pk, d1))
self.assertEqual(d1.formated_url_short(), '<a href="/admin/CLMSpipeline_app/rawdataset/%s/">[%s]</a><br />' % (d1.pk, d1.pk))
......
Install these:
-Python 2.7 (http://www.python.org/getit/)
-MySQL-python-1.2.4.win-amd64-py2.7.exe (http://www.lfd.uci.edu/~gohlke/pythonlibs/)
-biopython-1.62.win-amd64-py2.7.exe (http://www.lfd.uci.edu/~gohlke/pythonlibs/)
Python requirements can be installed this way:
pip install -r requirements/local.txt
......
......@@ -2,4 +2,5 @@ Django==1.5.1
MySQL-python==1.2.4
django-grappelli==2.4.5
South==0.7.5
Unipath==1.0
\ No newline at end of file
Unipath==1.0
biopython==1.62
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment