KojakParser.py 10.2 KB
Newer Older
1 2
# Copyright 2013-2017 Mathieu Courcelles
# Mike Tyers's lab / IRIC / Universite de Montreal
3 4


5
# Import standard libraries
6
import csv
7
import logging
8 9 10 11 12 13 14
import re

# Import Django related libraries

# Import project libraries
from ..models import CLPeptide
from ..models import RawDataset
15 16
from .exception import InvalidFileFormatException
from .sequences import sequencesMatcher
17

18 19
kojak_version = {
    '1.0-1.5': 'Scan Number\tObs Mass\tCharge\tPSM Mass\tPPM Error\tScore\tdScore\tPep. Diff.\tPeptide #1\tLink #1\tProtein #1\tPeptide #2\tLink #2\tProtein #2\tLinker Mass',
20
    '1.6-dev': 'Scan Number\tRet Time\tObs Mass\tCharge\tPSM Mass\tPPM Error\tScore\tdScore\tPep. Diff.\tPeptide #1\tLink #1\tProtein #1\tPeptide #2\tLink #2\tProtein #2\tLinker Mass',
21 22 23
    '1.6': 'Scan Number\tRet Time\tObs Mass\tCharge\tPSM Mass\tPPM Error\tScore\tdScore\tPeptide #1 Score\tPeptide #1\tLinked AA #1\tProtein #1\tProtein #1 Site\tPeptide #2 Score\tPeptide #2\tLinked AA #2\tProtein #2\tProtein #2 Site\tLinker Mass'
}

24 25 26 27 28

class KojakParser:
    """
    Parser for Kojak results.
    """
29

30 31 32 33 34 35
    @staticmethod
    def parseResults(dataset):
        """
        This method parses Kojak results file provided by the dataset.
        Sucess or failure of this procedure is noted in the dataset object.
        """
36

37 38
        # Prepare sequenceMatcher
        sM = sequencesMatcher(dataset.fasta_db.pk)
39

40
        try:
41
            with open(dataset.file.path, 'r') as f:
42

43 44 45
                # Set run name with file name
                run_name = dataset.file.path
                run_name = re.sub('(^.+[/\\\]\d+-)', '', run_name)
46
                run_name = run_name.replace('.kojak.txt', '')
47 48 49

                # Check header to validate file format
                line = f.readline().rstrip('\n\r')
50

51 52 53
                # Is this file merged?
                if line.startswith('run_name='):
                    run_name = line.split('=')[1]
54 55 56
                    line = f.readline().rstrip('\n\r')

                    # Check header to validate file format
Mathieu Courcelles's avatar
Mathieu Courcelles committed
57
                header_template = 'Kojak version'
58 59

                if not line.startswith(header_template):
60
                    raise InvalidFileFormatException(
61 62 63
                        'Uploaded extra file is not recognized as a '
                        'Kojak results file (bad file header row 1)')

64 65
                # Create CSV reader
                reader = csv.DictReader(f, delimiter='\t')
66

67 68
                header_file = '\t'.join(name for name in reader.fieldnames)

69
                version = None
70

71
                for item_version, header in kojak_version.items():
72

73 74 75 76 77 78 79
                    if header == header_file:
                        version = item_version
                        break
                else:
                    raise InvalidFileFormatException(
                        'Uploaded extra file is not recognized as a Kojak '
                        'results file (bad file header row)')
80 81 82

                protpos_pattern = re.compile('(^.+)\((\d+)\)')

83
                pep_pos_dict = dict()
84

85
                clpep_list = []
86 87 88 89 90 91 92 93
                through_list = []

                # import time
                # start_time = time.time()

                # next_id = CLPeptide.objects.latest('id').id + 1


94
                # Access the through model directly
95 96 97 98
                # ThroughModel = CLPeptide.dataset.through


                # Iterate through lines and create clPeptide
99
                for row in reader:
100

101 102 103 104
                    # Is this file merged?
                    if row['Scan Number'].startswith('run_name='):
                        run_name = row['Scan Number'].split('=')[1]
                        continue
105

106 107
                    if row['Scan Number'].startswith('Scan') or row['Scan Number'].startswith('Kojak'):
                        continue
108

109 110 111
                    # Discard scan without match
                    if row['Charge'] == '0':
                        continue
112

113 114
                    # Format each field to the right format for the data model
                    fields = dict()
115

116 117
                    fields['run_name'] = run_name
                    fields['scan_number'] = row['Scan Number']
118
                    charge = float(row['Charge'])
119
                    fields['precursor_mz'] = '%4f' % ((float(row['Obs Mass']) + charge * 1.00727646677) / charge)
120 121 122 123 124 125 126 127
                    fields['precursor_charge'] = row['Charge']
                    fields['precursor_intensity'] = -1
                    fields['rank'] = -1
                    fields['match_score'] = row['Score']
                    fields['spectrum_intensity_coverage'] = -1
                    fields['total_fragment_matches'] = -1
                    fields['delta'] = row['dScore']
                    fields['error'] = row['PPM Error']
128

129
                    row['Peptide #1'] = row['Peptide #1'].replace('-15N', '')
130 131
                    fields['peptide1'] = row['Peptide #1']
                    fields['peptide_wo_mod1'] = re.sub('(\[.+?\])', '', row['Peptide #1'])
132

133
                    fields['peptide_position1'] = -1
134

135
                    row['Peptide #2'] = row['Peptide #2'].replace('-15N', '')
136 137 138
                    fields['peptide2'] = row['Peptide #2']
                    fields['peptide_wo_mod2'] = re.sub('(\[.+?\])', '', row['Peptide #2'])
                    fields['peptide_position2'] = -1
139

140 141 142 143 144 145 146 147 148 149 150
                    if version == '1.6':
                        fields['pep1_link_pos'] = int(row['Linked AA #1'])
                        fields['pep2_link_pos'] = int(row['Linked AA #2'])

                        fields['n_score_a'] = float(row['Peptide #1 Score'])
                        fields['n_score_b'] = float(row['Peptide #2 Score'])

                    else:
                        fields['pep1_link_pos'] = int(row['Link #1'])
                        fields['pep2_link_pos'] = int(row['Link #2'])

151 152 153
                    if 'Ret Time' in row:
                        fields['retention_time'] = float(row['Ret Time']) * 60

154 155 156
                    fields['display_protein1'] = row['Protein #1'].split(';')[0]
                    fields['display_protein1'] = re.sub('(^>)', '', fields['display_protein1'])
                    result = protpos_pattern.match(fields['display_protein1'])
157

158 159
                    if result:
                        fields['display_protein1'] = result.group(1)
Mathieu Courcelles's avatar
Mathieu Courcelles committed
160
                        fields['peptide_position1'] = int(result.group(2)) - int(fields['pep1_link_pos']) + 1
161

162 163 164
                    fields['display_protein2'] = row['Protein #2'].split(';')[0]
                    fields['display_protein2'] = re.sub('(^>)', '', fields['display_protein2'])
                    result = protpos_pattern.match(fields['display_protein2'])
165

166 167
                    if result:
                        fields['display_protein2'] = result.group(1)
168 169
                        fields['peptide_position2'] = int(result.group(2)) - int(fields['pep2_link_pos']) + 1

170

171 172 173 174 175 176 177 178 179 180 181

                    fields['autovalidated'] = False
                    fields['validated'] = ''
                    fields['rejected'] = False
                    fields['notes'] = ''
                    fields['not_decoy'] = True
                    if 'decoy' in row['Protein #1'] or 'decoy' in row['Protein #2']:
                        fields['not_decoy'] = False

                    # Create the CLPeptide object
                    clpep = CLPeptide(**fields)
182

183 184 185
                    # Match protein sequences
                    clpep.fs_prot1_id = sM.sequencePk(clpep.display_protein1)
                    clpep.fs_prot2_id = sM.sequencePk(clpep.display_protein2)
186

187 188
                    # Find peptide position for linear peptide
                    if fields['peptide_position1'] == -1 and clpep.fs_prot1_id is not None:
189

190
                        key = f'{fields["peptide_wo_mod1"]}-{clpep.fs_prot1_id.pk}'
191

192 193
                        if key in pep_pos_dict:
                            clpep.peptide_position1 = pep_pos_dict[key]
194

195 196
                        else:
                            clpep.peptide_position1 = clpep.fs_prot1_id.sequence.find(fields['peptide_wo_mod1']) + 1
197
                            pep_pos_dict[key] = clpep.peptide_position1
198

199
                        # Check DSS dead end - this could be improved
200 201
                        pepseq = re.sub('(K\[.+?\])', 'k', row['Peptide #1'])
                        pepseq = re.sub('(\[.+?\])', '', pepseq)
202

203
                        pos = pepseq.find('k')
204

205 206
                        if pos > -1:
                            clpep.pep1_link_pos = pos + 1
207

208 209
                    if fields['peptide_position2'] == -1 and clpep.fs_prot2_id is not None:

210
                        key = f'{fields["peptide_wo_mod2"]}-{clpep.fs_prot2_id.pk}'
211 212 213 214 215 216 217 218

                        if key in pep_pos_dict:
                            clpep.peptide_position2 = pep_pos_dict[key]

                        else:
                            clpep.peptide_position2 = clpep.fs_prot2_id.sequence.find(fields['peptide_wo_mod2']) + 1
                            pep_pos_dict[key] = clpep.peptide_position2

219 220
                    if clpep.peptide_position2 == -1 and fields['pep2_link_pos'] != -1:
                        clpep.peptide_position2 = clpep.peptide_position1
221

222
                    clpep.guessLinkType()
223
                    # clpep.id = next_id
224 225 226 227

                    # Save object to db
                    clpep.save()
                    clpep.dataset.add(dataset.pk)
228

229 230 231 232
                    # clpep_list.append(clpep)
                    # through_list.append(ThroughModel(clpeptide_id= next_id, dataset_id=dataset.pk))
                    # next_id += 1

233
                # Save object to db
234 235 236
                # CLPeptide.objects.bulk_create(clpep_list, batch_size=999)
                # clpep.save()

237
                # Link object to dataset
238 239
                # ThroughModel.objects.bulk_create(through_list, batch_size=999)

240
                # Append filter string to dataset description
241 242
                RawDataset.objects.filter(pk=dataset.id).update(
                    parsing_log='Ok', parsing_status=True)
243

244
        except (InvalidFileFormatException, ValueError) as e:
245
            RawDataset.objects.filter(pk=dataset.id).update(
246 247 248 249 250 251
                parsing_status=False, parsing_log='Error: ' + e.value)

            msg = 'Error while parsing Kojak results.'

            logger = logging.getLogger('django')
            logger.error(msg, exc_info=True)