## Copyright 2013-2017 Mathieu Courcelles
## Mike Tyers's lab / IRIC / Universite de Montreal 

# Import standard libraries
import subprocess
import os
import tempfile

# Import Django-related libraries
from django.conf import settings
from django.contrib import messages
from django.db import transaction
from django.db.models import Q
from django.utils.safestring import mark_safe


# Import project libraries
from .models import (CLPeptide,
                     ProcessedDataset,
                     QuantificationFC,
                     )
from .queryset_operation import (dataset_set_2_clpeptide_set,
                                 )

import CLMSVault.CLMSVault_app.export as export


class DatasetProcessing:
    """
    This class process datasets of cross-linked peptides.
    It groups multiple datasets in a single one and filter
    cross-linked peptides.
    """
    
    @staticmethod
    @transaction.atomic
    def process(instance):
        """
        Groups multiple datasets in a single one and filter
        cross-linked peptides.
        """
        
        datasets = instance.datasets.all()

        # Transfer extra info
        fields = {
            'cross_linker': '',
            'instrument_name': '',
            'fasta_db': '',
            'search_algorithm': '',
        }
        dataset_description = ''

        
        # Iterate over the selected datasets to gather information
        for dataset in datasets:
            
            dataset_description += dataset.description
            
            # Check extra info homogeneity: fields that differ across
            # datasets are blanked out below
            for field in fields:
                if fields[field] == '':
                    fields[field] = getattr(dataset, field)
                elif fields[field] != getattr(dataset, field):
                    fields[field] = None
                    
        # Update extra info in database
        fields['description'] = instance.description + dataset_description
        ProcessedDataset.objects.filter(pk=instance.id).update(**fields)
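        # Note: QuerySet.update() writes straight to the database and bypasses
        # Model.save() and pre/post_save signals.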
        # Gather the CLPeptides from the selected datasets
        datasets_list = list(datasets.values_list('pk', flat=True))
        clpeptide_set = CLPeptide.objects.filter(dataset__in=datasets_list)
        
        # Filter for quantification
        if (instance.clpeptidefilter is not None and
                instance.clpeptidefilter.quantification_experiment is not None):
            qfcs = QuantificationFC.objects.filter(
                quantification_id=instance.clpeptidefilter.quantification_experiment)
            fc = dict()

            if instance.clpeptidefilter.quantification_value_1 is not None:
                lookup = '__'.join(['fold_change',
                                    instance.clpeptidefilter.quantification_operator_1])
                fc[lookup] = instance.clpeptidefilter.quantification_value_1

            if instance.clpeptidefilter.quantification_value_2 is not None:
                lookup = '__'.join(['fold_change',
                                    instance.clpeptidefilter.quantification_operator_2])
                fc[lookup] = instance.clpeptidefilter.quantification_value_2

            if len(fc) == 1:
                qfcs = qfcs.filter(**fc)

            if len(fc) == 2:
                # list() is required here: dict.keys() is not indexable in Python 3
                keys = list(fc.keys())

                if instance.clpeptidefilter.quantification_logic == '|':
                    qfcs = qfcs.filter(Q(**{keys[0]: fc[keys[0]]}) | Q(**{keys[1]: fc[keys[1]]}))
                elif instance.clpeptidefilter.quantification_logic == '&':
                    qfcs = qfcs.filter(Q(**{keys[0]: fc[keys[0]]}) & Q(**{keys[1]: fc[keys[1]]}))

            quant_clpeptide_pk_list = list(qfcs.values_list('clpeptide', flat=True))
            clpeptide_set = clpeptide_set.filter(pk__in=quant_clpeptide_pk_list)
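
            # For illustration (hypothetical values): operator_1='gte',
            # value_1=1.5 and operator_2='lte', value_2=0.5 give
            # fc == {'fold_change__gte': 1.5, 'fold_change__lte': 0.5};
            # with '|' logic this keeps fold changes outside the 0.5-1.5 band.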

        
        
        # Apply/prepare filter/exclude to CLPeptides
        fp_msrun = dict()
        unique_msrun_pep = dict()
        unique_key = dict()
        
        if instance.clpeptidefilter:
            for fe in instance.clpeptidefilter.clpeptidefilterparam_set.all():

                d = dict()
                d['__'.join([fe.field, fe.field_lookup])] = fe.value

                if fe.method == 'filter':
                    clpeptide_set = clpeptide_set.filter(**d)
                else:
                    clpeptide_set = clpeptide_set.exclude(**d)
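
            # e.g. a param with field='match_score', field_lookup='gte' and
            # value=2 builds {'match_score__gte': 2} (values illustrative).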
        
        
            # Determine the false-positive score cutoff per MS run and for the whole dataset
            if instance.clpeptidefilter.fp_cutoff is not None:

                # Per-run [count, decoy_count, min_score], plus a 'dataset'-wide entry
                fp_msrun['dataset'] = [0.0, 0.0, 100000]

                for clpep in clpeptide_set:

                    clpep_count = 0.0
                    clpep_decoy_count = 0.0
                    min_score = 100000

                    if clpep.run_name in fp_msrun:
                        clpep_count, clpep_decoy_count, min_score = fp_msrun[clpep.run_name]

                    clpep_count += 1
                    fp_msrun['dataset'][0] += 1

                    if not clpep.not_decoy:
                        clpep_decoy_count += 1
                        fp_msrun['dataset'][1] += 1

                    if (clpep_decoy_count / clpep_count) <= instance.clpeptidefilter.fp_cutoff:
                        min_score = clpep.match_score

                    if (fp_msrun['dataset'][1] / fp_msrun['dataset'][0]) <= instance.clpeptidefilter.fp_cutoff:
                        fp_msrun['dataset'][2] = clpep.match_score

                    # Store values for the next iteration
                    fp_msrun[clpep.run_name] = [clpep_count, clpep_decoy_count, min_score]
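
                    # NOTE: this running-FDR sweep assumes clpeptide_set
                    # iterates in descending match_score order (presumably
                    # the model's default ordering); the cutoffs are not
                    # meaningful otherwise.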
            # Unique peptide filter
            if instance.clpeptidefilter.unique_key is not None:

                for key in instance.clpeptidefilter.unique_key.split('-'):
                    unique_key[key] = True
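
                # e.g. a unique_key of 'pep1-pep2' (hypothetical field names)
                # becomes the dict {'pep1': True, 'pep2': True}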
        
        
        # Add peptides to the new dataset
        for clpep in clpeptide_set:
            # Apply filter
            if instance.clpeptidefilter is not None:
                # K-K link filter: keep only lysine-lysine cross-links
                if instance.clpeptidefilter.KK_link:

                    if clpep.pep1_link_pos == 1:
                        continue

                    if clpep.pep2_link_pos == 1:
                        continue

                    aa1 = clpep.peptide_wo_mod1[clpep.pep1_link_pos - 1]

                    if aa1 != 'K':
                        continue

                    aa2 = clpep.peptide_wo_mod2[clpep.pep2_link_pos - 1]

                    if aa2 != 'K':
                        continue
            
                # Skip decoy if requested
                if instance.clpeptidefilter.remove_decoy and not clpep.not_decoy:
                    continue
            
                # Test peptide uniqueness
                msrun = clpep.run_name
                key = ''
                
                if instance.clpeptidefilter.unique_key is not None:
                    if instance.clpeptidefilter.unique_in == 'dataset':
                        msrun = 'Dataset'
                    # Skip peptide if already seen
                    key = clpep.uniqueKey(unique_key)

                    if msrun not in unique_msrun_pep:
                        unique_msrun_pep[msrun] = dict()
                    if key in unique_msrun_pep[msrun]:
                        continue
                    else:
                        unique_msrun_pep[msrun][key] = True
                # Limit false positives using the score cutoffs computed above
                if instance.clpeptidefilter.fp_cutoff is not None:

                    if instance.clpeptidefilter.unique_in == 'msrun':
                        if clpep.match_score < fp_msrun[clpep.run_name][2]:
                            continue
                    else:
                        # Below the dataset-wide cutoff nothing better follows,
                        # so stop (again relies on descending score order)
                        if clpep.match_score < fp_msrun['dataset'][2]:
                            break
                    
            # Add peptide to dataset
            clpep.dataset.add(instance)

    @staticmethod
    @transaction.atomic
    def run_Percolator(self, request, queryset, form):
        """
        Run Percolator on a dataset and create a new Processed dataset
        """
        

        # Write cross-links to Percolator TSV format
        temp_fh = tempfile.NamedTemporaryFile(delete=False, mode='w')
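        # delete=False keeps the file on disk after close() so its path can be
        # passed to Percolator; both files are removed explicitly at the end.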

        clpeptide_set = dataset_set_2_clpeptide_set(queryset)
        
        # Filter cross-link type
        if form.cleaned_data['cl_type'] == 'IntraInter':
            clpeptide_set = clpeptide_set.filter(cross_link=True).exclude(link_type='Intra-peptide')
        elif form.cleaned_data['cl_type'] == 'Intra':
            clpeptide_set = clpeptide_set.filter(link_type='Intra-protein')
        elif form.cleaned_data['cl_type'] == 'Inter':
            clpeptide_set = clpeptide_set.filter(link_type='Inter-protein')

        export.make_Percolator_tsv(self, request, clpeptide_set,
                                   'Dataset_CLPeptide', fh=temp_fh,
                                   nv=form.cleaned_data['nv'])
        temp_fh.close()

        # Get the q-value threshold from the form
        q_value = form.cleaned_data['q_value']

        # Run Percolator
        out_file = temp_fh.name + '_perco'
        
        cmd = '"%s" -t %s -F %s -U "%s" > %s' % (settings.PERCOLATOR_BIN,
                                                 q_value,
                                                 q_value,
                                                 temp_fh.name,
                                                 out_file)
        p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)

        # communicate() drains stdout before the process exits, avoiding the
        # pipe-buffer deadlock that wait() followed by readlines() can cause
        stdout_data, _ = p.communicate()
        retval = p.returncode

        # Check if Percolator had a problem
        lines = stdout_data.decode('utf8', 'ignore').splitlines()
        percolator_out = '<br />'.join(lines)

        if retval != 0:
            messages.add_message(request, messages.ERROR, mark_safe(percolator_out))
        else:
            messages.add_message(request, messages.INFO, mark_safe(percolator_out))
            
            # Read back the CLPeptide ids kept by Percolator
            clpeptideid_dict = dict()
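            # Assumption: out_file follows Percolator's usual tab-separated
            # layout (PSMId, score, q-value, PEP, peptide, proteinIds); PSMId
            # is the CLPeptide pk written by make_Percolator_tsv, and cols[2]
            # is the q-value.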
            with open(out_file, 'r') as f:
                f.readline()  # Skip header

                for line in f:
                    cols = line.split('\t')

                    # Apply q-value cut-off
                    if float(cols[2]) <= q_value:
                        clpeptideid_dict[int(cols[0])] = True

            # Create processed dataset
            datasets = queryset
            datasets_pk = []

            # Transfer extra info
            fields = {
                'cross_linker': '',
                'instrument_name': '',
                'fasta_db': '',
                'name': '',
                'description': '',
                'project_id': '',
                'search_algorithm': '',
            }
            
            # Iterate over the selected datasets to gather information
            for dataset in datasets:

                datasets_pk.append(dataset.pk)

                # Keep the first non-empty value for each field
                for field in fields:
                    if fields[field] == '':
                        fields[field] = getattr(dataset, field)

                        
            # Compose the new dataset's name and description
            fields['name'] += ' (percolated: q-value <=%s, cl-type=%s, nv=%s)' % (q_value, 
                                                                                  form.cleaned_data['cl_type'],
                                                                                  form.cleaned_data['nv'])
            fields['description'] += percolator_out.replace('<br />', '\n')
            
            pdataset = ProcessedDataset(**fields)
            pdataset.save()
            
            # Link the original datasets to the new one
            for pk in datasets_pk:
                pdataset.datasets.add(pk)
                
            # Add CLPeptides to the new dataset via the M2M through model
            ThroughModel = CLPeptide.dataset.through
            through_list = []
            key_check = dict()
            
            for clpep in clpeptide_set:
                
                if clpep.pk in clpeptideid_dict:
                    
                    key = '%s-%s' % (clpep.pk, pdataset.pk)

                    if key not in key_check:

                        through_list.append(ThroughModel(clpeptide_id=clpep.pk,
                                                         dataset_id=pdataset.pk))
                        key_check[key] = True

            # Link CLPeptide objects to the dataset
            ThroughModel.objects.bulk_create(through_list)
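            # bulk_create() issues a single batched INSERT for all through
            # rows, far faster than clpep.dataset.add() per peptide; note it
            # bypasses save() and m2m_changed signals.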


            messages.add_message(
                request, messages.INFO,
                mark_safe('A new Processed dataset was created with Percolator'
                          ' results: ' + pdataset.formated_url()))

        # Remove temporary files
        os.remove(temp_fh.name)

        if os.path.isfile(out_file):
            os.remove(out_file)