samNormalise.pl 3.96 KB
Newer Older
1
#!/usr/bin/env perl
2 3
use warnings;
use strict;
4

5 6
use Pod::Usage; ## uses pod documentation in usage code
use Getopt::Long qw(:config auto_help pass_through);
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21

my $pos = -1;
my $seqName = "";
my $bestFlags = "";
my $bestID = "";
my $bestSeq = "";
my $bestQual = "";
my $bestLine = "";
my $seenCount = 0;
my $targetCoverage = 100;
my %posReads = ();
my %windowReads = ();
my $covNeeded = -1;
my $output = "sam"; # can be "sam" or "fastq"

22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
our $VERSION = "0.1";

=head1 NAME

samNormalise.pl -- randomly filter mapped reads down to a specific coverage

=head1 SYNOPSIS

samtools view reads.bam | ./samNormalise.pl [options]

=head2 Options

=over 2

=item B<-help>

Only display this help message

=item B<-coverage>

Change target coverage (default: 100)

=item B<-format> (sam|fastq)

Change output format (default: sam)

=back

=head1 DESCRIPTION

Extracts random sequences from those mapped at each base location,
attempting to maintain a minimum coverage threshold where possible.

=cut

GetOptions("format=s" => \$output, "coverage=i" => \$targetCoverage)
  or pod2usage(1);


61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164
sub printSeq {
  my ($id, $seq, $qual) = @_;
  if($id){
    printf("@%s\n%s\n+\n%s\n", $id, $seq, $qual);
  }
}

sub dumpSamples {
  my %posReads = %{shift @_};
  if($output eq "fastq"){
    foreach my $pos (keys(%posReads)){
      printSeq($posReads{$pos}{"id"},
               $posReads{$pos}{"seq"},
               $posReads{$pos}{"qual"});
    }
  } elsif($output eq "sam"){
    foreach my $pos (keys(%posReads)){
      print($posReads{$pos}{"line"});
    }
  }
}

sub getMappedLength {
  my $cigar = shift @_;
  my $mappedLen = 0;
  while($cigar =~ s/^([0-9]+)([MIDNSHP=X])//){
    my $subLen = $1;
    my $op = $2;
    if($op =~ /[M=XD]/){
      $mappedLen += $subLen;
    }
  }
  return($mappedLen);
}

while(<>){
  if(/^@/){
    print;
    next;
  }
  my $line = $_;
  chomp;
  my @F = split(/\t/);
  my $currentPos = $F[3];
  if($F[2] ne $seqName){
    ## New reference name, so dump buffered reads
    dumpSamples(\%posReads);
    ## then clear coverage and position buffer
    %posReads = ();
    %windowReads = ();
    $covNeeded = $targetCoverage;
    $seenCount = 0;
  }
  if($currentPos != $pos){
    ## New position, so dump sampled reads from the position buffer (if any)
    dumpSamples(\%posReads);
    ## Add sampled reads from position buffer to window buffer
    foreach my $pos (keys(%posReads)){
      $windowReads{$posReads{$pos}{"id"}}{"pos"} =
        $posReads{$pos}{"pos"};
      $windowReads{$posReads{$pos}{"id"}}{"len"} =
        $posReads{$pos}{"len"};
    }
    ## Clear position buffer
    %posReads = ();
    ## Count up all reads that cover the current location
    $covNeeded = $targetCoverage;
    foreach my $readID (keys(%windowReads)){
      my $currentCov = 0;
      if($windowReads{$readID}{"pos"} +
         $windowReads{$readID}{"len"} < $currentPos){
        ## remove any reads that don't cover the current location
        delete($windowReads{$readID});
      } else {
        ## Read still within range, so reduce needed coverage by 1
        $covNeeded--;
      }
    }
    $seenCount = 0;
  }
  if($covNeeded > 0){
    ## need to add more reads because coverage threshold is below target
    ## or keep the new sequence with probability $covNeeded/$seenCount
    $seenCount++;
    my $posToSwitch = int(rand($seenCount));
    ## keep the first reads up to the coverage limit
    if(scalar(keys(%posReads)) < $covNeeded){
      $posToSwitch = scalar(keys(%posReads));
    }
    ## Note: reads are chosen at random, rather than length-based
    if($posToSwitch < $covNeeded){
      ## Add directly to position buffer, replacing anything there already
      $posReads{$posToSwitch}{"line"} = $line;
      $posReads{$posToSwitch}{"id"} = $F[0];
      $posReads{$posToSwitch}{"seq"} = $F[9];
      $posReads{$posToSwitch}{"pos"} = $F[3];
      $posReads{$posToSwitch}{"len"} = getMappedLength($F[5]);
      $posReads{$posToSwitch}{"qual"} = $F[10];
    }
  }
  $seqName = $F[2];
  $pos = $currentPos;
}
dumpSamples(\%posReads);