Seqcomposition.pl

From Organic Design wiki
#!/usr/bin/perl -w

use strict;
use Data::Dumper;

# =============================================================================
#    open input pileup file and determine number of sequences in alignment    
# =============================================================================
my $debug = 0;
my $handle;
my $dir  = ""; # what's that? -its redundant code
my $file = "2002sequences1.msf";
open($handle, "<$dir$file") or die ("Could not open $dir$file $!\n");

my %alignments;

# read data line by line 
while(<$handle>) {
  if(m/Name: (\w+?)\s/) {
    $alignments{$1} = undef;
  }
  
  # Populating %alignments values for each alignment key
  if(my($name, $seq) = m/^(\w+)\s+(.+)/) {  # carrot ^ is extremely important in RegEx
    $seq =~ s/ //g;
    if(exists $alignments{$name}) {
      $alignments{$name} .= $seq;
    }
  }  
}

close($handle);

# Checking hash capture ok
if($debug) {
  print Dumper(\%alignments);
}

# Put hash values into baseition structure

my @base;


foreach (keys %alignments) {
  my@elements = split(//, $alignments{$_});
  for(my$i=0; $i<=$#elements; $i++) {   
    $base[$i] .= $elements[$i];
  }
}

if($debug) {
  print Dumper(\@base);
}

my @printorder = ("A%","T%","G%","C%","N%","Total");
{

  local $"="\t";
  print "Base position\t@printorder\n";  
}  

my @basecalc;
for(my$i=0; $i<=$#base; $i++) {
  $base[$i] =~ tr/atcgn/ATCGN/;
  #print $base[$i];
  my %basecount = (
                    A    =>0,
                    C    =>0,
                    T    =>0,
                    G    =>0,
                    N    =>0,
                    Total=>0,
                   );
  while($base[$i] =~ m/([ATCGN])/g) {
   $basecount{Total}++; # increment the total number of bases
   $basecount{$1}++;    # increment the actual bases
  }

  foreach( keys %basecount ) {
    if(m/(\w)/){
      $basecount{$1."%"} = $basecount{$1}/$basecount{Total};
    }
  }
   if($debug) {
      print Dumper(\%basecount);
  }
  
  # Printing output
  print $i+1,"\t";             # printing position
  foreach(@printorder) {
    print "$basecount{$_}\t";
  }
  print "\n";
 
}