Seqcomposition.pl
From Organic Design wiki
#!/usr/bin/perl -w
use strict;
use Data::Dumper;
# =============================================================================
# open input pileup file and determine number of sequences in alignment
# =============================================================================
my $debug = 0;
my $handle;
my $dir = ""; # what's that? -its redundant code
my $file = "2002sequences1.msf";
open($handle, "<$dir$file") or die ("Could not open $dir$file $!\n");
my %alignments;
# read data line by line
while(<$handle>) {
if(m/Name: (\w+?)\s/) {
$alignments{$1} = undef;
}
# Populating %alignments values for each alignment key
if(my($name, $seq) = m/^(\w+)\s+(.+)/) { # carrot ^ is extremely important in RegEx
$seq =~ s/ //g;
if(exists $alignments{$name}) {
$alignments{$name} .= $seq;
}
}
}
close($handle);
# Checking hash capture ok
if($debug) {
print Dumper(\%alignments);
}
# Put hash values into baseition structure
my @base;
foreach (keys %alignments) {
my@elements = split(//, $alignments{$_});
for(my$i=0; $i<=$#elements; $i++) {
$base[$i] .= $elements[$i];
}
}
if($debug) {
print Dumper(\@base);
}
my @printorder = ("A%","T%","G%","C%","N%","Total");
{
local $"="\t";
print "Base position\t@printorder\n";
}
my @basecalc;
for(my$i=0; $i<=$#base; $i++) {
$base[$i] =~ tr/atcgn/ATCGN/;
#print $base[$i];
my %basecount = (
A =>0,
C =>0,
T =>0,
G =>0,
N =>0,
Total=>0,
);
while($base[$i] =~ m/([ATCGN])/g) {
$basecount{Total}++; # increment the total number of bases
$basecount{$1}++; # increment the actual bases
}
foreach( keys %basecount ) {
if(m/(\w)/){
$basecount{$1."%"} = $basecount{$1}/$basecount{Total};
}
}
if($debug) {
print Dumper(\%basecount);
}
# Printing output
print $i+1,"\t"; # printing position
foreach(@printorder) {
print "$basecount{$_}\t";
}
print "\n";
}