#!/usr/bin/perl -w
#msastats.pl

use strict;
use Class::Struct;

use vars qw ($opt_n $opt_p);  # required if strict used
use Getopt::Std;

getopts ('np:');


# Print a helpful message if the user provides no input file.
if (!@ARGV) {
        print "usage:  msastats.pl [options] msafile\n\n";
        print "options:\n";
        print "-n:        normalize the names if identical\n";
        print "-p <num>:  print the alignment <num>\n";
       exit;
}
my $msafile = shift;

my $ave_id = 0.0;
my $std_id = 0.0;
my $ave_mut = 0.0;
my $std_mut = 0.0;
my $ave_indl = 0.0;
my $std_indl = 0.0;
my $ave_fo = 0.0;
my $std_fo = 0.0;

my $verbose = 0;

get_stats_msa($msafile, 
	      \$ave_id,   \$std_id, 
	      \$ave_mut,  \$std_mut, 
	      \$ave_indl, \$std_indl, 
	      \$ave_fo,    \$std_fo);

printf "\nSUMMARY STATS\n";
printf "pairwise ID:\t%.2f +/- %.2f\n", $ave_id, $std_id;
printf "pairwise MUT:\t%.2f +/- %.2f\n", $ave_mut, $std_mut;
printf "pairwise INDEL:\t%.2f +/- %.2f\n", $ave_indl, $std_indl;
printf "indel freq:\t%.5f +/- %.5f\n", $ave_fo, $std_fo;


# subroutines
sub accumulate_averages {
    my ($val, $meanval_ref, $meansquareval_ref) = @_;

    $$meanval_ref       += $val;
    $$meansquareval_ref += $val*$val;
}

sub calculate_averages {
    my ($meanval_ref, $stdval_ref, $number) = @_;

    my $mean = $$meanval_ref;
    my $std  = $$stdval_ref;

    if ($number > 1) {
	$mean /= $number;
	
	$std -= $mean*$mean*$number;
	$std /= ($number-1);
	$std  = sqrt($std);
    }
    elsif ($number == 1) {
	$mean /= $number;
	$std   = 0.0;
    }
    else {
	$mean = 0.0;
	$std  = 0.0;
    }

    $$meanval_ref = $mean;
    $$stdval_ref  = $std;

}

sub gather_stats {
    my ($nali, $ali, $aseq, $alen, 
	$msa_ref, $name_ref,
	$ave_id_ref,   $std_id_ref, 
	$ave_mut_ref,  $std_mut_ref, 
	$ave_indl_ref, $std_indl_ref, 
	$ave_fo_ref,   $std_fo_ref) = @_;

    my $sqlen_geomean = 0.0;
    my $sqlen_arimean = 0.0;
    my $sqlen_geostd  = 0.0;
    my $sqlen_aristd  = 0.0;
    my $meanpairid   = 0.0;
    my $stdpairid    = 0.0;
    my $meanpairmut  = 0.0;
    my $stdpairmut   = 0.0;
    my $meanpairindl = 0.0;
    my $stdpairindl  = 0.0;
    my $indelfreq  = 0.0;

    my $afreq  = 0.0;
    my $cfreq  = 0.0;
    my $gfreq  = 0.0;
    my $tfreq  = 0.0;
     my $nfreq  = 0.0;
    my $phylip_afreq  = 0.0;
    my $phylip_cfreq  = 0.0;
    my $phylip_gfreq  = 0.0;
    my $phylip_tfreq  = 0.0;
    my $phylip_nfreq  = 0.0;
    my $counts;
    my $phylip_counts = 0;

    #check the alignment is complete
    for (my $s = 0; $s < $aseq; $s ++) { 
	if (length($msa_ref->[$s]) != $alen) { 
	    print "bad alignment; seq $s has len ", length($msa_ref->[$s]),". alignment len is $alen\n$msa_ref->[$s]\n"; die;
	} 
    }

    # indel/residue statistics
    # indels use - only
    for (my $s = 0; $s < $aseq; $s ++) { 
	my $indel  = ($msa_ref->[$s] =~ tr/\-/\-/);
	my $acount = ($msa_ref->[$s] =~ tr/A/A/);
	my $ccount = ($msa_ref->[$s] =~ tr/C/C/);
	my $gcount = ($msa_ref->[$s] =~ tr/G/G/);
	my $tcount = ($msa_ref->[$s] =~ tr/T/T/); $tcount += ($msa_ref->[$s] =~ tr/U/U/);
	my $ncount = ($msa_ref->[$s] =~ tr/N/N/);

	#calcualte geometric and arithmetic mean of sqlen
	my $sqlen = $alen - $indel;
	accumulate_averages($sqlen,                           \$sqlen_arimean, \$sqlen_aristd);
	accumulate_averages(($sqlen > 0)? log($sqlen):-99999, \$sqlen_geomean, \$sqlen_geostd);
	
	$indelfreq += $indel; 

	$afreq += $acount; 
	$cfreq += $ccount; 
	$gfreq += $gcount; 
	$tfreq += $tcount; 
	$nfreq += $ncount; 

	$phylip_afreq += $acount + $indel; 
	$phylip_cfreq += $ccount + $indel; 
	$phylip_gfreq += $gcount + $indel; 
	$phylip_tfreq += $tcount + $indel; 
	$phylip_nfreq += $ncount + $indel; 
	
	$counts        += $acount + $ccount + $gcount + $tcount + $ncount + $indel;   
	$phylip_counts += $acount + $ccount + $gcount + $tcount + 4*$indel;   
    }     

    calculate_averages(\$sqlen_arimean, \$sqlen_aristd, $aseq);
    calculate_averages(\$sqlen_geomean, \$sqlen_geostd, $aseq);
    $sqlen_geomean = exp($sqlen_geomean);
    $sqlen_geostd  = exp($sqlen_geostd);

    if ($counts > 0) { 
	$indelfreq    /= $counts; 
	$afreq        /= $counts;  
	$cfreq        /= $counts; 
	$gfreq        /= $counts; 
	$tfreq        /= $counts; 
	$nfreq        /= $counts; 
    }
    if ($phylip_counts > 0) { 
	$phylip_afreq /= $phylip_counts;  
	$phylip_cfreq /= $phylip_counts; 
	$phylip_gfreq /= $phylip_counts; 
	$phylip_tfreq /= $phylip_counts; 
	$phylip_nfreq /= $phylip_counts; 
    }

    if (1.0-$indelfreq > 0 ) {
	$afreq /= (1.0-$indelfreq);  
	$cfreq /= (1.0-$indelfreq); 
	$gfreq /= (1.0-$indelfreq); 
	$tfreq /= (1.0-$indelfreq); 
 	$nfreq /= (1.0-$indelfreq); 
    }
    
    # pairwise identity statistics
    my $ncases = 0;
    for (my $s1 = 0; $s1 < $aseq; $s1 ++) { 
	for (my $s2 = $s1+1; $s2 < $aseq; $s2 ++) {
	    my $len = $alen;
	    my $pairid   = 0.0;
	    my $pairmut  = 0.0;
	    my $pairindl = 0.0;
	    $ncases ++;

	    for (my $x = 0.; $x < $alen; $x ++) {
		my $a = substr($msa_ref->[$s1], $x, 1);
		my $b = substr($msa_ref->[$s2], $x, 1);
		if ($a eq $b) {
		    if ($a eq "-") { $len --;    }
		    else           { $pairid ++; }
		}
		else {
		    if ($a eq "-" || $b eq "-") { $pairindl ++; }
		    else                        { $pairmut ++;  }
		}
	    }
	    if ($len > 0) { 
		$pairid   /= 0.01*$len; 
		$pairmut  /= 0.01*$len; 
		$pairindl /= 0.01*$len; 
	    }

	    accumulate_averages($pairid,   \$meanpairid,   \$stdpairid);	    
	    accumulate_averages($pairmut,  \$meanpairmut,  \$stdpairmut);	    
	    accumulate_averages($pairindl, \$meanpairindl, \$stdpairindl);	    
	}  
    }
    calculate_averages(\$meanpairid,   \$stdpairid,   $ncases);
    calculate_averages(\$meanpairmut,  \$stdpairmut,  $ncases);
    calculate_averages(\$meanpairindl, \$stdpairindl, $ncases);

    printf "\nALI NUM:\t%d\n", $nali;
    printf "Number of sequences:\t%d\n", $aseq;
    printf "Len alignment:\t%d\n", $alen;
    printf "seqs geometric mean:\t%.2f +/- %.2f\n", $sqlen_geomean, $sqlen_geostd;
    printf "seqs arithmetic mean:\t%.2f +/- %.2f\n", $sqlen_arimean, $sqlen_aristd;
    printf "pairwise ID:\t%.2f +/- %.2f\n", $meanpairid, $stdpairid;
    printf "pairwise MUT:\t%.2f +/- %.2f\n", $meanpairmut, $stdpairmut;
    printf "pairwise INDEL:\t%.2f +/- %.2f\n", $meanpairindl, $stdpairindl;
    printf "indel freq:\t%f\n", $indelfreq;
    printf "resid freq: A\t%f\n", $afreq;
    printf "resid freq: C\t%f\n", $cfreq;
    printf "resid freq: G\t%f\n", $gfreq;
    printf "resid freq: T\t%f\n", $tfreq;
    printf "resid freq: N\t%f\n", $nfreq;
    printf "phylip_resid freq: A\t%f\n", $phylip_afreq;
    printf "phylip_resid freq: C\t%f\n", $phylip_cfreq;
    printf "phylip_resid freq: G\t%f\n", $phylip_gfreq;
    printf "phylip_resid freq: T\t%f\n", $phylip_tfreq;
    printf "phylip_resid freq: N\t%f\n", $phylip_nfreq;

    if ($opt_p && $opt_p == $nali) { write_msa ($aseq, $msa_ref, $name_ref); }

    accumulate_averages($meanpairid,   $ave_id_ref,   $std_id_ref);	    
    accumulate_averages($meanpairmut,  $ave_mut_ref,  $std_mut_ref);	    
    accumulate_averages($meanpairindl, $ave_indl_ref, $std_indl_ref);	    
    accumulate_averages($indelfreq,    $ave_fo_ref,   $std_fo_ref);	    
        
 } 

sub get_stats_msa {
    my ($phylipmsafile, 
	$ave_id_ref,   $std_id_ref, 
	$ave_mut_ref,  $std_mut_ref, 
	$ave_indl_ref, $std_indl_ref, 
	$ave_fo_ref,   $std_fo_ref) = @_;
    
    my $ave_id   = 0.0;
    my $std_id   = 0.0;
    my $ave_mut  = 0.0;
    my $std_mut  = 0.0;
    my $ave_indl = 0.0;
    my $std_indl = 0.0;
    my $ave_fo   = 0.0;
    my $std_fo   = 0.0;

    my $ali;

    my @name;
    my @msa;
    my $aseq;
    my $alen;
    my $n = 0;

    my $nali = 0;
    my $isfirstblock;
    my $name;

    open(MSA, $phylipmsafile);
    while(<MSA>) {
	if (/\s*(\d+)\s+(\d+)/) {
	    $nali ++;
	    my $line = $_;
	    $isfirstblock = 1;
	    $aseq = $1;
	    $alen = $2;

	    if ($nali > 1) { gather_stats($nali-1, $ali, 
					  $aseq, $alen, 
					  \@msa, \@name,
					  \$ave_id,   \$std_id, 
					  \$ave_mut,  \$std_mut, 
					  \$ave_indl, \$std_indl, 
					  \$ave_fo,   \$std_fo); }
	    
	    #initialize
	    $ali = $line;
	    for (my $s = 0; $s < $aseq; $s ++) { $msa[$s] = ""; }
	}
	elsif (/\s*\S+/) {
	    my $line = $_;
	    $ali .= $line;

	    if ($isfirstblock) {
		if (/^\s*(\S+)\s+(.+)/) {
		    $name = $1;
		    $line = $2; 
		}
		else {
		    $line =~ s/^(.{10})//; #names in phylip format have at most 10 characters
		    $name = $1;
		}
		$name[$n] = $name; 
	    }
	     
	    $line =~ s/ //g;
	    $line =~ s/\n//g;

	    $msa[$n++] .= $line; 
	   
	}
	elsif (/^\s+$/) {
	    my $line = $_;
	    $ali .= $line;
	    $isfirstblock = 0;
	    if ($n != $aseq) { print "bad alignment\n"; die; }
	    $n = 0;
	}
    }
    close (MSA);

    gather_stats($nali, $ali, 
		 $aseq, $alen, 
		 \@msa, \@name,
		 \$ave_id,   \$std_id, 
		 \$ave_mut,  \$std_mut, 
		 \$ave_indl, \$std_indl, 
		 \$ave_fo,   \$std_fo);
    
    calculate_averages(\$ave_id,   \$std_id,   $nali);
    calculate_averages(\$ave_mut,  \$std_mut,  $nali);
    calculate_averages(\$ave_indl, \$std_indl, $nali);
    calculate_averages(\$ave_fo,   \$std_fo,   $nali);

    $$ave_id_ref   = $ave_id;
    $$std_id_ref   = $std_id;
    $$ave_mut_ref  = $ave_mut;
    $$std_mut_ref  = $std_mut;
    $$ave_indl_ref = $ave_indl;
    $$std_indl_ref = $std_indl;
    $$ave_fo_ref   = $ave_fo;
    $$std_fo_ref   = $std_fo;
}

# In Phylip format the names of the
# sequences have to be at most of length 10.
# Here we truncate names if necesary, and
# make sure that all names are different at that length
sub name_normalize {
    my ($nseq, $name_ref) = @_;

    my $maxlen = 10;
    
    if ($verbose) {
	for (my $n = 0; $n < $nseq; $n ++) {
	    print "OLD: $name_ref->[$n]\n";
	}
    }
    
    # Make names of length maxlen (10) either 
    # by truncating it, or by adding spaces
    for (my $n = 0; $n < $nseq; $n ++) {
	
	# truncate longer names
	if ($name_ref->[$n] =~ /^(.{$maxlen})/) {
	    $name_ref->[$n] = $1;
	}
	
	# add spaces for shorter names
	while (length($name_ref->[$n]) < $maxlen) {
	    $name_ref->[$n] .= " ";
	}
    }
    
    # After truncation, some names might be identical.
    # Change identical names by adding a counter.
    for (my $n = 0; $n < $nseq; $n ++) {
	my $c = 1;
	for (my $m = $n+1; $m < $nseq; $m ++) {
	    if ($name_ref->[$n] eq $name_ref->[$m]) {

		# change the first name
		if ($c == 1) {
		    # remove last two charecters and add the counter tag
		    $name_ref->[$n] =~ s/.{2}$//;
		    $name_ref->[$n] .= ".$c";
		}
		
		# first remove spaces at the end
		$name_ref->[$m] =~ s/ //g;
		
		# remove last two charecters and add the counter tag
		$c ++;
		$name_ref->[$m] =~ s/.{2}$//;
		$name_ref->[$m] .= ".$c";  
		
		# truncate or add spaces again
		if ($name_ref->[$m] =~ /^(.{$maxlen})/){
		    $name_ref->[$m] = $1;
		}		
		while (length($name_ref->[$m]) < $maxlen) {
		    $name_ref->[$m] .= " ";
		}
	    }
	}
    }  
    if ($verbose) {
	for (my $n = 0; $n < $nseq; $n ++) {
	    print "NEW: $name_ref->[$n]\n";
	}
    }
  
}

sub write_msa {
    my ($nseq, $msa_ref, $name_ref) = @_;

    my $block = 50;

    my $alen = length($msa_ref->[0]);
    for (my $n = 1; $n < $nseq; $n ++) {
	if ($alen != length($msa_ref->[$n])) {
	    print "write_msa_to_file(): bad alignment\n"; die; 
	}
    }

    # Make sure that the names of the sequences are all different
    # and with max len 10
    if ($opt_n) { name_normalize($nseq, $name_ref); }


    print " $nseq $alen\n";
    for (my $n = 0; $n < $nseq; $n ++) {
	print "$name_ref->[$n]\t";

	$msa_ref->[$n] =~ s/^(\S{$block})//; print "$1\n";
    }
    
    while (length($msa_ref->[0]) >= $block) {
	print "\n";
	for (my $n = 0; $n < $nseq; $n ++) {
	    $msa_ref->[$n] =~ s/^(\S{$block})//; print "$1\n";
	}
    }
    if (length($msa_ref->[0]) > 0) {
	print "\n";
	for (my $n = 0; $n < $nseq; $n ++) {
	    print "$msa_ref->[$n]\n";
	}
    }
    
    if ($verbose) { system("more $msafile\n"); }
}

