#!/usr/bin/perl
#use strict;
use Bio::SeqIO;
use List::Util qw(max);

#=================================
#Script to generate length summary stats and trim sequences
#briandotoakleyatarsdotusdadotgov
#k.purdy at warwick.ac dot uk
#=================================

#---define input file----
my $usage = "\nUsage:  $0 filename\n\n";
        $input_file =$ARGV[0] or die $usage;
	chomp $input_file;
        print "\nFirst determining length distributions and number of Ns...\n";
        
    my $seq_in  = Bio::SeqIO->new( -format => 'fasta',-file => $input_file);
   
   
#---define output file----
    $out = Bio::SeqIO->new(-file => '>seq_length_summary.fas', -format => 'fasta');

#--Push all sequences into array--
    my $seq;
    my @seq_array;
    while( $seq = $seq_in->next_seq() ) {
       push(@seq_array,$seq);
    }

# Now do something with these. First sort by length,
# Find the Mean, median, min, max lengths and print them out
    @seq_array = sort { $a->length <=> $b->length } @seq_array;
    my $total = 0;
    my $count = 0;
    my $N_match_count = 0;
    my $num_seqs_w_atleast_1N = 0;
 
    foreach my $seq_obj (@seq_array) {
  	push (@seq_lengths, $seq_obj->length);
    	    	
#---Other basic stats and N counting stuff-----------    	
    	$N_match_count = $seq_obj->seq() =~ tr/N/N/;
	$N_match_count_total+=$N_match_count;
       	$total+=$seq_obj->length;
       	$count+=1;
       	if ($N_match_count>0) {
       	$num_seqs_w_atleast_1N++;
       	}
    }
 
#---Use hash '%count' to count occurences of each length-----
  %count=();
foreach $value ( @seq_lengths ) {
$count{$value}++;
} 

$hash_size = scalar keys %count;

#----print out Length Stats-----------
	print "\n\nLength Summary Statistics:\n",
	"--------------------------";
	print
	"\nMin = ", $seq_array[0]->length,
	"\nMean = ",$total/$count,
	"\nMax = ", $seq_array[$count-1]->length,
	"\nMedian = ",$seq_array[$count*0.5]->length,
	"\n--------------------------",
	"\n\n";
	

if ($hash_size<50) {
print "length \t count\n";
print "------ \t -----\n";
for $key (sort {$a<=>$b} keys %count) {
my $value = $count{$key};
print "$key \t $value\n";
}
print "\n\n";
}

if ($hash_size>50) {
print "Length frequency table is too large to display well and so is written to 'seq_length_counts.txt'\n\n";
open (OUT,">seq_length_counts.txt") or die "\nOutput file unable to open. You may have it open in another window.\n";
print OUT ("length \t count\n");
for $key (sort {$a<=>$b} keys %count) {
my $value = $count{$key};
print OUT ("$key \t $value\n");
}
}
close OUT;
#----Print out N stats------

	print "\nN Summary Statistics:\n",
	"--------------------------";
	print "\nMean number of N's = ", $N_match_count_total/$count;
	print "\nNumber of seqs with one or more N's = ", $num_seqs_w_atleast_1N,

	"\n--------------------------",
	"\n\n";
                
#---------TRIM SEQS ACCORDING TO MIN AND MAX LENGTH INPUTS, REMOVE N's-------      
                
print "\nWhat is the minimum sequence length to keep?....";
chomp ($min_length_screen=<STDIN>);

print "\nWhat is the maximum sequence length to keep?....";
chomp ($max_length_trim=<STDIN>);

print "\nWhat is the maximum number of 'N's acceptable?....";
chomp ($N_cutoff=<STDIN>);

my $num_OK_seqs=0;
foreach my $seq_obj (@seq_array) {
   $seq_length = $seq_obj->length();
   $N_match_count = $seq_obj->seq() =~ tr/N/N/;
   if (($N_match_count<=$N_cutoff)&&($seq_length>=$min_length_screen)) {
      if ($seq_length<$max_length_trim) {
         $out->write_seq($seq_obj->trunc(1,$seq_length));
         $num_OK_seqs++;
         }
      if ($seq_length>$max_length_trim) {
         $out->write_seq($seq_obj->trunc(1,$max_length_trim));
         $num_OK_seqs++;
         }
     
   }
}

print "\n$num_OK_seqs sequences between $min_length_screen and $max_length_trim bp and
with fewer than $N_cutoff Ns written to 'seq_length_summary.fas' \n\n";