#!/usr/bin/perl #use strict; use Bio::SeqIO; use List::Util qw(max); #================================= #Script to generate length summary stats and trim sequences #briandotoakleyatarsdotusdadotgov #k.purdy at warwick.ac dot uk #================================= #---define input file---- my $usage = "\nUsage: $0 filename\n\n"; $input_file =$ARGV[0] or die $usage; chomp $input_file; print "\nFirst determining length distributions and number of Ns...\n"; my $seq_in = Bio::SeqIO->new( -format => 'fasta',-file => $input_file); #---define output file---- $out = Bio::SeqIO->new(-file => '>seq_length_summary.fas', -format => 'fasta'); #--Push all sequences into array-- my $seq; my @seq_array; while( $seq = $seq_in->next_seq() ) { push(@seq_array,$seq); } # Now do something with these. First sort by length, # Find the Mean, median, min, max lengths and print them out @seq_array = sort { $a->length <=> $b->length } @seq_array; my $total = 0; my $count = 0; my $N_match_count = 0; my $num_seqs_w_atleast_1N = 0; foreach my $seq_obj (@seq_array) { push (@seq_lengths, $seq_obj->length); #---Other basic stats and N counting stuff----------- $N_match_count = $seq_obj->seq() =~ tr/N/N/; $N_match_count_total+=$N_match_count; $total+=$seq_obj->length; $count+=1; if ($N_match_count>0) { $num_seqs_w_atleast_1N++; } } #---Use hash '%count' to count occurences of each length----- %count=(); foreach $value ( @seq_lengths ) { $count{$value}++; } $hash_size = scalar keys %count; #----print out Length Stats----------- print "\n\nLength Summary Statistics:\n", "--------------------------"; print "\nMin = ", $seq_array[0]->length, "\nMean = ",$total/$count, "\nMax = ", $seq_array[$count-1]->length, "\nMedian = ",$seq_array[$count*0.5]->length, "\n--------------------------", "\n\n"; if ($hash_size<50) { print "length \t count\n"; print "------ \t -----\n"; for $key (sort {$a<=>$b} keys %count) { my $value = $count{$key}; print "$key \t $value\n"; } print "\n\n"; } if ($hash_size>50) { print "Length frequency table is too large to display well and so is written to 'seq_length_counts.txt'\n\n"; open (OUT,">seq_length_counts.txt") or die "\nOutput file unable to open. You may have it open in another window.\n"; print OUT ("length \t count\n"); for $key (sort {$a<=>$b} keys %count) { my $value = $count{$key}; print OUT ("$key \t $value\n"); } } close OUT; #----Print out N stats------ print "\nN Summary Statistics:\n", "--------------------------"; print "\nMean number of N's = ", $N_match_count_total/$count; print "\nNumber of seqs with one or more N's = ", $num_seqs_w_atleast_1N, "\n--------------------------", "\n\n"; #---------TRIM SEQS ACCORDING TO MIN AND MAX LENGTH INPUTS, REMOVE N's------- print "\nWhat is the minimum sequence length to keep?...."; chomp ($min_length_screen=); print "\nWhat is the maximum sequence length to keep?...."; chomp ($max_length_trim=); print "\nWhat is the maximum number of 'N's acceptable?...."; chomp ($N_cutoff=); my $num_OK_seqs=0; foreach my $seq_obj (@seq_array) { $seq_length = $seq_obj->length(); $N_match_count = $seq_obj->seq() =~ tr/N/N/; if (($N_match_count<=$N_cutoff)&&($seq_length>=$min_length_screen)) { if ($seq_length<$max_length_trim) { $out->write_seq($seq_obj->trunc(1,$seq_length)); $num_OK_seqs++; } if ($seq_length>$max_length_trim) { $out->write_seq($seq_obj->trunc(1,$max_length_trim)); $num_OK_seqs++; } } } print "\n$num_OK_seqs sequences between $min_length_screen and $max_length_trim bp and with fewer than $N_cutoff Ns written to 'seq_length_summary.fas' \n\n";