#! perl
#ThermoPhyl v1.4
#by Brian Oakley (b.b.c.oakley at warwick.ac.uk)  This script was originally written as a PERL
#learning exercise and to fulfill a need within our group and has since evolved based on
#comments of friends and colleagues.  PERL code can be extremely concise but here I have tried
#to use 'long-hand' to make things easy to follow and to modify based on your own needs.  

use List::Util qw(max);
system ('cls');

#---Default user options---
$instructions=Y;
$num_oligos=3;
$assay_column_headings=N;
$target_column_headings=N;
$Primer3_format=N;
#--------------------------
$option1_toggle=0;
$option3_toggle=1;
$option4_toggle=1;
$option5_toggle=1;

#----Begin sub intro screen--------
sub intro_screen {
system ('cls');
print "\nWelcome to ThermoPhyl v1.4
by Brian Oakley
b.b.c.oakley at warwick.ac.uk\n\n";

print "\nNumber:   Current     Option:
         Settings:\n
-------   --------    -------\n
   1         $instructions        Show instructions?\n
   2         $num_oligos        # of oligos?  Probing/PCR/TaqMan qPCR (1/2/3)\n
   3         $assay_column_headings        'candidate_assays.txt' file w/ column headings?\n
   4         $target_column_headings        'target_list.txt' file w/ column headings?\n
   5         $Primer3_format        Reformat 'candidate_assays.txt' file from BatchPrimer3?\n
\nEnter number of option to change or Y to run:  ";

$user_options=<>;
chomp $user_options;
if ($user_options==1) {
	 	$option1_toggle = 1 - $option1_toggle ;
		if ($option1_toggle==0) {
		$instructions=Y;
		}
		if ($option1_toggle==1) {
		$instructions=N;
		}
	}
		
if ($user_options==2) {
        print "\nEnter 1, 2, or 3 for number of oligos to test...";
        chomp (my $input=<>);
        if ($input == 1||$input == 2||$input == 3) {
		$num_oligos=$input;
		}
		}

if ($user_options==3) {
        $option3_toggle = 1 - $option3_toggle;
        if ($option3_toggle==0) {
		$assay_column_headings=Y;
		}
		if ($option3_toggle==1) {
		$assay_column_headings=N;
		}
		} 
		
if ($user_options==4) {
        $option4_toggle = 1 - $option4_toggle;
        if ($option4_toggle==0) {
		$target_column_headings=Y;
		}
		if ($option4_toggle==1) {
		$target_column_headings=N;
		}
		} 

if ($user_options==5) {
       $option5_toggle = 1 - $option5_toggle;
        if ($option5_toggle==0) {
		$Primer3_format=Y;
		}
		if ($option5_toggle==1) {
		$Primer3_format=N;
		}
		} 
}
#------End sub intro screen----------
#---Show intro screen until user returns 'y'---- 

until ($user_options eq 'y'||$user_options eq 'Y') {
&intro_screen;
}

if ($instructions eq "Y") {
&verbose_intro;
}

#----Begin sub verbose intro--------
sub verbose_intro {
print "\n------------------------------------------------
This script will identify optimally specific and sensitive 
assays for FISH, traditional PCR, or quantitative-PCR based on defined target 
and non-target groups.
------------------------------------------------

The output is a text list of which sequences matched which assays
for both target and non-target groups and should greatly aid in 
designing sensitive and specific assays.\n
*******************************************************


All the user has to do is provide the following three files:

1)  'outside_world.fas'  File containing ALL relevant sequences in fasta format. 

2)  'target_list.txt'  These are the names of your target sequences saved in a file with a single column.  

These names must match the appropriate sequence IDs in your 'outside world.fas' file. 

3) 'candidate_assays.txt'  Format can be in single column as output from BatchPrimer3 or 2 columns in the order of Forward primer, 
Reverse primer.  For qPCR, use 3 columns in the order Forward primer, Probe, Reverse primer.

PLEASE NOTE:  All files must be saved in the same directory as this script.

*******************************************************\n";
}
#------End sub verbose intro----------

#----Read input files-----
print "\nPress enter to continue.  Will first read target list...\n";
<STDIN>;
open (INPUT, "<target_list.txt") or die "\nTarget list file is missing. Please try again.\n";
@targets=<INPUT>;
if ($target_column_headings eq "Y") {
	shift @targets;
}
$targets = "@targets";
@targets = split(/\n/, $targets);
	my %seen = ();
	foreach $line(@targets) {
	chomp $line;
        $line =~ s/[\s]//g;
	$line =~ s/\s+$//g;
        if ($line =~ m/[\d]|[\w]/) {
	push(@uniq_targets, $line) unless $seen{$line}++;
	}
        }
$number_in_target_list = scalar (@uniq_targets);

print  "\nYour target list contains the following $number_in_target_list unique sequence IDs:\n\n";

	foreach $line(@uniq_targets) {
	print "$line\n";
	}
 
#-----Next read in outside_world file, strip gaps, check for non-standard characters----
print "\nPress enter to continue.  Will next read in outside_world file and \ncompare to target list....\n";
<STDIN>;

open (INPUT, "<outside_world.fas") or die "\nOutside world file is missing. Please try again.\n";
@data=<INPUT>;
$test = "@data";
@all_sequences = split(/>/, $test);
shift @all_sequences;

foreach $entry (@all_sequences) {
	@lines = split(/\n/,$entry);
	$header = shift(@lines);
	$sequence = "@lines";
	$sequence=~s/\n|\t|\s|-|~//gi;
	push (@outside_world, $header."\n".$sequence);
		if ($sequence =~m /[^acgtACGT]/gi) {
		$non_standard_char=1;
                push (@bad_seqs, $header."\n".$sequence);
		}
}
	
#----Compare target list to outside world----
foreach $line(@uniq_targets) {
	my $found_counter=0;
	my $entry_counter=-1;
	foreach $entry (@outside_world) {
		$entry_counter+=1;
		if ($entry =~ m/$line/g)  {
		$found_counter+=1;
		push (@target_sequences, $entry);
		splice (@outside_world,$entry_counter,1);
		}
	}	
	
	if ($found_counter==0||$found_counter>1) {
	$error_counter+=1;
	if ($error_counter==1)  {
	print "\n------------------------------\n";
	print "\n \t  --- WARNING --- \n Target sequence    Number of matches\n  -------------      -------------\n";
	}  
	print " $line\t\t\t$found_counter\n";
	$warning=1;
	}
}
#---print warning if any discrepancies---
if ($warning==1) {
print "\n------------------------------
You have some discrepancy between your target list and your outside_world file.
The script will still run, but this will affect your sensitivity and specificity calculations.
\nTo solve the problem, first remove any non-standard characters 
(e.g. tabs, etc) from your target list.  
\nA simple unique identifier for each sequence like an accession number is best.  
If the problem persists, you probably have some real discrepancy 
between the two files that should be corrected.\n
------------------------------";  
}
#---Confirm matching between target list and outside world---
else {
print "\nNice work - all sequences in your target list were found once 
and only once in your outside_world file.\n";
}

@non_target_sequences = @outside_world;
$number_of_target_seqs = scalar (@target_sequences);
$number_of_nontarget_seqs = scalar (@non_target_sequences);
#----End comparison of target list to outside world and reading and parsing of input files-----

print "\nPress enter to proceed.  Will next read candidate assays...\n"; 
<STDIN>;

#---Invoke subroutines based on user options---- 
if ($num_oligos==1) {
        FISH ();
    }
elsif ($num_oligos==2) {
        trad_PCR ();
    } 
elsif ($num_oligos==3) {
        qPCR ();        
    } 
  
#----Begin sub FISH--------
sub FISH {
open (INPUT, "<candidate_assays.txt")or die "\nCandidate assays file is missing. Please try again.\n";;
chomp (@P=<INPUT>);
if ($assay_column_headings eq "Y") {
	shift @P;
	}
	my %seen = ();
	foreach $P(@P) {
	push(@uniq_P, $P) unless $seen{$P}++;
	}

$num_of_assays = scalar(@uniq_P);

print "\nYou have ",scalar(@P)," Probes, of which $num_of_assays are unique.\n"; 

<STDIN>;

print "\nWill now determine sensitivity and specificity for each probe.
\nOutput may take a little while.  Press enter when ready...\n";
<STDIN>;

open(OUT,">raw_search_results.txt") or die "Problem opening output file. You may have it open in another window\n";
open(OUT1,">sorted_search_results.txt") or die "Problem opening output file. You may have it open in another window\n";
print OUT ("Target(Y/N)\tProbe_num\tCumulative_Matches\tPosition\tProbe\tMatch\n");
print OUT1 ("Probe\tPosition\tTarget_matches\tNon_target_matches\tProbe\n");

$a=0;
while ($a<$num_of_assays) {
		$a += 1;   	 
		$P = shift(@uniq_P);
		
print "Testing $a of $num_of_assays probes against $number_of_target_seqs targets and $number_of_nontarget_seqs non-targets\n";

$target_match_tally = 0;
$non_target_match_tally = 0;

foreach $entry (@target_sequences) {
	@lines = split(/\n/,$entry);
	$header = shift(@lines);
	$target_sequence = "@lines";
	$target_sequence=~s/-|~\s//gi;

if ($target_sequence =~m/$P/gi) {
$P_pos = index($target_sequence, $P);
$target_match_tally += 1;

print OUT ("Y\t$a\t$target_match_tally\t$P_pos\t$P\t$header\n");	
}
}
$max_target = max $target_match_tally;

foreach $entry (@non_target_sequences) {
	@lines = split(/\n/,$entry);
	$header = shift(@lines);
	$non_target_sequence = "@lines";
	$non_target_sequence=~s/-|~\s//gi;

if ($non_target_sequence =~m/$P/gi) {
$P_pos = index($non_target_sequence, $P);
$non_target_match_tally += 1;
print OUT ("N\t$a\t$non_target_match_tally\t$P_pos\t$P\t$header\n");	
}
}
$max_non_target = max $non_target_match_tally;
push @data_summary, {primer_pair=>$a,Probe=>$P,position=>$P_pos,target_matches=>$max_target,non_target_matches=>$max_non_target};
}
}
#------End sub FISH----------

#------Begin sub trad_PCR--------
sub trad_PCR {

if ($Primer3_format eq "Y") {
&Primer3_input;
}
elsif ($Primer3_format eq "N") {
&no_Primer3_input;
}
 
#--Begin nested_sub for no Primer3 format--
sub no_Primer3_input {
open (INPUT, "<candidate_assays.txt") or die "\nCandidate assays file is missing. Please try again.\n";
@primers=<INPUT>;
if ($assay_column_headings eq "Y") {
	shift @primers;
	}

	my %primers = ();
	foreach $line(@primers) {
	push(@uniq_assays, $line) unless $primers{$line}++;
	}

foreach $line(@uniq_assays) {
$line =~ s/\n//g;
@FR=split (/\t/, $line);
push (@F,$FR[0]);
push (@R,$FR[1]);
}
print "\nYou have ",scalar(@primers)," candidate assays, of which ",scalar(@uniq_assays)," are unique\n\n";
}

#--End nested_sub for no Primer3 format--
#--Begin nested_sub for Primer3 format--
sub Primer3_input {
open (INPUT, "<candidate_assays.txt") or die "\nCandidate assays file is missing. Please try again.\n";
chomp(@primers=<INPUT>);
if ($assay_column_headings eq "Y") {
	shift @primers;
	}
$num_of_Primer3_primers = scalar (@primers);
if ($num_of_Primer3_primers % 2==1) {
die "\n\nWARNING!  It appears you have an unequal number of forward and reverse primers.  Please check and try again.  Make sure there are no empty line returns at the end of the file\n\n";
}

my $counter=0;
my %primer3_primers = ();
while ($counter<$num_of_Primer3_primers-1) { 
	$concatenated_primers = ($primers[$counter]."\t".$primers[$counter+1]);
 	push(@uniq_assays, $concatenated_primers) unless $primer3_primers{$concatenated_primers}++;
$counter +=2;
}

foreach $line(@uniq_assays) {
$line =~ s/\n//g;
@FR=split (/\t/, $line);
push (@F,$FR[0]);
push (@R,$FR[1]);
}
print "\nYou have ",(scalar(@primers))/2," candidate assays, of which ",scalar(@uniq_assays)," are unique\n\n";
}
#--End nested_sub for Primer3 format--
$num_of_assays=scalar(@uniq_assays);
$num_F = scalar(@F);
$num_R = scalar(@R);

foreach $R (@R) {
	$R = reverse $R;
	$R =~ tr/ACGTacgt/TGCAtgca/;
       	push (@R_rev_comp, $R);
}
@R=@R_rev_comp;


print "\nWill now determine sensitivity and specificity for each unique primer pair.
\nOutput may take a little while.  Press enter when ready...\n";
<STDIN>;

open(OUT,">raw_search_results.txt") or die "Problem opening output file. You may have it open in another window\n";
open(OUT1,">sorted_search_results.txt") or die "Problem opening output file. You may have it open in another window\n";
print OUT ("Target(Y/N)\tPrimer_pair\tCumulative_Matches\tF_pos\tR_pos\tAmplicon_length\tF\tR_motif\tMatch\n");
print OUT1 ("Primer_pair\tTarget_matches\tNon_target_matches\tF\tF_pos\tR_motif\tR_pos\tamp_length\n");

$a=0;
while ($a<$num_of_assays) {
		$a += 1;   	 
		$F = shift(@F);
		$R = shift(@R);

print "Testing $a of $num_of_assays primers against $number_of_target_seqs targets and $number_of_nontarget_seqs non-targets\n";

$target_match_tally = 0;
$non_target_match_tally = 0;

foreach $entry (@target_sequences) {
	@lines = split(/\n/,$entry);
	$header = shift(@lines);
	$target_sequence = "@lines";
	$target_sequence=~s/-|~\s//gi;

if ($target_sequence =~m/$F.*$R/gi) {
$F_pos = index($target_sequence, $F);
$R_pos = index($target_sequence, $R);
$amp_length = ($R_pos - $F_pos);
$target_match_tally += 1;

print OUT ("Y\t$a\t$target_match_tally\t$F_pos\t$R_pos\t$amp_length\t$F\t$R\t$header\n");	
}
}
$max_target = max $target_match_tally;

foreach $entry (@non_target_sequences) {
	@lines = split(/\n/,$entry);
	$header = shift(@lines);
	$non_target_sequence = "@lines";
	$non_target_sequence=~s/-|~\s//gi;

if ($non_target_sequence =~m/$F.*$R/gi) {
$F_pos = index($non_target_sequence, $F);
$R_pos = index($non_target_sequence, $R);
$amp_length = ($R_pos - $F_pos);
$non_target_match_tally += 1;
print OUT ("N\t$a\t$non_target_match_tally\t$F_pos\t$R_pos\t$amp_length\t$F\t$R\t$header\n");	
}
}
$max_non_target = max $non_target_match_tally;
push @data_summary, {primer_pair=>$a,F_pos=>$F_pos,R_pos=>$R_pos,F_primer=>$F,R_primer=>$R,target_matches=>$max_target,non_target_matches=>$max_non_target,amp_length=>$amp_length};
}
}
#------End sub trad_PCR----------

#------Begin sub q_PCR---------
sub qPCR {
open (INPUT, "<candidate_assays.txt") or die "\nCandidate assays file is missing. Please try again.\n";
@input=<INPUT>;
if ($assay_column_headings eq "Y") {
	shift @input;
	}
	my %seen = ();
	foreach $line(@input) {
	push(@uniq_assays, $line) unless $seen{$line}++;
	}

$num_of_assays = scalar(@uniq_assays);

foreach $line(@uniq_assays) {
$line =~ s/\n//g;
@FPR=split (/\t/, $line);
push (@F,$FPR[0]);
push (@P,$FPR[1]);
push (@R,$FPR[2]);
}

$num_F = scalar(@F);
$num_P = scalar(@P);
$num_R = scalar(@R);
if ($num_F != $num_R||$num_F != $num_P) {
        print "You do not have equal numbers of primers and probes\n";
}

foreach $R (@R) {
	$R = reverse $R;
	$R =~ tr/ACGTacgt/TGCAtgca/;
	push (@R_rev_comp, $R);
}
@R=@R_rev_comp;

print "\nYou have ",scalar(@input)," candidate assays, of which $num_of_assays are unique\n\n";
print "\nWill now determine sensitivity and specificity for each primer pair.
\nOutput may take a little while.  Press enter when ready...\n";

<STDIN>;
open(OUT,">raw_search_results.txt") or die "Problem opening output file. You may have it open in another window\n";
open(OUT1,">sorted_search_results.txt") or die "Problem opening output file. You may have it open in another window\n";
print OUT ("Target(Y/N)\tPrimer_pair\tCumulative_Matches\tF_pos\tProbe_pos\tR_pos\tAmplicon_length\tF\tP\tR_motif\tMatch\n");
print OUT1 ("Primer_pair\tTarget_matches\tNon_target_matches\tF\tF_pos\tP\tR_motif\tR_pos\n");

$b=0;
while ($b<$num_of_assays) {
		$b += 1;   	 
		$F = shift(@F);
		$P = shift(@P);
		$R = shift(@R);
print "Testing $b of $num_of_assays primers against $number_of_target_seqs targets and $number_of_nontarget_seqs non-targets\n";
$target_match_tally = 0;
$non_target_match_tally = 0;

foreach $entry (@target_sequences) {
	@lines = split(/\n/,$entry);
	$header = shift(@lines);
	$target_sequence = "@lines";
	$target_sequence=~s/-|~\s//gi;

if ($target_sequence =~m/$F.*$P.*$R/gi) {
$F_pos = index($target_sequence, $F);
$P_pos = index($target_sequence, $P);
$R_pos = index($target_sequence, $R);
$amp_length = ($R_pos - $F_pos);
$target_match_tally += 1;

print OUT ("Y\t$b\t$target_match_tally\t$F_pos\t$P_pos\t$R_pos\t$amp_length\t$F\t$P\t$R\t$header\n");
}
}
$max_target = max $target_match_tally;

foreach $entry (@non_target_sequences) {
	@lines = split(/\n/,$entry);
	$header = shift(@lines);
	$non_target_sequence = "@lines";
	$non_target_sequence=~s/-|~\s//gi;

if ($non_target_sequence =~m/$F.*$P.*$R/gi) {
$F_pos = index($non_target_sequence, $F);
$P_pos = index($non_target_sequence, $P);
$R_pos = index($non_target_sequence, $R);
$amp_length = ($R_pos - $F_pos);
$non_target_match_tally += 1;
print OUT ("N\t$b\t$non_target_match_tally\t$F_pos\t$P_pos\t$R_pos\t$amp_length\t$F\t$P\t$R\t$header\n");

}
}
$max_non_target = max $non_target_match_tally;
push @data_summary, {primer_pair=>$b,F_primer=>$F,Probe=>$P,R_primer=>$R,target_matches=>$max_target,non_target_matches=>$max_non_target,F_pos=>$F_pos,R_pos=>$R_pos};
}
}

#-------End sub q_PCR----------

#---Organize data for sorted summary output----
@sorted_summary =  sort { $a->{non_target_matches} <=> $b->{non_target_matches} } @data_summary;
@sorted_summary =  sort { $b->{target_matches} <=> $a->{target_matches} } @sorted_summary;
if ($num_oligos==1) {
print OUT1 map {$_->{primer_pair}."\t".$_->{position}."\t".$_->{target_matches}."\t".$_->{non_target_matches}."\t".$_->{Probe}."\n"}@sorted_summary;
}
if ($num_oligos==2) {
print OUT1 map {$_->{primer_pair}."\t".$_->{target_matches}."\t".$_->{non_target_matches}."\t".$_->{F_primer}."\t".$_->{F_pos}."\t".$_->{R_primer}."\t".$_->{R_pos}."\t".$_->{amp_length}."\n"}@sorted_summary;
}
if ($num_oligos==3) {
print OUT1 map {$_->{primer_pair}."\t".$_->{target_matches}."\t".$_->{non_target_matches}."\t".$_->{F_primer}."\t".$_->{F_pos}."\t".$_->{Probe}."\t".$_->{R_primer}."\t".$_->{R_pos}."\n"}@sorted_summary;
}

#---Final Outputs---
if ($non_standard_char==1) {
print "\n-----------------------------\nWarning:  Sequences containing non-standard characters were detected.\nThis may affect your search results.  \nThe culprits are in the file 'bad_seqs.fas'.\n-----------------------------\n";
open (BAD_OUT, ">bad_seqs.fas");
foreach $seq (@bad_seqs) {
print BAD_OUT (">",$seq,"\n");
}
}
print "\nDONE.  Successfully performed ".(($num_of_assays * $number_of_target_seqs)+($num_of_assays * $number_of_nontarget_seqs))." comparisons\n";
print "\nWriting summary file 'sorted_search_results.txt'\n";
print "\nDetails of search results and assay details are in file 'raw_search_results.txt'\n";
close OUT;
close OUT1;
exit;