############################ # simulate serial founders # # and track diversity # # Robin Allaby UoW 2018 # ############################ ################################### # v2 update take into account # # mating strategy and incremental # # change over time # ################################### ##################################### # v4 update select individuals from # # the population, generate gametes # # to make next generation. # ##################################### ###################################### # v5 update introduce a mutation rate# # so allele frequency array will # # expand during the course of the # # simulation # ###################################### ###################################### # v6 update introduce a user defined # # starting heterozygosity to reflect # # reality better # ###################################### $N = $ARGV[0]; #population size $Nb = $ARGV[1]; # founders $a = $ARGV[2]; # no. alleles $f = $ARGV[3]; #no. founder events $m = $ARGV[4]; # model of gene frequency generation 0, uniform, x, average frequency of most common allele $ms = $ARGV[5]; # mating strategy $deltams = $ARGV[6]; # rate of change of mating strategy $mu = $ARGV[7]; $fast = $ARGV[8]; #fast option (1) continually cleans out allele array losing lost lineages, and losing tracking allele histories $starthet = $ARGV[9]; $startms = $ARGV[10]; # which mating system to use to calculate the starting frequencies open (OUT, ">founder.out") or die "Dead\n"; open (TEST, ">testfile") or die "No testfile\n"; #initialize by attributing allele frequencies - drawn from a uniform distribution @allele_freqs = (); @raw_freqs = (); $x = 0; while ($x < $a) { $allele = int rand(1000); $allele = ($allele+1)/1000; push (@raw_freqs, $allele); $x++; } #print "m: $m\n"; #print "raw freqs before: @raw_freqs\n"; #if ($m > 0) { # #amplify the first frequency to reflect the input frequency average # $value = $raw_freqs[0]; # $factor = $m/(1-$m); # $value = $m*$factor; # assumes equable frequencies (uniform), amplified by a factor that would boost # # the equable frequency up to the $m frequency # splice (@raw_freqs, 0, 1, $value); #} #print "after: @raw_freqs\n"; exit; if ($m == 1) { } $total = 0; foreach $raw (@raw_freqs) { $total = $total + $raw;} foreach $raw (@raw_freqs) { $freq = $raw/$total; push (@allele_freqs, $freq); } $F = estimate_inbreeding_coefficient ($ms); print TEST "F: $F\t"; print "allele frequencies: @allele_freqs\n"; # work out diversity $allele_freqs = join ("\t", @allele_freqs); if ($m == 1) { print "searching for allele frequencies....\n"; $startF = estimate_inbreeding_coefficient($startms); $next = 0; $het = calculate_heterozygosity ($allele_freqs, $startF); if ($het > $starthet) { $gounder = 1;} else {$gounder = 0;} @old_allele_freqs = (); @old_allele_freqs = @allele_freqs; do { # randomly take 0.01 off one frequency and add to another $loser = rand ($a); $winner = rand ($a); $losingvalue = $allele_freqs[$loser]; $losingvalue = $losingvalue - 0.001; if ($losingvalue < 0) {$losingvalue = 0;} $winningvalue = $allele_freqs[$winner]; $winningvalue = $winningvalue + 0.001; splice (@allele_freqs, $loser,1,$losingvalue); splice (@allele_freqs, $winner,1,$winningvalue); $raw_freqs = join ("\t", @allele_freqs); $allele_freqs = convert_raw_to_pdf ($raw_freqs); # possible maniuplations above can result in frequencies no longer summing to 1 @allele_freqs = (); @allele_freqs = split (/\t/, $allele_freqs); $newhet = calculate_heterozygosity ($allele_freqs, $startF); $accept = 0; if ($newhet < $het) { if ($gounder == 1) {$accept = 1;} else {$accept = 0;} } else { if ($gounder == 1) {$accept = 0;} else {$accept = 1;}} if ($accept == 1) { @old_allele_freqs = (); @old_allele_freqs = @allele_freqs; $het = $newhet;} else {@allele_freqs = (); @allele_freqs = @old_allele_freqs;} if ($gounder == 1) { if ($het < $starthet) {$next = 1;}} else {if ($het>$starthet) {$next = 1;}} print "."; } until ($next == 1); # take out your dead.... $allele_freqs = clean_out_the_dead ($allele_freqs); @allele_freqs = (); @allele_freqs = split (/\t/, $allele_freqs); $a = scalar (@allele_freqs); } print "allele frequncies: @allele_freqs\n"; print "het: $het\n"; #exit; #$het = calculate_heterozygosity ($allele_freqs); #print "heterozygosity: $het\n"; exit; $g = 0; #draw out N individuals from infinite population derive new frequencies $n = 0; @new_raw = (); #@het_array = (); foreach $al (@allele_freqs) {push (@new_raw, 0);} $allele_freqs = join ("\t", @allele_freqs); print TEST "allele freqs: @allele_freqs\n"; # to check my probabilities here $flop = 0; foreach $e (@allele_freqs) { $hetslist = one_times_all ($allele_freqs, $flop); $hetstot = add_up_array($hetslist); $hetsprop = ($hetstot-($F*$hetstot)); $phom = ($e*$e) + ($F*$hetstot); $conditionalp = $phom/($phom+$hetsprop); print TEST "allele $e phom: $phom\t conditional: $conditional\thetstotal: $hetstot\thetsprop: $hetsprop\n"; $flop++; } $genotype_matrix4 = make_genotype_matrix ($allele_freqs, $F); $alleleno = scalar @allele_freqs; $allelehi = $alleleno - 1; print TEST "genotype matrix: $genotype_matrix4\n"; while ($n < $N) { $ind1 = draw_individual2 ($genotype_matrix4, $a); #print TEST "ind1:\t$ind1\n"; $seed = generate_seed (1000000000000); if ($seed > $ms) { $ind2 = draw_individual2 ($genotype_matrix4, $a); #print TEST "$ind1\n"; } else { $ind2 = $ind1;} $prog = mate_individuals ($ind1, $ind2); @prog = (); @prog = split (/\t/, $prog); $ad = $prog[0]; $sad = $prog[1]; # opportunity for mutation @newprog = (); foreach $allele (@prog) { $check = check_mutation($mu); if ($check ==1) { # we add another allele to the list $allelehi++; #$a++; push (@newprog, $allelehi); push (@new_raw, 0); } else { push (@newprog, $allele); } } @prog = (); @prog = @newprog; $ad = $prog[0]; $sad = $prog[1]; # print TEST "$ad\t$sad\n"; #@ind = (); #@ind = split (/\t/, $ind1); #$ad = $ind[0]; $sad = $ind[1]; $curr = $new_raw[$ad]; $curr++; splice (@new_raw, $ad, 1, $curr); $curr = $new_raw[$sad]; $curr++; splice (@new_raw, $sad, 1, $curr); $n++; } $new_raw = join ("\t", @new_raw); if ($fast == 1) { $new_raw = clean_out_the_dead($new_raw);} # this will make it run faster by losing dead lineages, but lose allele tracking $new_freqs = convert_raw_to_pdf ($new_raw); #@new_freqs = split (\t\, $new_freqs); print TEST "new freqs: $new_freqs\n"; $het = calculate_heterozygosity ($new_freqs, $F); print OUT "$het\n"; while ($g < $f) { # begin with founder if ($g == 0) {print TEST "$g\t$new_freqs\n";} #print TEST "go in :\t$new_freqs\n"; $new_raw = sample_N_from_pdf ($Nb, $new_freqs, $F, $ms, $mu); if ($fast == 1) { $new_raw = clean_out_the_dead($new_raw);} # this will make it run faster by losing dead lineages, but lose allele tracking $new_freqs = convert_raw_to_pdf ($new_raw); print TEST "$g\tnew freqs\t$new_freqs\tnew raw:\t$new_raw\n"; #expand to N #exit; #print TEST "go in :\t$new_freqs\n"; $new_raw = sample_N_from_pdf ($N, $new_freqs, $F, $ms, $mu); if ($fast == 1) { $new_raw = clean_out_the_dead($new_raw);} # this will make it run faster by losing dead lineages, but lose allele tracking $new_freqs = convert_raw_to_pdf ($new_raw); #print TEST "$g\t$new_freqs\n"; print TEST "\tnew freqs:\t$new_freqs\tnew raw:\t$new_raw\n"; $het = calculate_heterozygosity ($new_freqs, $F); print OUT "$het\n"; $g++; print "$g\n"; $ms = $ms + $deltams; unless ($deltams == 0) {$F = estimate_inbreeding_coefficient($ms);} } close OUT; close TEST; ######################### # SUBROUTINES # ######################### sub clean_out_the_dead { $allele_freqs5 = $_[0]; # remove all zero values from the array to stop inordinate expansion my @allele_freqs5 = (); my $frequency1 = (); my @allele_freqs6 = (); my $allele_freqs6 = (); @allele_freqs5 = split (/\t/, $allele_freqs5); foreach $frequency1 (@allele_freqs5) { unless ($frequency1 == 0) { push (@allele_freqs6, $frequency1);} } $allele_freqs6 = join ("\t", @allele_freqs6); return $allele_freqs6; } sub check_mutation { my $murate = $_[0]; my $seed7 = (); my $mutate = 0; my $seed7 = generate_seed (1000000000000000000000); if ($seed7 < $murate) {$mutate = 1;} return $mutate; } sub mate_individuals { my $ind11 = $_[0]; my $ind12 = $_[1]; my @prog1 = (); my @ind11 = (); my @ind12 = (); my $seed2 = (); my $gam1 = (); my $gam2 = (); my $prog1 = (); @ind11 = split (/\t/, $ind11); @ind12 = split (/\t/, $ind12); #print TEST "ind1:$ind11[0]\t$ind11[1]\tind2:$ind12[0]\t$ind12[1]\n"; $seed2 = generate_seed (10000000000000); if ($seed2 > 0.5) { $gam1 = $ind11[1];} else {$gam1 = $ind11[0];} $seed2 = generate_seed (10000000000000); if ($seed2 > 0.5) { $gam2 = $ind12[1];} else {$gam2 = $ind12[0];} push (@prog1, $gam1, $gam2); $prog1 = join ("\t", @prog1); return $prog1; } sub make_genotype_matrix { my $allele_freqs4 = $_[0]; my $F = $_[1]; # new method - make matrix of genotypes expected and draw from that (no biased gametes!) my @genotype_matrix = (); my $x2= (); my $allele3 = (); my $hetstot2 = (); my $adjusted_hettot = (); my $homo = (); my $x3 = (); my $allele4 = (); my $het2 = (); my $genotype_matrix = (); my @allele_freqs4 = (); my $hetstot2list = (); @allele_freqs4 = split (/\t/, $allele_freqs4); #print TEST "frequencies going in: $allele_freqs4\n"; $x2 = 0; foreach $allele3 (@allele_freqs4) { $hetstot2list = one_times_all ($allele_freqs4, $x2); $hetstot2 = add_up_array ($hetstot2list); $adjusted_hettot = $hetstot2*$F; #print TEST "hetstot2:\t$hetstot2\tadjusted:\t$adjusted_hettot\n"; $homo = ($allele3*$allele3) + $adjusted_hettot; #print TEST "homo:\t$homo\n"; $x3 = 0; foreach $allele4 (@allele_freqs4) { if ($x3 == $x2) { push (@genotype_matrix, $homo); } else { $het2 = ($allele4*$allele3) - ($F*($allele4*$allele3)); push (@genotype_matrix, $het2); } $x3++; } $x2++; } $genotype_matrix = join ("\t", @genotype_matrix); #print TEST "genotype_matrix: $genotype_matrix\n"; return $genotype_matrix; } sub draw_individual2 { my $genotype_matrix2 = $_[0]; my $a3 = $_[1]; $seed6 = generate_seed (10000000000000); $ad6 = sample_from_distribution ($genotype_matrix2, $seed6); #print TEST "ad6: $ad6\n"; $ad6++; # have to switch from 0 counting to 1 counting $genotype2 = retrieve_genotype ($ad6, $a3); return $genotype2; } sub retrieve_genotype { my $ad7 = $_[0]; my $a4 = $_[1]; my @genotype_matrix = (); my $first = (); my $second = (); my @genotype3 = (); my $genotype3 = (); my $secondint = (); my $secondfrac = (); my $firstfrac = (); my $whole = (); my $number1 = (); my @number1 =(); my $int = (); my $frac = (); @genotype_matrix = split (/\t/, $genotype_matrix); $whole = $ad7/$a4; #print "whole: $whole\n"; $number1 = break_up_number ($whole); @number1 = split (/\t/, $number1); $int = $number1[0]; $frac = $number1[1]; #if ($whole>$int) { $int++;} #$seconda = $secondint-1; $firstfrac = ($frac*$a4); if ($firstfrac == 0) {$first = ($a4-1);} else {$first = $firstfrac - 1;} if ($frac == 0) {$int--;} #print "first:\t$first\nsecond:\t$int\n"; #if ($first < 1) {$first = 0; #print TEST "first corrected to $first"; #} #print TEST "\n"; push (@genotype3, $first, $int); $genotype3 = join ("\t", @genotype3); return $genotype3; } sub break_up_number { #my alternative to the int function which causes black magic weirdness my $number = $_[0]; my @number = (); my $next = 0; my $reduced = $number; my $x7 = 0; my $integer = (); my $fraction = (); my $broken_number = (); do { $reduced = $reduced - 1; if ($reduced < 0) {$next = 1;} $x7++; } until ($next==1); #unless ($reduced == 0) {$reduced = $reduced +1;} $integer = $x7-1; $fraction = $number-$integer; push (@number, $integer, $fraction); $broken_number = join ("\t", @number); #print "broken number: integer($integer), fraction($fraction)\n"; return $broken_number; } sub draw_individual { my $allele_freqs1 = $_[0]; my $F = $_[1]; my $seed1 = (); my $ad1 = (); my $hetslist1 = (); my $hetstot1 = (); my $allele1 = (); my $hetsprop1 = (); my $homprop1 = (); my $phom1 = (); my $sad1 = (); my $next1 = (); my $tries1 = (); my @genotype1 = (); my $genotype1 = (); print TEST "drawing individual from:\t$allele_freqs1\t"; $seed1 = generate_seed (10000000000000); $ad1 = sample_from_distribution ($allele_freqs1, $seed1); print TEST "ad: $ad1\t"; $hetslist1 = one_times_all ($allele_freqs1, $ad1); $hetstot1 = add_up_array ($hetslist1); $allele1 = $allele_freqs1[$ad1]; $hetsprop1 = ($hetstot1-($F*$hetstot1)); #conditional hetsprop - only half of HWE because we have first allele #remaining frequency must be made up of homs $homprop1 = ($allele1*$allele1) + ($F*$hetstot1); # this is the probability if drawing from start - we already know if ($homprop1 == 0) { $phom1 = 0; #print TEST "homprop failure\n"; } else { $phom1 = $homprop1/($homprop1+$hetsprop1); # prob of homology given the first allele.... } $seed1 = generate_seed(1000000); if ($seed1 < $phom1) { $sad1 = $ad1; } else { $next1 = 0; $tries1 = 0; do { $seed1 = generate_seed (1000000); $sad1 = sample_from_distribution ($allele_freqs1, $seed1); unless ($sad1 == $ad1) {$next1 = 1;} $tries1++; if ($tries1 == 1000) { print TEST "hit 1000\t"; $next1 = 1;} } until ($next1 == 1); } push (@genotype1, $ad1, $sad1); $genotype1 = join ("\t", @genotype1); print TEST "genotype: $genotype1\n"; return $genotype1; } sub estimate_inbreeding_coefficient { my $ms = $_[0]; my $p = 0.5; my $q = 0.5; #start off with Hardy-Weinberg proportions, and let the frequencies settle out my $AA = $p**2; my $Aa = 2*$p*$q; my $aa = $q**2; my $next = 0; my $oldF = 0; my $Ps = (); my $Rs = (); my $Qs = (); my $Pc = (); my $Rc = (); my $Qc = (); my $P = (); my $R = (); my $Q = (); my $newP = (); my $newR = (); my $newQ = (); my $F = (); my $df = (); print "Estimating F:\n"; do { $Ps = $ms*($AA + 0.25*$Aa); $Rs = $ms*($aa + 0.25*$Aa); $Qs = $ms*(0.5*$Aa); $Pc = (1-$ms)*($AA**2 + 2*0.5*$AA*$Aa + 0.25*$Aa*$Aa); $Rc = (1-$ms)*($aa**2 + 2*0.5*$aa*$Aa + 0.25*$Aa*$Aa); $Qc = (1-$ms)*(2*$AA*$aa + 0.5*$Aa*$Aa + 2*0.5*$AA*$Aa + 2*0.5*$aa*$Aa); $P = $Pc + $Ps; $R = $Rc + $Rs; $Q = $Qc + $Qs; # new proportions $newP = ($P)/($P + $Q + $R); $newQ = $Q/($P + $Q + $R); $newR = $R/($P + $Q + $R); $newq = ($Q/2 + $R)/($P + $Q + $R); $newp = 1 - $newq; #calculate F on the fly $F = ($newR - $newq**2)/($newq*$newp); #print "$F\n"; if ($F > $oldF) {$dF = $F- $oldF;} else {$dF = $oldF - $F;} if ($dF < 0.0000000001) { $next = 1;} if ($dF == 0) {$next = 1;} $oldF = $F; # reset for next gen $AA = $newP; $Aa = $newQ; $aa = $newR; } until ($next == 1); return $F; } sub sample_N_from_pdf { my $N = $_[0]; my $pdf1 = $_[1]; my $F = $_[2]; my $ms = $_[3]; my $mu = $_[4]; my @pdf1 = (); my $allele2 = (); @pdf1 = split (/\t/, $pdf1); my @new_raw1 = (); my $a2 = (); my $n2 = 0; my $seed3 = (); my $curr1 = (); my $ad2 = (); my $sad2 = (); my $ind21 = (); my $ind22 = (); my $prog2 = (); my @prog2 = (); my $scurr1 = (); my $new_raw1 = (); #my $cr1 = (); my $a6 = (); my @newprog2 = (); my $check1 = (); $a6 = scalar @pdf1; foreach $a2 (@pdf1) {push (@new_raw1, 0);} # print TEST "pdf is :\t$pdf1\n"; $genotype_matrix3 = make_genotype_matrix ($pdf1, $F); #print "genotype matrix: $genotype_matrix3, a6: $a6\n"; $totsall = scalar @pdf1; $allelehi2 = $totsall - 1; while ($n2 < $N) { $ind21 = draw_individual2 ($genotype_matrix3, $a6); $seed3 = generate_seed (1000000000000); if ($seed3 > $ms) { $ind22 = draw_individual2 ($genotype_matrix3, $a6); # print TEST "0\t"; } else { # print TEST "1\t"; $ind22 = $ind21;} $prog2 = mate_individuals ($ind21, $ind22); @prog2 = (); @prog2 = split (/\t/, $prog2); $ad2 = $prog2[0]; $sad2 = $prog2[1]; # opportunity for mutation @newprog2 = (); foreach $allele2 (@prog2) { $check1 = check_mutation($mu); if ($check1 ==1) { # we add another allele to the list $allelehi2++; #$a++; push (@newprog2, $allelehi2); push (@new_raw1, 0); } else { push (@newprog2, $allele2); } } $check1 = 0; @prog2 = (); @prog2 = @newprog2; $ad2 = $prog2[0]; $sad2 = $prog2[1]; #print TEST "$ad2\t$sad2\n"; #my @ind = (); #@ind = split (/\t/, $ind1); #$ad = $ind[0]; $sad = $ind[1]; $curr1 = $new_raw1[$ad2]; $curr1++; splice (@new_raw1, $ad2, 1, $curr1); $scurr1 = $new_raw1[$sad2]; $scurr1++; splice (@new_raw1, $sad2, 1, $scurr1); $n2++; } $new_raw1 = join ("\t", @new_raw1); #print TEST "new raw is:$new_raw1\n"; # exit; return $new_raw1; } sub convert_raw_to_pdf { my $list1 = $_[0]; my @list1 = (); my @list1 = split (/\t/, $list1); my $total1 = add_up_array ($list1); my $bit1 = (); my $val1 = (); my @pdf2 = (); my $pdf2 = (); foreach $bit1 (@list1) { $val1 = $bit1/$total1; push (@pdf2, $val1); } $pdf2 = join ("\t", @pdf2); return $pdf2; } sub sample_from_distribution { my $pdf3 = $_[0]; # print TEST "pdf3:\t$pdf3\n"; my $seed4 = $_[1]; # print TEST "seed4:\t$seed4\t"; my @pdf3 = (); @pdf3 = split (/\t/, $pdf3); my $c1 = 0; my $val = (); my $cum = 0; my $found = 0; my $ad4 = (); foreach $val (@pdf3) { $cum = $cum + $val; if ($seed4 < $cum) { if ($found == 0) { $ad4 = $c1; $found = 1; } } $c1++; } if ($pdf3[$ad4] == 0) {print TEST "fault:\npdf:$pdf3\nseed:$seed4\nad:$ad4\n";exit;} #print TEST "chose $ad4\n"; return $ad4; } sub generate_seed { my $n = $_[0]; my $seed5 = int rand ($n); $seed5 = (($seed5+0.00000000000001)/($n+0.00000000000001)); return $seed5; } sub add_up_array { my $list = $_[0]; my @list = split (/\t/, $list); my $total2 = 0; my $bit = (); foreach $bit (@list) {$total2 = $total2 + $bit;} return $total2; } sub one_times_all { my $allele_freqs2 = $_[0]; my $ad5 = $_[1]; my @allele_freqs2 = (); my @binaries = (); @allele_freqs2 = split (/\t/, $allele_freqs2); my $x1 = 0; my $all = (); my $bin = (); foreach $all (@allele_freqs2) { unless ($x1 == $ad5) { $bin = $all*$allele_freqs2[$ad5]; push (@binaries, $bin); } $x1++; } $binaries = join ("\t", @binaries); return $binaries; } sub calculate_heterozygosity { my $allele_freqs3 = $_[0]; my $F = $_[1]; my @allele_freqs3 = (); my $phom2 = (); my @homozygosities = (); my $total_hom = (); my $het1 = (); my $homozygosities = (); @allele_freqs3 = split (/\t/, $allele_freqs3); #calculate all homozygosity probabilities my $c = 0; foreach $allele (@allele_freqs3) { # print "$allele\n"; $hetslist = one_times_all ($allele_freqs3, $c); $hetstot = add_up_array ($hetslist); $phom2 = ($allele*$allele) + ($F*$hetstot); push (@homozygosities, $phom2); $c++; } $homozygosities = join ("\t", @homozygosities); $total_hom = add_up_array ($homozygosities); #print "homozygosities: $homozygosities\n"; #print "total hom: $total_hom\n"; $het1 = 1 - $total_hom; return $het1; }