########################################### # A program to search through # # bam files and check the C to T # # mismatch rate in the first 3 positions # # relative to C to T generally and # # the total number of mismatches both # # in the first three positions and # # across read lengths # # Robin Allaby UoW 2017 # ########################################### $infile = $ARGV[0]; my @lines = (); # @lines=` samtools view $infile `; # for testing purposes only: open (IN, $infile) or die "Cannot open $infile\n"; @lines = ; close IN; open (TEST, ">outfile") or die "no outfile\n"; # either way, file is in the lines array my $first = 0; my $firstCT = 0; my $second = 0; my $secondCT = 0; my $third = 0; my $thirdCT = 0; my $outCT = 0; my $total = 0; my $C = 0; my $G = 0; my $T = 0; my $A = 0; my @outtots = (); $z =3; my @outcts = (); my @AT = (); my @AG = (); my @AC = (); my @TA = (); my @TG = (); my @TC = (); my @GA = (); my @GT = (); my @GC = (); my @CT = (); my @CA = (); my @CG = (); #and from the 3' end my @rAT = (); my @rAG = (); my @rAC = (); my @rTA = (); my @rTG = (); my @rTC = (); my @rGA = (); my @rGT = (); my @rGC = (); my @rCT = (); my @rCA = (); my @rCG = (); # ready all arrays while ($z < 20) {push (@outtots, 0);push (@outcts, 0); push (@AT, 0); push (@AG, 0); push (@AC, 0); push (@TA, 0); push (@TG, 0); push (@TC, 0); push (@GA, 0); push (@GT, 0); push (@GC, 0); push (@CT, 0); push (@CA, 0); push (@CG, 0); push (@rAT, 0); push (@rAG, 0); push (@rAC, 0); push (@rTA, 0); push (@rTG, 0); push (@rTC, 0); push (@rGA, 0); push (@rGT, 0); push (@rGC, 0); push (@rCT, 0); push (@rCA, 0); push (@rCG, 0); $z++;} foreach $line (@lines) { $newline = $line; $newline =~s/\t/ /g; $newline = convert_to_tab ($line); $newline = single_tab ($newline); #print TEST "$newline"; my @line = (); @line = split (/\t/, $newline); my $seq = $line[9]; my @seq = (); @seq = split ('', $seq); #my $rseq = reverse_string($seq); my @rseq = (); @rseq = split ('', $seq); foreach $base (@seq) { if ($base =~/A/){$A++;} if ($base =~/T/) {$T++;} if ($base =~/G/) {$G++;} if ($base =~/C/) {$C++;}} my $seqlength = scalar @seq; push (@seqlengths, $seqlength); foreach $bit (@line) { if ($bit =~/XM/) { $XM = $bit;} if ($bit =~/MD/) {$MD = $bit;}} #my $XM = $line[15]; #my $MD = $line[18]; #print "$seq, $XM, $MD\n"; my @MD = split (/\:/, $MD); #last section of this is alphanumeric details of mismatches my $missmatch = $MD[2]; my @missmatch = (); @missmatch = split ('', $missmatch); my $misssize = scalar @missmatch; if ($misssize >0) { # do sums because there are missmatches..... my $smm = separate_alphanumeric ($missmatch); #print "$smm\n"; my @smm = split (/\t/, $smm); foreach $bit (@smm) { $deam = 0; if ($bit =~/[0123456789]/) { #it's a number, and relates to the next letter member of the array $number = $bit; } else { # it's a letter, and we already have the associated number, so we can do our calculations $base = $bit; $total++; if ($number == 0) { $first++; $deam = check_deamination ($base, $seq, $number); $firstCT = $firstCT + $deam; } if ($number == 1) { $second++; $deam = check_deamination ($base, $seq, $number); $secondCT = $secondCT + $deam; } if ($number == 2) { $third++; $deam = check_deamination ($base, $seq, $number); $thirdCT = $thirdCT + $deam; } if ($number > 2) { $deam = check_deamination ($base, $seq, $number); $outCT = $outCT + $deam; } #if ($deam == 1) {print TEST "$line";} # to pull out only damaged reads # now add some function to record for each position along the alignment up to position 20, beginning at position 4 $deam = 0; $counter = 0; #to do the 3' end, we need to translate missmatch coordinates $rnumber = ($seqlength+1) - $number; while ($counter < 20) { $rcounter = ($seqlength+1) - $counter; if ($number == $counter) { $value = $outtots[$number]; $value++; splice (@outtots, $number, 1, $value); $deam = check_deamination ($base, $seq, $number); if ($deam == 1) { $value = $outcts[$number]; $value++; splice (@outcts, $number, 1, $value); } #update one of the 12 arrays $sbase = $seq[$number]; if ($base =~/C/) { if ($sbase =~/T/) { $value = $CT[$number]; $value++;splice (@CT, $number, 1, $value);} if ($sbase =~/A/) { $value = $CA[$number]; $value++;splice (@CA, $number, 1, $value);} if ($sbase =~/G/) { $value = $CG[$number]; $value++;splice (@CG, $number, 1, $value);} } if ($base =~/T/) { if ($sbase =~/A/) { $value = $TA[$number]; $value++;splice (@TA, $number, 1, $value);} if ($sbase =~/C/) { $value = $TC[$number]; $value++;splice (@TC, $number, 1, $value);} if ($sbase =~/G/) { $value = $TG[$number]; $value++;splice (@TG, $number, 1, $value);} } if ($base =~/G/) { if ($sbase =~/A/) { $value = $GA[$number]; $value++;splice (@GA, $number, 1, $value);} if ($sbase =~/T/) { $value = $GA[$number]; $value++;splice (@GT, $number, 1, $value);} if ($sbase =~/C/) { $value = $GA[$number]; $value++;splice (@GC, $number, 1, $value);} } if ($base =~/A/) { if ($sbase =~/T/) { $value = $AT[$number]; $value++;splice (@AT, $number, 1, $value);} if ($sbase =~/C/) { $value = $AC[$number]; $value++;splice (@AC, $number, 1, $value);} if ($sbase =~/G/) { $value = $AG[$number]; $value++;splice (@AG, $number, 1, $value);} } } if ($number == $rcounter) { $sbase = $seq[$number]; if ($base =~/C/) { if ($sbase =~/T/) { $value = $rCT[$counter]; $value++;splice (@rCT, $counter, 1, $value);} if ($sbase =~/A/) { $value = $rCA[$counter]; $value++;splice (@rCA, $counter, 1, $value);} if ($sbase =~/G/) { $value = $rCG[$counter]; $value++;splice (@rCG, $counter, 1, $value);} } if ($base =~/T/) { if ($sbase =~/A/) { $value = $rTA[$counter]; $value++;splice (@rTA, $counter, 1, $value);} if ($sbase =~/C/) { $value = $rTC[$counter]; $value++;splice (@rTC, $counter, 1, $value);} if ($sbase =~/G/) { $value = $rTG[$counter]; $value++;splice (@rTG, $counter, 1, $value);} } if ($base =~/G/) { if ($sbase =~/A/) { $value = $rGA[$counter]; $value++;splice (@rGA, $counter, 1, $value);} if ($sbase =~/T/) { $value = $rGA[$counter]; $value++;splice (@rGT, $counter, 1, $value);} if ($sbase =~/C/) { $value = $rGA[$counter]; $value++;splice (@rGC, $counter, 1, $value);} } if ($base =~/A/) { if ($sbase =~/T/) { $value = $rAT[$counter]; $value++;splice (@rAT, $counter, 1, $value);} if ($sbase =~/C/) { $value = $rAC[$counter]; $value++;splice (@rAC, $counter, 1, $value);} if ($sbase =~/G/) { $value = $rAG[$counter]; $value++;splice (@rAG, $counter, 1, $value);} } } $counter++; } } } } } #close TEST; #print "@rCT\n@CT"; print "total: $total\nfirst: $first\nsecond: $second\nthird: $third\nfirstCT: $firstCT\nsecondCT: $secondCT\nthirdCT: $thirdCT\noutCT: $outCT\n"; # some summaries # total CT changes at 5' as a proportion of the whole $seqlengths = join ("\t", @seqlengths); $seqcount = scalar @seqlengths; $mean = find_mean ($seqlengths); $total5CT = $firstCT+$secondCT+$thirdCT; $expectedCT = $outCT/($mean-3); $observed5CT = $total5CT/3; print "Expected CT per base: $expectedCT\nObservedCT at 5': $observed5CT\n"; # probability of a C $cprob = $C/($A+$T+$G+$C); $aprob = $A/($A+$T+$G+$C); $gprob = $G/($A+$T+$G+$C); $tprob = $T/($A+$T+$G+$C); # over hang percentages of C's converted to T $c_in_overhang = $seqcount*3*$cprob; $c_in_rest = $seqcount*($mean-3)*$cprob; $overhang_deam_percent = $total5CT/$c_in_overhang; $rest_deam_percent = $outCT/$c_in_rest; print "Reads: $seqcount\n%C = $cprob\nmean read length: $mean\n"; print "Overhang deamination %: $overhang_deam_percent\ninternal deamination %: $rest_deam_percent\n"; print "deaminations: @outcts\ntotals: @outtots\n"; $cpersite = $seqcount*$cprob; $apersite = $seqcount*$aprob; $gpersite = $seqcount*$gprob; $tpersite = $seqcount*$tprob; $z = 0; print TEST "5'\t\t\t\t\t\t\t\t\t\t\t\t\t3'\nCT\tCA\tCG\tAT\tAC\tAG\tTA\tTC\tTG\tGA\tGT\tGC\t\tCT\tCA\tCG\tAT\tAC\tAG\tTA\tTC\tTG\tGA\tGT\tGC\n"; foreach $bit (@outtots) { $bot = $outcts[$z]; $other = $bit-$bot; $ctpercent = $bot/$cpersite; $otherpercent = $other/$seqcount; $ct = $CT[$z]; $ca = $CA[$z]; $cg = $CG[$z]; $ctper = $ct/$cpersite; $caper = $ca/$cpersite; $cgper = $cg/$cpersite; $at = $AT[$z]; $ac = $AC[$z]; $ag = $AG[$z]; $atper = $at/$apersite; $acper = $ac/$apersite; $agper = $ag/$apersite; $ta = $TA[$z]; $tc = $TC[$z]; $tg = $TG[$z]; $taper = $ta/$tpersite; $tcper = $tc/$tpersite; $tgper = $tg/$tpersite; $ga = $GA[$z]; $gt = $GT[$z]; $gc = $GC[$z]; $gaper = $ga/$gpersite; $gtper = $gt/$gpersite; $gcper = $gc/$gpersite; $rct = $rCT[$z]; $rca = $rCA[$z]; $rcg = $rCG[$z]; $rctper = $rct/$cpersite; $rcaper = $rca/$cpersite; $rcgper = $rcg/$cpersite; $rat = $rAT[$z]; $rac = $rAC[$z]; $rag = $rAG[$z]; $ratper = $rat/$apersite; $racper = $rac/$apersite; $ragper = $rag/$apersite; $rta = $rTA[$z]; $rtc = $rTC[$z]; $rtg = $rTG[$z]; $rtaper = $rta/$tpersite; $rtcper = $rtc/$tpersite; $rtgper = $rtg/$tpersite; $rga = $rGA[$z]; $rgt = $rGT[$z]; $rgc = $rGC[$z]; $rgaper = $rga/$gpersite; $rgtper = $rgt/$gpersite; $rgcper = $rgc/$gpersite; #print TEST "$ctpercent\t$otherpercent\t"; print TEST "$ctper\t$caper\t$cgper\t$atper\t$acper\t$agper\t$taper\t$tcper\t$tgper\t$gaper\t$gtper\t$gcper\t\t"; print TEST "$rctper\t$rcaper\t$rcgper\t$ratper\t$racper\t$ragper\t$rtaper\t$rtcper\t$rtgper\t$rgaper\t$rgtper\t$rgcper\n"; $z++; } close TEST; ######################### # SUBROUTINES # ######################### sub find_mean { my $list = $_[0]; my @list = (); my $x = 0; my $l = (); my $t = (); my $mean = (); @list = split (/\t/, $list); $l = scalar @list; if ($l == 0) { return $l;} foreach $t (@list) { $x = $x+$t; #print TEST "t: $t\n"; } if ($x == 0) {return $x;} $mean = $x/$l; return $mean; } sub single_tab { my $line = $_[0]; my $next = 0; do { if ($line =~/\t\t/) { $line =~s/\t\t/\t/g; } else { $next = 1; } } until ($next == 1); return $line; } sub convert_to_tab { my $line = $_[0]; $next = 0; do { if ($line =~/ /) { $line =~s/ / /g;} else {$next =1;} } until ($next=1); $line =~s/ /\t/g; return $line; } sub check_deamination { # routine to check for a C to T transition my $base = $_[0]; my $seq = $_[1]; my $number = $_[2]; my $result = 0; my @seq = split ('', $seq); my $refbase = $seq[$number]; if ($base =~/C/) { if ($refbase =~/T/) { $result = 1; } } return $result; } sub separate_alphanumeric { # splits numbers and letters into tab separation my $alphanum = $_[0]; my @alphanum = (); my @processed = (); my $bit = (); my $processed = (); @alphanum = split ('', $alphanum); my $on = 1; #always starts with a number, this is the 'number on' variable foreach $bit (@alphanum) { if ($on == 1) { if ($bit =~/[ATGCUNRYSWMKBDHV]/) { #covers IUPAC conventions # there'sbeen a change push (@processed, "\t", $bit); $on = 0; } else { push (@processed, $bit);} } else { #$done must already be 0, so we check for a change the other way if ($bit =~/[0123456789]/) { # change push (@processed, "\t", $bit); $on = 1; } else { push (@processed, $bit);} } } $processed = join ('', @processed); return $processed; }