# R code to compare gene libraries # # Based on Libshuff procedure # Singleton, DR, Furlong, MA, Rathburn SL and # Whitman WB (2001), Quantitative Comparisons of # 16S rRNA Gene Sequence Libraries from Environmental # Samples. # Applied and Environmental Microbiology, 67, 4374-4376. # # e.g. genelib.comp(86,34,1000,"C_T0vT5.fasta") genelib.comp <- function(no_seq,no_seqX,no_rand,datapath) { # Input total numbers of sequences and number in group X # and number of randomisations required no_seqX1 <- no_seqX+1 no_seqY <- no_seq-no_seqX # Read data in fasta format dna_dat <- scan(datapath,what="character",sep=">",quote="\n") tot_lines <- length(dna_dat) seq_lines <- tot_lines/no_seq # Concatenate rows together seq_dat <- character(no_seq) for (i in 1:no_seq){ seq_start <- (i-1)*seq_lines+1+2 seq_end <- seq_start+seq_lines-3 seq_text <- seq(seq_start,seq_end,by=1) seq_dat[i] <- paste(dna_dat[seq_text],sep="",collapse="") } # Length of sequences seq_leng <- nchar(seq_dat[1]) bases_dat <- matrix(NA,nrow=no_seq,ncol=seq_leng) for (j in 1:no_seq){ bases_dat[j,] <- unlist(strsplit(seq_dat[j],split="")) } # Distance matrix dist_mat <- matrix(NA,nrow=no_seq,ncol=no_seq) for (r in 1:no_seq){ for (c in 1:no_seq){ comp_seq <- bases_dat[c,] prime_seq <- bases_dat[r,] comp_seq[which(comp_seq=="-")] <- "+" base_match <- sum(as.numeric(prime_seq == comp_seq)) prime_seq[which(prime_seq != "-")] <- "A" comp_seq[which(comp_seq != "+")] <- "A" compseq_leng <- sum(as.numeric(prime_seq == comp_seq)) uncorrect <- 1-(base_match/compseq_leng) if (uncorrect>0.75) dist_mat[r,c] <- 1 else dist_mat[r,c] <- (-3/4)*log(1-(4/3)*uncorrect) } } # Homologous coverage curves # Homologous coverage curve Cx Xdist <- dist_mat[1:no_seqX,1:no_seqX] min_Xdist <- vector(length=no_seqX) diag(Xdist) <- Inf for (nX in 1:no_seqX){ min_Xdist[nX] <- min(Xdist[nX,]) } max_disX <- ceiling(max(min_Xdist)+0.5) Cx_hist <- hist(min_Xdist,breaks=c(seq(-0.01,0.5,by=0.01),max_disX), plot=F,right=T,include.lowest=T) Cx_stat <- cumsum(Cx_hist$counts[1:51]/no_seqX) # Homologous coverage curve Cy Ydist <- dist_mat[no_seqX1:no_seq,no_seqX1:no_seq] min_Ydist <- vector(length=no_seqY) diag(Ydist) <- Inf for (nY in 1:no_seqY){ min_Ydist[nY] <- min(Ydist[nY,]) } max_disY <- ceiling(max(min_Ydist)+0.5) Cy_hist <- hist(min_Ydist,breaks=c(seq(-0.01,0.5,by=0.01),max_disY), plot=F,right=T,include.lowest=T) Cy_stat <- cumsum(Cy_hist$counts[1:51]/no_seqY) # Heterologous coverage curve Cyx YXdist <- dist_mat[no_seqX1:no_seq,1:no_seqX] min_YXdist <- vector(length=no_seqY) diag(YXdist) <- Inf for (nY in 1:no_seqY){ min_YXdist[nY] <- min(YXdist[nY,]) } max_disYX <- ceiling(max(min_YXdist)+0.5) Cyx_hist <- hist(min_YXdist,breaks=c(seq(-0.01,0.5,by=0.01),max_disYX), plot=F,right=T,include.lowest=T) Cyx_stat <- cumsum(Cyx_hist$counts[1:51]/no_seqY) # Heterologous coverage curve Cxy XYdist <- dist_mat[1:no_seqX,no_seqX1:no_seq] min_XYdist <- vector(length=no_seqX) diag(XYdist) <- Inf for (nX in 1:no_seqX){ min_XYdist[nX] <- min(XYdist[nX,]) } max_disXY <- ceiling(max(min_XYdist)+0.5) Cxy_hist <- hist(min_XYdist,breaks=c(seq(-0.01,0.5,by=0.01),max_disXY), plot=F,right=T,include.lowest=T) Cxy_stat <- cumsum(Cxy_hist$counts[1:51]/no_seqX) # Cramer-von Mises Statistic DCxy_stat <- sum((Cx_stat-Cxy_stat)^2) DCyx_stat <- sum((Cy_stat-Cyx_stat)^2) # Random shuffling DCxy <- vector(length=no_rand) DCyx <- vector(length=no_rand) errorDCyx <- matrix(NA,nrow=no_rand,ncol=51) errorDCxy <- matrix(NA,nrow=no_rand,ncol=51) for (nran in 1:no_rand){ # Select random subsets of data rand <- sample(no_seq,no_seq, replace=F) randX <- sort(rand[1:no_seqX]) randY <- sort(rand[no_seqX1:no_seq]) # Homologous coverage curve Cx Xdist <- dist_mat[randX,randX] min_Xdist <- vector(length=no_seqX) diag(Xdist) <- Inf for (nX in 1:no_seqX){ min_Xdist[nX] <- min(Xdist[nX,]) } max_disX <- ceiling(max(min_Xdist)+0.5) Cx_hist <- hist(min_Xdist,breaks=c(seq(-0.01,0.5,by=0.01),max_disX), plot=F,right=T,include.lowest=T) Cx <- cumsum(Cx_hist$counts[1:51]/no_seqX) # Homologous coverage curve Cy Ydist <- dist_mat[randY,randY] min_Ydist <- vector(length=no_seqY) diag(Ydist) <- Inf for (nY in 1:no_seqY){ min_Ydist[nY] <- min(Ydist[nY,]) } max_disY <- ceiling(max(min_Ydist)+0.5) Cy_hist <- hist(min_Ydist,breaks=c(seq(-0.01,0.5,by=0.01),max_disY), plot=F,right=T,include.lowest=T) Cy <- cumsum(Cy_hist$counts[1:51]/no_seqY) # Heterologous coverage curve Cyx YXdist <- dist_mat[randY,randX] min_YXdist <- vector(length=no_seqY) diag(YXdist) <- Inf for (nY in 1:no_seqY){ min_YXdist[nY] <- min(YXdist[nY,]) } max_disYX <- ceiling(max(min_YXdist)+0.5) Cyx_hist <- hist(min_YXdist,breaks=c(seq(-0.01,0.5,by=0.01),max_disYX), plot=F,right=T,include.lowest=T) Cyx <- cumsum(Cyx_hist$counts[1:51]/no_seqY) # Heterologous coverage curve Cxy XYdist <- dist_mat[randX,randY] min_XYdist <- vector(length=no_seqX) diag(XYdist) <- Inf for (nX in 1:no_seqX){ min_XYdist[nX] <- min(XYdist[nX,]) } max_disXY <- ceiling(max(min_XYdist)+0.5) Cxy_hist <- hist(min_XYdist,breaks=c(seq(-0.01,0.5,by=0.01),max_disXY), plot=F,right=T,include.lowest=T) Cxy <- cumsum(Cxy_hist$counts[1:51]/no_seqX) # Data for determination of error curves errorDCxy[nran,] <- (Cx-Cxy)^2 errorDCyx[nran,] <- (Cy-Cyx)^2 # Cramer-von Mises Tests DCxy[nran] <- sum((Cx-Cxy)^2) DCyx[nran] <- sum((Cy-Cyx)^2) } # Calculate p-values DCyx_sort <- sort(DCyx,decreasing=T) DCxy_sort <- sort(DCxy,decreasing=T) pCyx <- which.min((DCyx_stat-DCyx_sort)^2)/no_rand pCxy <- which.min((DCxy_stat-DCxy_sort)^2)/no_rand # Errors (95%) for squared differences point95 <- no_rand-round((95/100)*no_rand) DCxy_95 <- DCxy_sort[point95] DCyx_95 <- DCyx_sort[point95] pos_DCxy <- match(DCxy_95,DCxy) pos_DCyx <- match(DCyx_95,DCyx) DCxy_error <- errorDCxy[pos_DCxy,] DCyx_error <- errorDCyx[pos_DCyx,] DCxy_diff <- (Cx_stat-Cxy_stat)^2 DCyx_diff <- (Cy_stat-Cyx_stat)^2 rm("errorDCxy","errorDCyx") # Plots # Cx and Cxy split.screen(c(1,2)) screen(1) par(font.sub=3,cex.sub=0.7) plot(x=seq(0,0.5,by=0.01),y=Cx_stat,xlab="Evolutionary Distance", ylab="Coverage",xlim=c(0,0.5),ylim=c(0,1),type="l",las=1) sub_labxy <- paste("(Cramer-von Mises = ",as.character(round(DCxy_stat,4)), " : p-value = ",as.character(round(pCxy,5)),")") title(main=expression(paste(C[X]," and ",C[XY])),sub=sub_labxy) points(x=seq(0,0.5,by=0.01),y=Cx_stat,pch=19,col=1) lines(x=seq(0,0.5,by=0.01),y=Cxy_stat) points(x=seq(0,0.5,by=0.01),y=Cxy_stat,pch=1,col=1) axis(4,at=c(0,0.2,0.4,0.6,0.8,1.0), las=1,labels=c("0.00","0.04","0.08","0.12","0.16","0.20"),hadj=0.25) text(0.48,0.5,expression((C[X]-C[XY])^2),srt=90) lines(x=seq(0,0.5,by=0.01),5*DCxy_diff,type="l",col=2) lines(x=seq(0,0.5,by=0.01),5*DCxy_error,type="l",lty=2,col=2) # Cy and Cyx screen(2) par(font.sub=3,cex.sub=0.7) plot(x=seq(0,0.5,by=0.01),y=Cy_stat,xlab="Evolutionary Distance", ylab="",xlim=c(0,0.5),ylim=c(0,1),type="l",las=1) sub_labyx <- paste("(Cramer-von Mises = ",as.character(round(DCyx_stat,4)), " : p-value = ",as.character(round(pCyx,5)),")") title(main=expression(paste(C[Y]," and ",C[YX])),sub=sub_labyx) points(x=seq(0,0.5,by=0.01),y=Cy_stat,pch=19,col=1) lines(x=seq(0,0.5,by=0.01),y=Cyx_stat) points(x=seq(0,0.5,by=0.01),y=Cyx_stat,pch=1,col=1) text(0.48,0.5,expression((C[Y]-C[YX])^2),srt=90) lines(x=seq(0,0.5,by=0.01),5*DCyx_diff,type="l",col=2) lines(x=seq(0,0.5,by=0.01),5*DCyx_error,type="l",lty=2,col=2) # Print output cat("Delta-Cxy = ",round(DCxy_stat,4)," : p-value = ",round(pCxy,5),"\n") cat("Delta-Cyx = ",round(DCyx_stat,4)," : p-value = ",round(pCyx,5),"\n") # Output # list(Cx=Cx_stat,Cy=Cy_stat,Cxy=Cxy_stat,Cyx=Cyx_stat,dist_mat=dist_mat) }