#Set high memory limit to avoid R crashing halfway through #something important. memory.limit(1000) memory.limit() #Tell R where to find your data (ensure all files are contained #within this directory). setwd("C:/R teaching/CF data") getwd() #Rearrange data (.soft files and .annot files) into tab delimited #text format (*.txt), with column headings as the first row in Excel. #See Word document for further info. #Read in Data.txt file and Annot.txt file. Annot<-read.delim("Annot.txt") Data<-read.table("Data.txt",header=T,sep="\t",colClasses="character") #Convert data columns to numeric. for (i in 3:length(names(Data))){ Data[,i]<-as.numeric(Data[,i]) } #Check Data and Annot by printing first 10 rows. Data[1:10,] Annot[1:10,] #Calculate number of genes (i.e. the number of rows). NoGenes<-dim(Data)[1] NoGenes #Calculate number of chips (i.e. the number of columns - bear in mind there #are two annotation columns in the Data file). NoChips<-(dim(Data)[2])-2 NoChips #Create annotated gene tables by combining the annotation file with #the data files. AnnData<-cbind(Annot,Data) #Check annotated data structure by printing first 10 rows. AnnData[1:10,] #Find which columns contain data. DatCols<-names(AnnData[,17:length(AnnData)]) #Check for any NULL values in the data and replace with the value '0'. for (i in 1:length(DatCols)){ null<-which(AnnData$DatCols[i]=="NULL") AnnData$DatCols[null]<-0 } AnnData<-AnnData[,1:(length(AnnData)-1)] #Check for null values (should return "numeric(0)" for all columns). for (i in 1:length(DatCols)){ null<-which(AnnData$DatCols[i]=="NULL") print(null) } #Remove AFFY control genes (handily labelled as "CONTROL" in the #"Platform_SPOTID" column of AnnData. NonContGenes<-which(AnnData$Platform_SPOTID!="CONTROL") QCData<-AnnData[NonContGenes,] #Check data before normalisation by looking at various plots. Only use #columns with numerical data (adjust as needed). boxplot(QCData[,17:length(AnnData)],log="y",col=c(2:8)) LogData<-log(QCData[,17:length(AnnData)]) par(mfrow=c(4,4)) for (i in 1:length(LogData)){ label<-names(LogData)[i] hist(LogData[,i],angle=45,main=paste("Histogram of ", label)) } #Normalize data. Normalization is generally done per chip, and then per #gene. This allows comparison of different gene expressions over different #chips. The simplest way is to normalize by the mean/median of the chip/gene. #There are much better normalization methods available, and you #may wish to think of better methods for your project. Be aware that some #later analysis techniques require using non-normalized data. #MEDIAN - Cross Chip ChpNormCol<-array(0, c(NoGenes,NoChips)) for (i in 1:NoChips){ ChpMed<-median(AnnData[[DatCols[i]]],na.rm=TRUE) ChpNormCol[,i]<-AnnData[[DatCols[i]]]/ChpMed } #Some analyses require using per chip normalization only. ChpNorm<-cbind(AnnData[,1:16],ChpNormCol) #MEDIAN - Cross Gene GeneNormCol<-array(0, c(NoGenes,NoChips)) for (i in 1:NoGenes){ GeneMed<-median(ChpNormCol[i,1:NoChips]) GeneNormCol[i,]<-ChpNormCol[i,1:NoChips]/GeneMed } NormData<-cbind(AnnData[,1:16],GeneNormCol)