#Set high memory limit to avoid R crashing halfway through
#something important.

memory.limit(1000)
memory.limit()

#Tell R where to find your data (ensure all files are contained 
#within this directory).

setwd("C:/R teaching/CF data")
getwd()

#Rearrange data (.soft files and .annot files) into tab delimited 
#text format (*.txt), with column headings as the first row in Excel. 
#See Word document for further info.

#Read in Data.txt file and Annot.txt file. 

Annot<-read.delim("Annot.txt")
Data<-read.table("Data.txt",header=T,sep="\t",colClasses="character")

#Convert data columns to numeric.

for (i in 3:length(names(Data))){
Data[,i]<-as.numeric(Data[,i])
}

#Check Data and Annot by printing first 10 rows.

Data[1:10,]
Annot[1:10,]

#Calculate number of genes (i.e. the number of rows).

NoGenes<-dim(Data)[1]
NoGenes

#Calculate number of chips (i.e. the number of columns - bear in mind there
#are two annotation columns in the Data file).

NoChips<-(dim(Data)[2])-2
NoChips

#Create annotated gene tables by combining the annotation file with
#the data files.

AnnData<-cbind(Annot,Data)

#Check annotated data structure by printing first 10 rows.

AnnData[1:10,]

#Find which columns contain data.

DatCols<-names(AnnData[,17:length(AnnData)])

#Check for any NULL values in the data and replace with the value '0'. 

for (i in 1:length(DatCols)){
null<-which(AnnData$DatCols[i]=="NULL")
AnnData$DatCols[null]<-0
}
AnnData<-AnnData[,1:(length(AnnData)-1)]

#Check for null values (should return "numeric(0)" for all columns).

for (i in 1:length(DatCols)){
null<-which(AnnData$DatCols[i]=="NULL")
print(null)
}

#Remove AFFY control genes (handily labelled as "CONTROL" in the 
#"Platform_SPOTID" column of AnnData.

NonContGenes<-which(AnnData$Platform_SPOTID!="CONTROL")
QCData<-AnnData[NonContGenes,]

#Check data before normalisation by looking at various plots. Only use 
#columns with numerical data (adjust as needed).

boxplot(QCData[,17:length(AnnData)],log="y",col=c(2:8))

LogData<-log(QCData[,17:length(AnnData)])
par(mfrow=c(4,4))
for (i in 1:length(LogData)){
label<-names(LogData)[i]
hist(LogData[,i],angle=45,main=paste("Histogram of ", label))
}

#Normalize data. Normalization is generally done per chip, and then per 
#gene. This allows comparison of different gene expressions over different
#chips. The simplest way is to normalize by the mean/median of the chip/gene.
#There are much better normalization methods available, and you
#may wish to think of better methods for your project. Be aware that some
#later analysis techniques require using non-normalized data.

#MEDIAN - Cross Chip
ChpNormCol<-array(0, c(NoGenes,NoChips))
for (i in 1:NoChips){
ChpMed<-median(AnnData[[DatCols[i]]],na.rm=TRUE)
ChpNormCol[,i]<-AnnData[[DatCols[i]]]/ChpMed
}

#Some analyses require using per chip normalization only.
ChpNorm<-cbind(AnnData[,1:16],ChpNormCol)

#MEDIAN - Cross Gene
GeneNormCol<-array(0, c(NoGenes,NoChips))
for (i in 1:NoGenes){
GeneMed<-median(ChpNormCol[i,1:NoChips])
GeneNormCol[i,]<-ChpNormCol[i,1:NoChips]/GeneMed
}

NormData<-cbind(AnnData[,1:16],GeneNormCol)