# Artificial neural network
#
# To implement (see the sketches at the end of the file):
###   Momentum
###   Replace the sigmoid function with the "positive-part" function (Rectified Linear units)
###   Dropout

load("mnist.small.RData")   # assumed to provide train.X, train.Y, test.X, test.Y and nClasses
train.X <- train.X/255      # rescale pixel intensities to [0,1]
test.X  <- test.X/255

dimensions <- c(dim(train.X)[2],
                100,        # 3 hidden layers of 100 units each
                100,
                100,
                nClasses)

initial.matrix <- function(i,j) matrix(runif(i*j,-0.1,0.1),i,j)
W12 <- initial.matrix(dimensions[1],dimensions[2])
W23 <- initial.matrix(dimensions[2],dimensions[3])
W34 <- initial.matrix(dimensions[3],dimensions[4])
W45 <- initial.matrix(dimensions[4],dimensions[5])
B2 <- matrix(0,1,dimensions[2])
B3 <- matrix(0,1,dimensions[3])
B4 <- matrix(0,1,dimensions[4])
B5 <- matrix(0,1,dimensions[5])

## Feed-forward functions
sigmoid <- function(x) {
  1/(1+exp(-x))
}
softmax <- function(x) {
  x <- x - apply(x,1,max)                 # subtract row maxima (better numerically)
  x <- (x)*(x > -30) + (-30)*(x <= -30)   # clamp at -30 so exp() cannot underflow an entire row to zero
  x <- exp(x)
  x <- x/apply(x,1,sum)
  return(x)
}

## Nonlinearity is either sigmoid or softmax
forward <- function(X,W,B,Nonlinearity)
  Nonlinearity( X%*%W + B[rep(1,dim(X)[1]),] )

## Backprop functions
## Apply the chain rule
dIdentity <- function(x) 1
dSigmoid  <- function(y) y*(1-y)          # derivative expressed in terms of the layer output y
back.propagate <- function(X.down,X.up,deltaX.up,W,dNonlinearity) {
  ## X.up = Nonlinearity(A), A = X.down %*% W + B
  deltaA <- deltaX.up * dNonlinearity(X.up)
  deltaW <- t(X.down) %*% deltaA
  deltaB <- apply(deltaA,2,sum)
  deltaX.down <- deltaA %*% t(W)
  return(list(B=deltaB,W=deltaW,X=deltaX.down))
}

## Classification error in percent: a row is wrong if its one-hot target Y
## disagrees with the indicator of the row maximum of X (each wrong row
## contributes two mismatches, hence the division by 2).
errors <- function(X,Y)
  return(100*sum(Y!=(X==apply(X,1,max)))/(2*dim(X)[1]))

batch.size <- 100
learning.rate <- 0.1
print("epochs, train%, test%")
for (i in 1:10000000) {
  train.errors <- c()
  for (ii in 1:(dim(train.X)[1]/batch.size)) {
    learning.rate <- learning.rate * 0.999995   # slowly decay the learning rate
    indices <- sample(dim(train.X)[1], batch.size)
    Y  <- train.Y[indices,]
    X1 <- train.X[indices,]
    X2 <- forward(X1, W12, B2, sigmoid)
    X3 <- forward(X2, W23, B3, sigmoid)
    X4 <- forward(X3, W34, B4, sigmoid)
    X5 <- forward(X4, W45, B5, softmax)
    train.errors <- c(train.errors, errors(X5,Y))
    ## Softmax with cross-entropy loss: the gradient w.r.t. the pre-softmax
    ## activations is simply (X5 - Y), so the output layer uses dIdentity.
    delta.A5 <- (X5-Y)/batch.size
    delta4 <- back.propagate(X4,X5,delta.A5,W45,dIdentity)
    delta3 <- back.propagate(X3,X4,delta4$X,W34,dSigmoid)
    delta2 <- back.propagate(X2,X3,delta3$X,W23,dSigmoid)
    delta1 <- back.propagate(X1,X2,delta2$X,W12,dSigmoid)
    W12 <- W12 - learning.rate * delta1$W
    W23 <- W23 - learning.rate * delta2$W
    W34 <- W34 - learning.rate * delta3$W
    W45 <- W45 - learning.rate * delta4$W
    B2 <- B2 - learning.rate * delta1$B
    B3 <- B3 - learning.rate * delta2$B
    B4 <- B4 - learning.rate * delta3$B
    B5 <- B5 - learning.rate * delta4$B
  }
  ## Evaluate on the test set after each pass over the training data
  X1 <- test.X
  X2 <- forward(X1, W12, B2, sigmoid)
  X3 <- forward(X2, W23, B3, sigmoid)
  X4 <- forward(X3, W34, B4, sigmoid)
  X5 <- forward(X4, W45, B5, softmax)
  test.error <- errors(X5, test.Y)
  print(c(i, mean(train.errors), test.error))
}
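
## --- Momentum (sketch) ----------------------------------------------------
## A minimal sketch of classical momentum for the parameter updates above.
## The velocity matrices V.* and the coefficient `momentum` are new names
## introduced here, not part of the original script. Each parameter keeps a
## velocity that accumulates a decaying sum of past gradients, and the
## parameter is moved by the velocity instead of the raw gradient.
momentum <- 0.9
V.W12 <- 0*W12; V.W23 <- 0*W23; V.W34 <- 0*W34; V.W45 <- 0*W45
V.B2  <- 0*B2;  V.B3  <- 0*B3;  V.B4  <- 0*B4;  V.B5  <- 0*B5

## Inside the mini-batch loop, the plain gradient step would be replaced by,
## for example:
##   V.W12 <- momentum * V.W12 - learning.rate * delta1$W
##   W12   <- W12 + V.W12
## and analogously for W23, W34, W45 and the biases B2..B5.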
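
## --- Rectified linear units (sketch) ---------------------------------------
## A minimal sketch of the "positive-part" nonlinearity and its derivative,
## to be swapped in for sigmoid/dSigmoid in the hidden layers; `relu` and
## `dRelu` are names introduced here. Like dSigmoid above, dRelu is written
## in terms of the layer *output* y: the derivative is 1 where the output is
## positive and 0 elsewhere.
relu  <- function(x) x * (x > 0)
dRelu <- function(y) 1 * (y > 0)

## Usage in the loops above, e.g.:
##   X2     <- forward(X1, W12, B2, relu)
##   delta1 <- back.propagate(X1, X2, delta2$X, W12, dRelu)
## With ReLU units a smaller learning rate or smaller initial weights may be
## needed to keep the hidden activations from growing too large.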
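
## --- Dropout (sketch) -------------------------------------------------------
## A minimal sketch of dropout on the hidden activations; `dropout.rate` and
## the helper `dropout` are new names introduced here, not part of the
## original script. During training each hidden unit is zeroed independently
## with probability dropout.rate. Because dSigmoid(0) = 0, the existing
## back.propagate already sends no gradient through dropped units, so the
## backward pass needs no change. At test time the hidden activations are
## instead scaled by (1 - dropout.rate) to compensate for the units that were
## dropped during training.
dropout.rate <- 0.5
dropout <- function(X, p) X * matrix(runif(length(X)) > p, dim(X)[1], dim(X)[2])

## Training loop:  X2 <- dropout(forward(X1, W12, B2, sigmoid), dropout.rate)
## Test pass:      X2 <- (1 - dropout.rate) * forward(X1, W12, B2, sigmoid)
## (and similarly for X3 and X4; the softmax output layer is left untouched)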