# Examples from slides7 # For STAT8810 # Fall, 2017 # M.T. Pratola # Simple SGD Example on Linear Regression library(sgd) # offers various implementations of SGD set.seed(42) n = 1e4 d = 10 X = matrix(rnorm(n*d), ncol=d) theta = rep(5, d+1) eps = rnorm(n) y = cbind(1, X) %*% theta + eps dat = data.frame(y=y, x=X) fit.sgd = sgd(y ~ ., data=dat, model="lm") fit.lm = lm(y ~ ., data=dat) # Gradient for Gradient Descent Q.lm.grad<-function(theta,y,X) { n=length(y) -(2/n)*(t(y)-theta%*%t(X))%*%X } X=cbind(1,X) err=Inf theta.cg.hist=NULL theta.cg=rep(1,11) theta.cg.hist=cbind(theta.cg.hist,theta.cg) eta=0.1 n=length(y) # SGD Example while(err>1e-5) { gradient = Q.lm.grad(theta.cg,y,X) theta.cg = as.vector(theta.cg - eta*gradient) err = sum(gradient^2) theta.cg.hist=cbind(theta.cg.hist,theta.cg) } # Plot plot(fit.sgd$estimates[1,],fit.sgd$estimates[2,], type='l',lwd=2,xlab=expression(theta[1]), ylab=expression(theta[2]),xlim=c(0,6),ylim=c(0,6)) points(theta[1],theta[2],pch=20,cex=3,col="blue") points(fit.lm$coefficients[2],fit.lm$coefficients[3], pch=20,cex=1,col="green") lines(theta.cg.hist[2,],theta.cg.hist[3,],lwd=2, col="grey") # Try with a different seed to see how the increased variability of # the SGD procedure affects the optimization path. set.seed(7) n = 1e4 d = 10 X = matrix(rnorm(n*d), ncol=d) theta = rep(5, d+1) eps = rnorm(n) y = cbind(1, X) %*% theta + eps dat = data.frame(y=y, x=X) fit.sgd = sgd(y ~ ., data=dat, model="lm") fit.lm = lm(y ~ ., data=dat) X=cbind(1,X) err=Inf theta.cg.hist=NULL theta.cg=rep(1,11) theta.cg.hist=cbind(theta.cg.hist,theta.cg) eta=0.1 n=length(y) while(err>1e-5) { gradient = Q.lm.grad(theta.cg,y,X) theta.cg = as.vector(theta.cg - eta*gradient) err = sum(gradient^2) theta.cg.hist=cbind(theta.cg.hist,theta.cg) } plot(fit.sgd$estimates[1,],fit.sgd$estimates[2,], type='l',lwd=2,xlab=expression(theta[1]), ylab=expression(theta[2]),xlim=c(0,6),ylim=c(0,6)) points(theta[1],theta[2],pch=20,cex=3,col="blue") points(fit.lm$coefficients[2],fit.lm$coefficients[3], pch=20,cex=1,col="green") lines(theta.cg.hist[2,],theta.cg.hist[3,],lwd=2, col="grey") # Number of iterations for SGD algorithm: max(fit.sgd$pos) # "Effective" number of iterations for CG algorithm: ncol(theta.cg.hist)*n