# Examples from slides7
# For STAT8810
# Fall, 2017
# M.T. Pratola


# Simple SGD Example on Linear Regression
library(sgd) # offers various implementations of SGD
set.seed(42)
n = 1e4
d = 10
X = matrix(rnorm(n*d), ncol=d)
theta = rep(5, d+1)
eps = rnorm(n)
y = cbind(1, X) %*% theta + eps
dat = data.frame(y=y, x=X)
fit.sgd = sgd(y ~ ., data=dat, model="lm")
fit.lm = lm(y ~ ., data=dat)


# Gradient for Gradient Descent
Q.lm.grad<-function(theta,y,X) {
    n=length(y)
    -(2/n)*(t(y)-theta%*%t(X))%*%X
}
X=cbind(1,X)
err=Inf
theta.cg.hist=NULL
theta.cg=rep(1,11)
theta.cg.hist=cbind(theta.cg.hist,theta.cg)
eta=0.1
n=length(y)


# SGD Example
while(err>1e-5)
{
    gradient = Q.lm.grad(theta.cg,y,X)
    theta.cg = as.vector(theta.cg - eta*gradient)
    err = sum(gradient^2)
    theta.cg.hist=cbind(theta.cg.hist,theta.cg)
}


# Plot
plot(fit.sgd$estimates[1,],fit.sgd$estimates[2,],
    type='l',lwd=2,xlab=expression(theta[1]),
    ylab=expression(theta[2]),xlim=c(0,6),ylim=c(0,6))
points(theta[1],theta[2],pch=20,cex=3,col="blue")
points(fit.lm$coefficients[2],fit.lm$coefficients[3],
    pch=20,cex=1,col="green")
lines(theta.cg.hist[2,],theta.cg.hist[3,],lwd=2,
    col="grey")


# Try with a different seed to see how the increased variability of 
# the SGD procedure affects the optimization path.
set.seed(7)
n = 1e4
d = 10
X = matrix(rnorm(n*d), ncol=d)
theta = rep(5, d+1)
eps = rnorm(n)
y = cbind(1, X) %*% theta + eps
dat = data.frame(y=y, x=X)
fit.sgd = sgd(y ~ ., data=dat, model="lm")
fit.lm = lm(y ~ ., data=dat)

X=cbind(1,X)
err=Inf
theta.cg.hist=NULL
theta.cg=rep(1,11)
theta.cg.hist=cbind(theta.cg.hist,theta.cg)
eta=0.1
n=length(y)

while(err>1e-5)
{
    gradient = Q.lm.grad(theta.cg,y,X)
    theta.cg = as.vector(theta.cg - eta*gradient)
    err = sum(gradient^2)
    theta.cg.hist=cbind(theta.cg.hist,theta.cg)
}

plot(fit.sgd$estimates[1,],fit.sgd$estimates[2,],
    type='l',lwd=2,xlab=expression(theta[1]),
    ylab=expression(theta[2]),xlim=c(0,6),ylim=c(0,6))
points(theta[1],theta[2],pch=20,cex=3,col="blue")
points(fit.lm$coefficients[2],fit.lm$coefficients[3],
    pch=20,cex=1,col="green")
lines(theta.cg.hist[2,],theta.cg.hist[3,],lwd=2,
    col="grey")


# Number of iterations for SGD algorithm:
max(fit.sgd$pos)

# "Effective" number of iterations for CG algorithm:
ncol(theta.cg.hist)*n