# STATS 4CI3/6CI3 Winter 2019

# R code for solutions to Assignment 1

# Question 1(a)
sum.geom <- function(x,r,n) {
  #
  # Sum of the geometric series x + x*r + x*r^2 + ..., taken either
  # over the first n terms or over the whole (infinite) series.
  # Arguments:
  #  x: the first term in the series
  #  r: the ratio between successive terms
  #  n: the number of terms; a positive integer for a finite sum or
  #     Inf for an infinite sum. A finite non-integer n is rounded
  #     down with a warning.
  # Value:
  #  The sum as a single number. For n = Inf the result is
  #    x/(1-r)       when -1 < r < 1 (the series converges),
  #    Inf or -Inf   when r >= 1 (diverges in the direction of x),
  #    NaN           when r <= -1 (the partial sums oscillate).
  #
  if (n<=0) stop("n must be positive")
  # Trivial cases: every term is 0, or only the first term is non-zero.
  if (x==0) return(0)
  if (r==0) return(x)
  if (is.finite(n)) {
    if (n!=floor(n)) {
      n <- floor(n)
      warning(paste("Non-integer n rounded down to",n))
    }
    # The closed form divides by (1-r), so r = 1 is handled separately.
    if (r != 1) return(x*(1-r^n)/(1-r))
    return(n*x)
  }
  # Infinite sums. x is non-zero here, so for r >= 1 all terms share the
  # sign of x and the sum diverges to +Inf or -Inf accordingly.
  if (r >= 1) return(sign(x)*Inf)
  if (r <= -1) return(NaN)
  x/(1-r)
}

# Question 1(b)
choose.rec <- function(n,r) {
  #
  # Recursive function to calculate the number of ways to
  # choose r objects from n, using Pascal's rule
  #   C(n,r) = C(n-1,r-1) + C(n-1,r)
  # with base cases C(n,n) = C(n,0) = 1.
  # Arguments:
  #   n: the number of objects available (positive integer)
  #   r: the number of objects chosen (integer with 0 <= r <= n)
  # Non-integer arguments are rounded down with a warning.
  # Stops with an error if n <= 0, r < 0 or n < r.
  #
  if (n != floor(n)) {
    warning("non-integer n rounded down")
    n <- floor(n)
  }
  if (r != floor(r)) {
    warning("non-integer r rounded down")
    r <- floor(r)
  }
  # Reject negative arguments up front: without these checks a negative
  # r never reaches a base case, and the recursion either fails with a
  # misleading message about n or does not terminate.
  if (n<=0) stop("n must be greater than 0")
  if (r<0) stop("r must be greater than or equal to 0")
  if (n<r) stop("n must be greater than or equal to r")
  if ((n==r)||(r==0)) return(1)
  # Here 0 < r < n, so both recursive calls receive valid arguments.
  choose.rec(n-1,r-1)+choose.rec(n-1,r)
}

# Question 2(a)
pooled.t.test <- function(x,y, alpha=0.05, alternative="two-sided") {
  # Function to do a pooled t-test of equality of means based
  # on two independent samples
  # Arguments:
  #   x,y - two numeric vectors corresponding to the 2 samples
  #   alpha - significance level (default 0.05)
  #   alternative - alternative hypothesis, default is "two-sided"
  #      other possible values are "less" and "greater"
  # Value (returned invisibly, since a summary is printed):
  #   a list with components
  #     means     - the two sample means c(mean(x), mean(y))
  #     vars      - the two sample variances c(var(x), var(y))
  #     pooled.s2 - the pooled estimate of the common variance
  #     stat      - the t statistic
  #     pv        - the p-value (NA if alternative is not recognised)
  #
  n1 <- length(x)
  xbar <- mean(x)
  s2x <- var(x)
  n2 <- length(y)
  ybar <- mean(y)
  s2y <- var(y)
  df <- n1+n2-2
  #  Calculate the pooled estimate of the common variance
  s2p <- ((n1-1)*s2x+(n2-1)*s2y)/df
  #  Now find the test statistic
  stat <- (xbar-ybar)/sqrt(s2p*(1/n1+1/n2))
  #  Next find the p-value (which depends on the alternative)
  if (alternative=="two-sided")
    pv <- 2*pt(abs(stat), df, lower.tail=FALSE)
  else if (alternative=="less")
    pv <- pt(stat, df)
  else if (alternative=="greater")
    pv <- pt(stat, df, lower.tail=FALSE)
  else
    pv <- NA
  reject <- pv<alpha
  #  Print out some results
  cat(paste("Test Statistic =", round(stat,4), "\n"))
  if (!is.na(pv)) {
    if (alternative=="two-sided")
      cat("Alternative hypothesis: mu(X) does not equal mu(Y)\n")
    else if (alternative=="greater")
      cat("Alternative hypothesis: mu(X) is greater than mu(Y)\n")
    else
      cat("Alternative hypothesis: mu(X) is less than mu(Y)\n")
    cat(paste("P-value =", format(pv,digits=4), "\n"))
    # reject is a single TRUE/FALSE here because pv is not NA
    cat(paste(if (reject) "Reject" else "Do not reject",
              "the null hypothesis at the alpha =",alpha,"level\n"))
  }
  else cat(paste("Cannot calculate p-value, alternative was",
                 alternative,
                 "but should be \"two-sided\", \"less\" or \"greater\"\n"))
  #  Return results invisibly because we have already printed out a summary.
  output <- list(means=c(xbar,ybar), vars=c(s2x,s2y),
                 pooled.s2=s2p, stat=stat, pv=pv)
  invisible(output)
}

# Question 2(b)
forward.select <- function(y, X, alpha=0.05) {
  # Function to do forward selection for a linear model.
  # Arguments:
  #   y - a quantitative response vector
  #   X - a matrix (or data.frame) of covariates, one per column
  #   alpha - a maximum significance level for inclusion
  # Value:
  #   If no single covariate is significant at level alpha, a list with
  #   components model (the intercept-only fit) and Results (the sorted
  #   Stage 1 table) is returned.
  #   Otherwise the final model is printed and a list is returned
  #   invisibly with components
  #     model   - the final fitted lm object
  #     Summary - one row per added variable (name, column index,
  #               estimate, standard error, t statistic, p-value)
  #     Results - a list with one matrix per stage; each row holds the
  #               column index and coefficient row of one candidate,
  #               sorted by p-value.

  p <- ncol(X)
  if (is.null(p) || p < 1)
    stop("X must be a matrix or data.frame with at least one column")
  Vars <- seq_len(p)
  if (is.null(names(X)))
    names.all <- paste("X",Vars,sep="")
  else
    names.all <- names(X)

  # Coefficient row at position pos of a summary table. summary.lm drops
  # coefficients that could not be estimated, so the row may be absent;
  # NAs are returned in that case and sort last by p-value.
  coef.row <- function(fit, pos) {
    if (nrow(fit) >= pos) fit[pos,] else rep(NA_real_, 4)
  }

  Data.model <- data.frame(y=y)
  Results <- list(Stage1=matrix(NA, ncol=5, nrow=p))
  # The first Stage; Examine each column of X on its own
  for (i in Vars){
    fit <- coef(summary(lm(y~X[,i])))
    Results$Stage1[i,] <- c(i, coef.row(fit, 2))
  }
  # Sort the Results by p-value; drop=FALSE keeps a one-row table a matrix
  Results$Stage1 <- Results$Stage1[order(Results$Stage1[,5]), , drop=FALSE]
  best <- Results$Stage1[1,]
  if (!isTRUE(best[5] < alpha))
    return(list(model=lm(y~1), Results=Results$Stage1))

  # best now holds the variable to be added. Each pass of the loop adds
  # it to the model and then looks for the next candidate.
  winners <- list()
  k <- 1 # The Stage Number
  repeat {
    v.add <- best[1]
    winners[[k]] <- best
    # The current data for the model is set up
    # Also we update the potential variables to be added
    names.mod <- c(names(Data.model), names.all[v.add])
    Data.model <- cbind(Data.model,X[,v.add])
    names(Data.model) <- names.mod
    Vars <- Vars[Vars!=v.add]
    # Every covariate is already in the model: nothing left to try
    if (length(Vars) == 0) break
    k <- k+1
    j <- 0 # A counter for the variables tried at this stage
    Stage <- paste("Stage",k,sep="")
    Results[[Stage]] <- matrix(NA, ncol=5,
                               nrow=length(Vars))
    for (i in Vars){
      # Loop over the potential variables. The candidate is the last
      # column, so its coefficient is row k+1 (intercept + k covariates).
      j <- j+1
      Data.fit <- cbind(Data.model,X[,i])
      fit <- coef(summary(lm(y~., data=Data.fit)))
      Results[[Stage]][j,] <- c(i, coef.row(fit, k+1))
    }
    # Order the results for this stage and select
    # the variable with the smallest p-value
    Results[[Stage]] <- Results[[Stage]][order(Results[[Stage]][,5]), ,
                                         drop=FALSE]
    best <- Results[[Stage]][1,]
    # Decide if any variable will be added
    if (!isTRUE(best[5] < alpha)) break
  }
  # Now summarize the stages at which a variable was added
  Res <- do.call(rbind, winners)
  v.add <- names.all[Res[,1]]
  Summary <- data.frame(v.add, Res)
  names(Summary) <- c("Variable", "Index", "Estimate",
                      "Std Error", "t stat", "p-value")
  row.names(Summary) <- paste("Stage",seq_len(nrow(Res)))
  # Get the final fitted model and print it out.
  fit <- lm(y~., data=Data.model)
  cat("Final Fitted Model is:\n")
  print(fit)
  # Return the model summary of stages and full results.
  invisible(list(model=fit, Summary=Summary,
                 Results=Results))
}

# Question 3(b)
dice.gen <- function(n) {
  # Generate n random observations of the total shown by a roll of
  # two fair dice, by inverting the distribution function.
  # Argument:
  #   n - the number of totals to generate
  # Value:
  #   a numeric vector of length n with values in 2, 3, ..., 12
  #
  # Cumulative probabilities P(total <= t) for t = 2, ..., 11
  cum.prob <- c(1, 3, 6, 10, 15, 21, 26, 30, 33, 35)/36
  unif <- runif(n)
  # A uniform in (cum.prob[i], cum.prob[i+1]] maps to the total i+2;
  # values at or below cum.prob[1] map to 2, values above the last to 12.
  2 + findInterval(unif, cum.prob, left.open=TRUE)
}

# Check the generator: simulate 360000 rolls with a fixed seed and put
# the observed counts of each total (row 1) above the expected counts
# 360000 * P(total) (row 2).
set.seed(1022019)
dice.out <- dice.gen(n=360000)
rbind(table(dice.out), c(1:6,5:1)*10000)


# Question 4(a)
box.muller <- function(n, mu=0, sigma=1) {
  #
  # Generate normal random variates with the Box-Muller algorithm:
  # each pair of independent uniforms is turned into a pair of
  # standard normals, which are then shifted and scaled.
  #
  # Arguments:
  # n - required sample size (a positive integer)
  # mu - mean (default 0)
  # sigma - the standard deviation (default 1)
  # Value:
  # a numeric vector of n variates
  #
  if (!(n>0)) stop("n must be a postive integer")
  # Number of uniform pairs needed; an odd n needs one extra pair
  parity <- n%%2
  if (parity==0) pairs <- n/2
  else if (parity==1) pairs <- (n+1)/2
  else stop("n must be a postive integer")
  # Radius and angle of the polar representation
  radius <- sqrt(-2*log(runif(pairs)))
  angle <- 2*pi*runif(pairs)
  # Keep the first n of the 2*pairs standard normals
  z <- c(radius*sin(angle), radius*cos(angle))[seq_len(n)]
  sigma*z + mu
}

# Question 4(b)
rand.geom <- function(n, p) {
  #
  # Generate a random sample from the geometric distribution by
  # transforming uniform random numbers.
  # Arguments:
  #   n: the required sample size
  #   p: the success probability of the geometric distribution,
  #      strictly between 0 and 1
  # Value:
  #   a numeric vector of n non-negative whole numbers
  #
  p.inside <- (p>0) & (p<1)
  if (!p.inside)
    stop("Success probability must be in the interval (0,1)")
  if (n<1)
    stop("n must be a positive integer")
  log.fail <- log(1-p)
  unif <- runif(n)
  floor(log(unif)/log.fail)
}

# Question 5(a)
rchi.1 <- function(n, r) {
  # Generate chi-squared random variables with even degrees of
  # freedom. Each variate is a sum of r/2 terms of the form
  # -2*log(U), with U uniform on (0,1).
  # Arguments:
  #   n - the required sample size
  #   r - the degrees of freedom (must be even)
  # Value:
  #   a numeric vector of n variates
  if (r%%2 != 0) {
    stop("Only even degrees of freedom allowed")
  }
  n.terms <- r/2
  # One row per variate, one column per term of the sum
  terms <- -2*log(runif(n.terms*n))
  rowSums(matrix(terms, nrow=n))
}

# Question 5(b)
rchi.2 <- function(n, r) {
  # Generate chi-squared random variables with any whole number of
  # degrees of freedom, as sums of r squared normal variates
  # obtained from box.muller().
  # Arguments:
  #   n - the required sample size
  #   r - the degrees of freedom
  # Value:
  #   a numeric vector of n variates
  sq.normals <- box.muller(n*r)^2
  # One row per variate, one column per squared normal
  rowSums(matrix(sq.normals, nrow=n))
}

# Question 5(c)
rchisq.nc <- function(n, k, lambda) {
  #
  #  Function to generate from a non-central chi-squared
  #  distribution with degrees of freedom k and non-centrality
  #  parameter lambda. A Poisson(lambda/2) count Y is drawn for each
  #  variate, which is then generated from a central chi-squared
  #  distribution with k + 2*Y degrees of freedom.
  #
  #  Arguments
  #    n: number of random variates to generate
  #    k: degrees of freedom (must be positive)
  #    lambda: non-centrality parameter (must be non-negative)
  #  Value
  #    a numeric vector of n variates
  #

  # Ensure that k is positive
  if (k<=0) stop("Degrees of Freedom must be positive")
  # Make sure the non-centrality parameter is non-negative.
  if (lambda < 0)
    stop("Non-centrality parameter cannot be negative")
  # Generate the Poisson random variates
  Y <- rpois(n, lambda/2)
  # Now generate from the appropriate central chi-squared
  # distributions. Note that here we are using the fact that
  # rchisq is vectorized in its parameters.
  rchisq(n, k+2*Y)
}