--- title: "350.wk2.bootstrap" author: "William Revelle" date: "`r Sys.Date()`" output: html_document: default pdf_document: default --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) options(width=100) ``` In the lecture notes, we discussed random sampling as a way to find confidence intervals. Here I unpack those functions that so that we can explore how functions work. ## First the prelimaries of making some packages active ```{r} library(psych) #necessary for describe library(psychTools) #necessary for Galton ``` # the two original functions ```{r} small <- function(data=NULL,sample.size=20, n.iter=1000) { nsub <- nrow(data) #this figures out the sampe size dynamically result <- rep(NA,n.iter) #create this vector #use a for loop to repeat the code inside the { } for(i in 1:n.iter) { #repeat some code samp <- sample(nsub,sample.size) result[i] <- cor(data[samp,])[1,2] #find the correlation for this sample and save it } #end of the loop return(result) #return the value we find } #end of function #Our bootstrap function boot <- function(data=NULL,sample.size=20, n.iter=1000) { nsub <- nrow(data) #this figures out the sampe size dynamically result <- rep(NA,n.iter) #create this vector #use a for loop to repeat the code inside the { } for(i in 1:n.iter) { #repeat some code samp <- sample(nsub,sample.size,replace=TRUE) #sample with repl result[i] <- cor(data[samp,])[1,2] #find the correlation for this sample and save it } #end of the loop return(result) #return the value we find } #end of function ``` # These two functions were tailored made for the Galton data set But functions can be more useful if they are general. We modify the function to save the results as list instead of a vector, and we allow it to sample with or without replacement. ```{r} new.boot <- function(data=NULL,sample.size=20, n.iter=1000, boot=TRUE, use="pairwise") { nsub <- nrow(data) #this figures out the sampe size dynamically result <- list() #we will save the data as a list #use a for loop to repeat the code inside the { } for(i in 1:n.iter) { #repeat some code samp <- sample(nsub,sample.size,replace=boot) #sample with replacement or not result[[i]] <- cor(data[samp,],use=use) #find the correlation for this sample and save it } #end of the loop return(result) #return the value we find } #end of function ``` # Now, we can run this function on different data sets Try the six variables of `sat.act' ```{r} sat.act.boot <- new.boot(sat.act,n.iter=40,sample.size=50) length(sat.act.boot) #how long is the list? sat.act.boot[[1]] #show the first element of the list temp <- matrix(unlist(sat.act.boot), ncol=36, byrow=TRUE) describe(temp) ``` ## our new.boot is not super efficient Because the correlations are symmetric, and the diagonal is always =1, we can pick just the lower diagonal elements Here is a somewhat better new function ```{r} newer.boot <- function(data=NULL,sample.size=20, n.iter=1000, boot=TRUE, use="pairwise") { nsub <- nrow(data) #this figures out the sampe size dynamically result <- list() #we will save the data as a list #use a for loop to repeat the code inside the { } for(i in 1:n.iter) { #repeat some code samp <- sample(nsub,sample.size,replace=boot) #sample with replacement or not R <- cor(data[samp,],use=use) R.lower <- R[lower.tri(R)] #just grab the lower diagonal part result[[i]] <- R.lower #find the correlation for this sample and save it } #end of the loop return(result) #return the value we find } #end of function ``` newer.boot produces less ouput ```{r} sat.act.boot.1 <- newer.boot(sat.act,n.iter=50,sample.size=40) length(sat.act.boot.1) #still length 50 (for number of iterations) sat.act.boot.1[[1]] #just 15 elements temp <- matrix(unlist(sat.act.boot.1), ncol=15, byrow=TRUE) describe(temp) ``` ## Yet another improvement newer.boot returns a list which we need to process. Lets do it inside the function ```{r} newer.boot <- function(data=NULL,sample.size=20, n.iter=1000, boot=TRUE, use="pairwise") { nsub <- nrow(data) #this figures out the sampe size dynamically result <- list() #we will save the data as a list #use a for loop to repeat the code inside the { } for(i in 1:n.iter) { #repeat some code samp <- sample(nsub,sample.size,replace=boot) #sample with replacement or not R <- cor(data[samp,],use=use) R.lower <- R[lower.tri(R)] #just grab the lower diagonal part result[[i]] <- R.lower #find the correlation for this sample and save it } #end of the loop nc <- length(R.lower) #figure out how long each list item is new.result <- matrix(unlist(result),ncol=nc, byrow=TRUE) #do this inside the function return(new.result) #return the value we find } #end of function ``` ## Try this revised function on Galton ```{r} boot.20 <- newer.boot(galton,sample.size=20) boot.40 <- newer.boot(galton,sample.size=40) boot.80 <- newer.boot(galton,sample.size=80) boot.320 <- newer.boot(galton,sample.size=320) boot.640 <- newer.boot(galton,sample.size=640) boot.df <- data.frame(boot.20,boot.40,boot.80,boot.320,boot.640) describe(boot.df) ``` # Now for some fancy stuff -- name the variable pairs The output of newer.boot is a matrix where the columns are the various pairs of variables and the rows are the replications. Adapting code from the `corCi` function, we can correctly name the columns. ```{r} newer.boot <- function(data=NULL,sample.size=50, n.iter=1000, boot=TRUE, use="pairwise", minlength=5) { nsub <- nrow(data) #this figures out the sampe size dynamically result <- list() #we will save the data as a list #use a for loop to repeat the code insidetra the { } for(i in 1:n.iter) { #repeat some code samp <- sample(nsub,sample.size,replace=boot) #sample with replacement or not R <- cor(data[samp,],use=use) R.lower <- R[lower.tri(R)] #just grab the lower diagonal part result[[i]] <- R.lower #find the correlation for this sample and save it } #end of the loop nc <- length(R.lower) #figure out how long each list item is new.result <- matrix(unlist(result),ncol=nc, byrow=TRUE) #now name the columns using two loops nvar <- NCOL(data) cnR <- abbreviate(colnames(data), minlength = minlength) colnames(new.result)<- paste0("V",1:NCOL(new.result)) #this is a filler k <- 1 for (i in 1:(nvar - 1)) { for (j in (i + 1):nvar) { colnames(new.result)[k] <- paste(cnR[i], cnR[j], sep = "-") k <- k + 1 } } return(new.result) #return the value we find } #end of function ##now try it describe(newer.boot(sat.act)) describe(newer.boot(galton,160)) ``` # Using browser and debug to help you understand the code. Functions are sometimes hard to undertand. If you say debug(somefunction) and then run that function, it will step through each line, one at a time. Hitting return gets to the next line, c goes to the end of the loop. Try this on our newer.boot function.