---
title: "350.wk2.bootstrap"
author: "William Revelle"
date: "`r Sys.Date()`"
output:
  html_document: default
  pdf_document: default
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
options(width=100)
```

In the lecture notes, we discussed random sampling as a way to find confidence intervals.  Here I unpack those functions that so that we can explore how functions work.


## First the prelimaries of making some packages active

```{r}
library(psych)        #necessary for describe
library(psychTools)   #necessary for Galton

```

# the two original functions

```{r}
small <- function(data=NULL,sample.size=20, n.iter=1000)  {
   nsub <- nrow(data)    #this figures out the sampe size dynamically
    result <- rep(NA,n.iter) #create this vector
#use a for loop to repeat the code inside the { }
 for(i in 1:n.iter) { #repeat some code
     samp <- sample(nsub,sample.size)
    result[i] <- cor(data[samp,])[1,2]
         #find the correlation for this sample and save it
         }    #end of the loop
return(result)   #return the value we find
}   #end of function
#Our bootstrap function
boot <- function(data=NULL,sample.size=20, n.iter=1000)  {
  nsub <- nrow(data)    #this figures out the sampe size dynamically
  result <- rep(NA,n.iter) #create this vector
 #use a for loop to repeat the code inside the { }
  for(i in 1:n.iter) { #repeat some code
     samp <- sample(nsub,sample.size,replace=TRUE)   #sample with repl
    result[i] <- cor(data[samp,])[1,2]
    #find the correlation for this sample and save it
                      }    #end of the loop
return(result)   #return the value we find
}   #end of function
```

# These two functions were tailored made for the Galton data set

But functions can be more useful if they are general. We modify the function to save the results as list instead of a vector, and we allow it to sample with or without replacement.

```{r}
new.boot <- function(data=NULL,sample.size=20, n.iter=1000, boot=TRUE, use="pairwise")  {
  nsub <- nrow(data)    #this figures out the sampe size dynamically
  result <- list()   #we will save the data as a list
 #use a for loop to repeat the code inside the { }
  for(i in 1:n.iter) { #repeat some code
    samp <- sample(nsub,sample.size,replace=boot)    #sample with replacement or not
    result[[i]] <- cor(data[samp,],use=use)   
    #find the correlation for this sample and save it
                      }    #end of the loop
return(result)   #return the value we find
}   #end of function

```

# Now, we can run this function on different data sets

Try the six variables of `sat.act'

```{r}
sat.act.boot <- new.boot(sat.act,n.iter=40,sample.size=50)
length(sat.act.boot) #how long is the list?
sat.act.boot[[1]] #show the first element of the list
temp <- matrix(unlist(sat.act.boot), ncol=36, byrow=TRUE)
describe(temp)
```

## our new.boot is not super efficient 

Because the correlations are symmetric, and the diagonal is always =1, we can pick just the lower diagonal elements


Here is a somewhat better new function

```{r}

newer.boot <- function(data=NULL,sample.size=20, n.iter=1000, boot=TRUE, use="pairwise")  {
  nsub <- nrow(data)    #this figures out the sampe size dynamically
  result <- list()   #we will save the data as a list
 #use a for loop to repeat the code inside the { }
  for(i in 1:n.iter) { #repeat some code
    samp <- sample(nsub,sample.size,replace=boot)    #sample with replacement or not
    R <-  cor(data[samp,],use=use)  
    R.lower <- R[lower.tri(R)]   #just grab the lower diagonal part
    result[[i]] <- R.lower 
    #find the correlation for this sample and save it
                      }    #end of the loop
return(result)   #return the value we find
}   #end of function

```

newer.boot produces less ouput

```{r}
sat.act.boot.1 <- newer.boot(sat.act,n.iter=50,sample.size=40)
length(sat.act.boot.1)  #still length 50 (for number of iterations)
sat.act.boot.1[[1]]  #just 15 elements
temp <- matrix(unlist(sat.act.boot.1), ncol=15, byrow=TRUE)
describe(temp)

```

## Yet another improvement

newer.boot returns a list which we need to process.
Lets do it inside the function

```{r}
newer.boot <- function(data=NULL,sample.size=20, n.iter=1000, boot=TRUE, use="pairwise")  {
  nsub <- nrow(data)    #this figures out the sampe size dynamically
  result <- list()   #we will save the data as a list
 #use a for loop to repeat the code inside the { }
  for(i in 1:n.iter) { #repeat some code
    samp <- sample(nsub,sample.size,replace=boot)    #sample with replacement or not
    R <-  cor(data[samp,],use=use)  
    R.lower <- R[lower.tri(R)]   #just grab the lower diagonal part
    result[[i]] <- R.lower 
    #find the correlation for this sample and save it
                      }    #end of the loop
nc <- length(R.lower) #figure out how long each list item is
new.result <- matrix(unlist(result),ncol=nc, byrow=TRUE)  #do this inside the function
return(new.result)   #return the value we find
}   #end of function

```

## Try this revised function on Galton

```{r}
boot.20 <- newer.boot(galton,sample.size=20)
boot.40 <- newer.boot(galton,sample.size=40)
boot.80 <- newer.boot(galton,sample.size=80)
boot.320 <- newer.boot(galton,sample.size=320)
boot.640 <- newer.boot(galton,sample.size=640)
boot.df <- data.frame(boot.20,boot.40,boot.80,boot.320,boot.640)
describe(boot.df)
```
# Now for some fancy stuff  -- name the variable pairs

The output of newer.boot is a matrix where the columns are the various pairs of  variables and the rows are the replications.  Adapting code from the `corCi` function, we can correctly name the columns.

```{r}

newer.boot <- function(data=NULL,sample.size=50, n.iter=1000, boot=TRUE, use="pairwise", minlength=5)  {
  nsub <- nrow(data)    #this figures out the sampe size dynamically
  result <- list()   #we will save the data as a list
 #use a for loop to repeat the code insidetra the { }
  for(i in 1:n.iter) { #repeat some code
    samp <- sample(nsub,sample.size,replace=boot)    #sample with replacement or not
    R <-  cor(data[samp,],use=use)  
    R.lower <- R[lower.tri(R)]   #just grab the lower diagonal part
    result[[i]] <- R.lower 
    #find the correlation for this sample and save it
                      }    #end of the loop
nc <- length(R.lower) #figure out how long each list item is
new.result <- matrix(unlist(result),ncol=nc, byrow=TRUE)

#now name the columns using two loops
nvar <- NCOL(data)
cnR <- abbreviate(colnames(data), minlength = minlength)
colnames(new.result)<- paste0("V",1:NCOL(new.result))  #this is a filler
k <- 1
        for (i in 1:(nvar - 1)) {
            for (j in (i + 1):nvar) {
                colnames(new.result)[k] <- paste(cnR[i], cnR[j], 
                  sep = "-")
                k <- k + 1
            }
        }
return(new.result)   #return the value we find
}   #end of function


##now try it

describe(newer.boot(sat.act))
describe(newer.boot(galton,160))
```

# Using browser and debug to help you understand the code.

Functions are sometimes hard to undertand.  If you say debug(somefunction) and then run that function, it will step through each line, one at a time.  Hitting return gets to the next line, c goes to the end of the loop.

Try this on our newer.boot function.