In the lecture notes, we discussed random sampling as a way to find confidence intervals. Here I unpack those functions that so that we can explore how functions work.
library(psych) #necessary for describe
library(psychTools) #necessary for Galton
small <- function(data=NULL,sample.size=20, n.iter=1000) {
nsub <- nrow(data) #this figures out the sampe size dynamically
result <- rep(NA,n.iter) #create this vector
#use a for loop to repeat the code inside the { }
for(i in 1:n.iter) { #repeat some code
samp <- sample(nsub,sample.size)
result[i] <- cor(data[samp,])[1,2]
#find the correlation for this sample and save it
} #end of the loop
return(result) #return the value we find
} #end of function
#Our bootstrap function
boot <- function(data=NULL,sample.size=20, n.iter=1000) {
nsub <- nrow(data) #this figures out the sampe size dynamically
result <- rep(NA,n.iter) #create this vector
#use a for loop to repeat the code inside the { }
for(i in 1:n.iter) { #repeat some code
samp <- sample(nsub,sample.size,replace=TRUE) #sample with repl
result[i] <- cor(data[samp,])[1,2]
#find the correlation for this sample and save it
} #end of the loop
return(result) #return the value we find
} #end of function
But functions can be more useful if they are general. We modify the function to save the results as list instead of a vector, and we allow it to sample with or without replacement.
new.boot <- function(data=NULL,sample.size=20, n.iter=1000, boot=TRUE, use="pairwise") {
nsub <- nrow(data) #this figures out the sampe size dynamically
result <- list() #we will save the data as a list
#use a for loop to repeat the code inside the { }
for(i in 1:n.iter) { #repeat some code
samp <- sample(nsub,sample.size,replace=boot) #sample with replacement or not
result[[i]] <- cor(data[samp,],use=use)
#find the correlation for this sample and save it
} #end of the loop
return(result) #return the value we find
} #end of function
Try the six variables of `sat.act’
sat.act.boot <- new.boot(sat.act,n.iter=40,sample.size=50)
length(sat.act.boot) #how long is the list?
## [1] 40
sat.act.boot[[1]] #show the first element of the list
## gender education age ACT SATV SATQ
## gender 1.00000000 0.1208734 0.09104025 -0.2410253 -0.24541547 -0.33296936
## education 0.12087344 1.0000000 0.43462701 0.1546173 0.17144458 0.27737990
## age 0.09104025 0.4346270 1.00000000 -0.1683019 0.03591877 0.02865404
## ACT -0.24102533 0.1546173 -0.16830185 1.0000000 0.31524000 0.53262385
## SATV -0.24541547 0.1714446 0.03591877 0.3152400 1.00000000 0.74872893
## SATQ -0.33296936 0.2773799 0.02865404 0.5326239 0.74872893 1.00000000
temp <- matrix(unlist(sat.act.boot), ncol=36, byrow=TRUE)
describe(temp)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 40 1.00 0.00 1.00 1.00 0.00 1.00 1.00 0.00 -1.93 1.16 0.00
## X2 2 40 0.05 0.18 0.08 0.06 0.21 -0.38 0.30 0.69 -0.35 -0.80 0.03
## X3 3 40 -0.06 0.19 -0.05 -0.06 0.19 -0.53 0.32 0.85 -0.17 -0.40 0.03
## X4 4 40 -0.05 0.14 -0.06 -0.06 0.11 -0.32 0.38 0.70 0.58 0.95 0.02
## X5 5 40 0.00 0.12 0.00 0.00 0.11 -0.29 0.32 0.61 -0.08 0.29 0.02
## X6 6 40 -0.15 0.13 -0.18 -0.15 0.12 -0.39 0.09 0.48 0.17 -0.91 0.02
## X7 7 40 0.05 0.18 0.08 0.06 0.21 -0.38 0.30 0.69 -0.35 -0.80 0.03
## X8 8 40 1.00 0.00 1.00 1.00 0.00 1.00 1.00 0.00 -2.66 4.72 0.00
## X9 9 40 0.56 0.10 0.59 0.57 0.11 0.35 0.70 0.35 -0.61 -0.72 0.02
## X10 10 40 0.13 0.11 0.13 0.13 0.10 -0.11 0.39 0.50 0.31 0.17 0.02
## X11 11 40 0.01 0.17 0.01 0.01 0.17 -0.30 0.35 0.66 -0.04 -0.86 0.03
## X12 12 40 0.02 0.17 0.01 0.02 0.16 -0.38 0.31 0.69 -0.14 -0.65 0.03
## X13 13 40 -0.06 0.19 -0.05 -0.06 0.19 -0.53 0.32 0.85 -0.17 -0.40 0.03
## X14 14 40 0.56 0.10 0.59 0.57 0.11 0.35 0.70 0.35 -0.61 -0.72 0.02
## X15 15 40 1.00 0.00 1.00 1.00 0.00 1.00 1.00 0.00 -1.94 1.08 0.00
## X16 16 40 0.09 0.15 0.09 0.08 0.12 -0.19 0.51 0.70 0.39 0.62 0.02
## X17 17 40 -0.08 0.14 -0.07 -0.08 0.15 -0.32 0.28 0.60 0.12 -0.49 0.02
## X18 18 40 -0.05 0.17 -0.07 -0.05 0.16 -0.37 0.40 0.77 0.30 -0.24 0.03
## X19 19 40 -0.05 0.14 -0.06 -0.06 0.11 -0.32 0.38 0.70 0.58 0.95 0.02
## X20 20 40 0.13 0.11 0.13 0.13 0.10 -0.11 0.39 0.50 0.31 0.17 0.02
## X21 21 40 0.09 0.15 0.09 0.08 0.12 -0.19 0.51 0.70 0.39 0.62 0.02
## X22 22 40 1.00 0.00 1.00 1.00 0.00 1.00 1.00 0.00 -2.14 1.85 0.00
## X23 23 40 0.53 0.11 0.56 0.53 0.11 0.29 0.72 0.44 -0.35 -1.01 0.02
## X24 24 40 0.59 0.11 0.59 0.60 0.11 0.29 0.77 0.47 -0.41 -0.23 0.02
## X25 25 40 0.00 0.12 0.00 0.00 0.11 -0.29 0.32 0.61 -0.08 0.29 0.02
## X26 26 40 0.01 0.17 0.01 0.01 0.17 -0.30 0.35 0.66 -0.04 -0.86 0.03
## X27 27 40 -0.08 0.14 -0.07 -0.08 0.15 -0.32 0.28 0.60 0.12 -0.49 0.02
## X28 28 40 0.53 0.11 0.56 0.53 0.11 0.29 0.72 0.44 -0.35 -1.01 0.02
## X29 29 40 1.00 0.00 1.00 1.00 0.00 1.00 1.00 0.00 -2.45 3.46 0.00
## X30 30 40 0.63 0.09 0.64 0.64 0.10 0.40 0.78 0.38 -0.65 -0.12 0.01
## X31 31 40 -0.15 0.13 -0.18 -0.15 0.12 -0.39 0.09 0.48 0.17 -0.91 0.02
## X32 32 40 0.02 0.17 0.01 0.02 0.16 -0.38 0.31 0.69 -0.14 -0.65 0.03
## X33 33 40 -0.05 0.17 -0.07 -0.05 0.16 -0.37 0.40 0.77 0.30 -0.24 0.03
## X34 34 40 0.59 0.11 0.59 0.60 0.11 0.29 0.77 0.47 -0.41 -0.23 0.02
## X35 35 40 0.63 0.09 0.64 0.64 0.10 0.40 0.78 0.38 -0.65 -0.12 0.01
## X36 36 40 1.00 0.00 1.00 1.00 0.00 1.00 1.00 0.00 -3.23 8.88 0.00
Because the correlations are symmetric, and the diagonal is always =1, we can pick just the lower diagonal elements
Here is a somewhat better new function
newer.boot <- function(data=NULL,sample.size=20, n.iter=1000, boot=TRUE, use="pairwise") {
nsub <- nrow(data) #this figures out the sampe size dynamically
result <- list() #we will save the data as a list
#use a for loop to repeat the code inside the { }
for(i in 1:n.iter) { #repeat some code
samp <- sample(nsub,sample.size,replace=boot) #sample with replacement or not
R <- cor(data[samp,],use=use)
R.lower <- R[lower.tri(R)] #just grab the lower diagonal part
result[[i]] <- R.lower
#find the correlation for this sample and save it
} #end of the loop
return(result) #return the value we find
} #end of function
newer.boot produces less ouput
sat.act.boot.1 <- newer.boot(sat.act,n.iter=50,sample.size=40)
length(sat.act.boot.1) #still length 50 (for number of iterations)
## [1] 50
sat.act.boot.1[[1]] #just 15 elements
## [1] 0.37730452 0.24912010 0.10974620 0.24204175 -0.00805468 0.68403606 0.06154105 -0.01011737
## [9] -0.04830387 0.10782936 0.07179507 0.08279858 0.38814038 0.53897643 0.54544016
temp <- matrix(unlist(sat.act.boot.1), ncol=15, byrow=TRUE)
describe(temp)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 50 0.12 0.14 0.09 0.12 0.15 -0.30 0.42 0.72 -0.15 -0.07 0.02
## X2 2 50 0.00 0.18 0.00 0.00 0.20 -0.36 0.36 0.72 0.03 -0.68 0.03
## X3 3 50 -0.06 0.14 -0.05 -0.06 0.15 -0.39 0.25 0.64 0.09 -0.51 0.02
## X4 4 50 -0.04 0.17 -0.03 -0.04 0.17 -0.49 0.30 0.79 -0.28 -0.32 0.02
## X5 5 50 -0.19 0.15 -0.17 -0.18 0.16 -0.56 0.07 0.63 -0.44 -0.48 0.02
## X6 6 50 0.57 0.10 0.58 0.58 0.11 0.36 0.74 0.37 -0.31 -0.88 0.01
## X7 7 50 0.15 0.17 0.13 0.15 0.15 -0.25 0.56 0.81 0.10 -0.13 0.02
## X8 8 50 0.02 0.18 0.04 0.03 0.16 -0.50 0.36 0.86 -0.30 0.33 0.03
## X9 9 50 0.01 0.17 0.03 0.01 0.17 -0.35 0.40 0.74 0.04 -0.61 0.02
## X10 10 50 0.12 0.15 0.12 0.12 0.14 -0.19 0.48 0.68 0.29 -0.16 0.02
## X11 11 50 -0.07 0.18 -0.06 -0.07 0.16 -0.61 0.39 1.00 -0.27 0.76 0.02
## X12 12 50 -0.03 0.16 -0.02 -0.02 0.19 -0.39 0.34 0.74 -0.25 -0.45 0.02
## X13 13 50 0.57 0.15 0.59 0.57 0.20 0.20 0.83 0.63 -0.28 -0.93 0.02
## X14 14 50 0.59 0.11 0.59 0.59 0.15 0.37 0.75 0.38 -0.24 -1.27 0.02
## X15 15 50 0.63 0.10 0.63 0.64 0.10 0.37 0.81 0.44 -0.36 -0.25 0.01
newer.boot returns a list which we need to process. Lets do it inside the function
newer.boot <- function(data=NULL,sample.size=20, n.iter=1000, boot=TRUE, use="pairwise") {
nsub <- nrow(data) #this figures out the sampe size dynamically
result <- list() #we will save the data as a list
#use a for loop to repeat the code inside the { }
for(i in 1:n.iter) { #repeat some code
samp <- sample(nsub,sample.size,replace=boot) #sample with replacement or not
R <- cor(data[samp,],use=use)
R.lower <- R[lower.tri(R)] #just grab the lower diagonal part
result[[i]] <- R.lower
#find the correlation for this sample and save it
} #end of the loop
nc <- length(R.lower) #figure out how long each list item is
new.result <- matrix(unlist(result),ncol=nc, byrow=TRUE) #do this inside the function
return(new.result) #return the value we find
} #end of function
boot.20 <- newer.boot(galton,sample.size=20)
boot.40 <- newer.boot(galton,sample.size=40)
boot.80 <- newer.boot(galton,sample.size=80)
boot.320 <- newer.boot(galton,sample.size=320)
boot.640 <- newer.boot(galton,sample.size=640)
boot.df <- data.frame(boot.20,boot.40,boot.80,boot.320,boot.640)
describe(boot.df)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## boot.20 1 1000 0.46 0.19 0.47 0.47 0.19 -0.16 0.89 1.05 -0.46 -0.05 0.01
## boot.40 2 1000 0.45 0.13 0.46 0.45 0.14 -0.09 0.79 0.88 -0.41 0.09 0.00
## boot.80 3 1000 0.45 0.09 0.46 0.46 0.09 0.08 0.71 0.63 -0.39 0.33 0.00
## boot.320 4 1000 0.46 0.05 0.46 0.46 0.05 0.27 0.59 0.32 -0.21 0.09 0.00
## boot.640 5 1000 0.46 0.03 0.46 0.46 0.03 0.36 0.58 0.22 0.01 0.05 0.00
The output of newer.boot is a matrix where the columns are the
various pairs of variables and the rows are the replications. Adapting
code from the corCi
function, we can correctly name the
columns.
newer.boot <- function(data=NULL,sample.size=50, n.iter=1000, boot=TRUE, use="pairwise", minlength=5) {
nsub <- nrow(data) #this figures out the sampe size dynamically
result <- list() #we will save the data as a list
#use a for loop to repeat the code insidetra the { }
for(i in 1:n.iter) { #repeat some code
samp <- sample(nsub,sample.size,replace=boot) #sample with replacement or not
R <- cor(data[samp,],use=use)
R.lower <- R[lower.tri(R)] #just grab the lower diagonal part
result[[i]] <- R.lower
#find the correlation for this sample and save it
} #end of the loop
nc <- length(R.lower) #figure out how long each list item is
new.result <- matrix(unlist(result),ncol=nc, byrow=TRUE)
#now name the columns using two loops
nvar <- NCOL(data)
cnR <- abbreviate(colnames(data), minlength = minlength)
colnames(new.result)<- paste0("V",1:NCOL(new.result)) #this is a filler
k <- 1
for (i in 1:(nvar - 1)) {
for (j in (i + 1):nvar) {
colnames(new.result)[k] <- paste(cnR[i], cnR[j],
sep = "-")
k <- k + 1
}
}
return(new.result) #return the value we find
} #end of function
##now try it
describe(newer.boot(sat.act))
## vars n mean sd median trimmed mad min max range skew kurtosis se
## gendr-edctn 1 1000 0.09 0.14 0.09 0.09 0.15 -0.39 0.54 0.93 -0.01 -0.08 0
## gendr-age 2 1000 -0.02 0.14 -0.02 -0.02 0.14 -0.44 0.37 0.81 -0.09 -0.26 0
## gendr-ACT 3 1000 -0.04 0.14 -0.03 -0.04 0.15 -0.54 0.37 0.92 -0.02 -0.19 0
## gendr-SATV 4 1000 -0.02 0.15 -0.02 -0.02 0.15 -0.47 0.41 0.88 0.00 -0.22 0
## gendr-SATQ 5 1000 -0.17 0.14 -0.17 -0.17 0.14 -0.57 0.34 0.92 0.20 -0.17 0
## edctn-age 6 1000 0.56 0.09 0.57 0.56 0.09 0.15 0.74 0.60 -0.74 0.44 0
## edctn-ACT 7 1000 0.16 0.14 0.16 0.16 0.14 -0.34 0.56 0.90 -0.09 0.02 0
## edctn-SATV 8 1000 0.06 0.14 0.06 0.05 0.14 -0.37 0.51 0.88 0.06 -0.20 0
## edctn-SATQ 9 1000 0.04 0.14 0.04 0.04 0.15 -0.45 0.47 0.92 -0.04 0.02 0
## age-ACT 10 1000 0.12 0.14 0.12 0.12 0.13 -0.40 0.54 0.94 -0.19 0.04 0
## age-SATV 11 1000 -0.03 0.14 -0.03 -0.03 0.14 -0.55 0.38 0.93 -0.20 0.03 0
## age-SATQ 12 1000 -0.03 0.15 -0.03 -0.03 0.15 -0.55 0.38 0.93 -0.17 -0.01 0
## ACT-SATV 13 1000 0.56 0.13 0.57 0.56 0.13 0.06 0.85 0.79 -0.45 -0.13 0
## ACT-SATQ 14 1000 0.59 0.12 0.61 0.59 0.12 0.18 0.83 0.65 -0.59 -0.04 0
## SATV-SATQ 15 1000 0.64 0.11 0.65 0.64 0.11 0.24 0.90 0.67 -0.57 0.29 0
describe(newer.boot(galton,160))
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 1000 0.46 0.07 0.46 0.46 0.06 0.2 0.65 0.45 -0.11 0.17 0
Functions are sometimes hard to undertand. If you say debug(somefunction) and then run that function, it will step through each line, one at a time. Hitting return gets to the next line, c goes to the end of the loop.
Try this on our newer.boot function.