350:week 2 correlation

Correlation using the Galton data set

Make the ‘psych’ and ‘psychTools’ packages active

library(psych)
library(psychTools)
sessionInfo()

## R version 4.3.3 (2024-02-29)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Sonoma 14.2.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Chicago
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] psychTools_2.4.3 psych_2.4.3     
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.31     R6_2.5.1          fastmap_1.1.1     xfun_0.41         lattice_0.22-5   
##  [6] cachem_1.0.7      parallel_4.3.3    knitr_1.45        foreign_0.8-86    htmltools_0.5.5  
## [11] rmarkdown_2.21    cli_3.6.1         R.methodsS3_1.8.2 grid_4.3.3        sass_0.4.5       
## [16] jquerylib_0.1.4   mnormt_2.1.1      compiler_4.3.3    R.oo_1.25.0       rstudioapi_0.14  
## [21] tools_4.3.3       rtf_0.4-14.1      nlme_3.1-164      evaluate_0.20     bslib_0.4.2      
## [26] yaml_2.3.7        rlang_1.1.0       jsonlite_1.8.4

Get the data

We can get the ‘Galton’ data set by just calling it by name. It is a built in data set.

It is important to find the dimensions of the data and perhaps to describe the data.

dim(galton)  #what are the dimensions

## [1] 928   2

names(galton) #what are the variable names

## [1] "parent" "child"

describe(galton)  #basic descriptives

##        vars   n  mean   sd median trimmed  mad  min  max range  skew kurtosis   se
## parent    1 928 68.31 1.79   68.5   68.32 1.48 64.0 73.0     9 -0.04     0.05 0.06
## child     2 928 68.09 2.52   68.2   68.12 2.97 61.7 73.7    12 -0.09    -0.35 0.08

Tabulate the data

First form the table using ‘table’, then sort it using the ‘order’ function

galton.tab <- table(galton)
galton.tab   #this table is ordered from short parents to tall parents

##       child
## parent 61.7 62.2 63.2 64.2 65.2 66.2 67.2 68.2 69.2 70.2 71.2 72.2 73.2 73.7
##   64      1    0    2    4    1    2    2    1    1    0    0    0    0    0
##   64.5    1    1    4    4    1    5    5    0    2    0    0    0    0    0
##   65.5    1    0    9    5    7   11   11    7    7    5    2    1    0    0
##   66.5    0    3    3    5    2   17   17   14   13    4    0    0    0    0
##   67.5    0    3    5   14   15   36   38   28   38   19   11    4    0    0
##   68.5    1    0    7   11   16   25   31   34   48   21   18    4    3    0
##   69.5    0    0    1   16    4   17   27   20   33   25   20   11    4    5
##   70.5    1    0    1    0    1    1    3   12   18   14    7    4    3    3
##   71.5    0    0    0    0    1    3    4    3    5   10    4    9    2    2
##   72.5    0    0    0    0    0    0    0    1    2    1    2    7    2    4
##   73      0    0    0    0    0    0    0    0    0    0    0    1    3    0

rownames(galton.tab)

##  [1] "64"   "64.5" "65.5" "66.5" "67.5" "68.5" "69.5" "70.5" "71.5" "72.5" "73"

rank(rownames(galton.tab)) #string commands together

##  [1]  1  2  3  4  5  6  7  8  9 10 11

ord <- order(rank(rownames(galton.tab)),decreasing=TRUE)
ord

##  [1] 11 10  9  8  7  6  5  4  3  2  1

galton.tab[ord,]  #this table is now orderd from tall parents to short parents

##       child
## parent 61.7 62.2 63.2 64.2 65.2 66.2 67.2 68.2 69.2 70.2 71.2 72.2 73.2 73.7
##   73      0    0    0    0    0    0    0    0    0    0    0    1    3    0
##   72.5    0    0    0    0    0    0    0    1    2    1    2    7    2    4
##   71.5    0    0    0    0    1    3    4    3    5   10    4    9    2    2
##   70.5    1    0    1    0    1    1    3   12   18   14    7    4    3    3
##   69.5    0    0    1   16    4   17   27   20   33   25   20   11    4    5
##   68.5    1    0    7   11   16   25   31   34   48   21   18    4    3    0
##   67.5    0    3    5   14   15   36   38   28   38   19   11    4    0    0
##   66.5    0    3    3    5    2   17   17   14   13    4    0    0    0    0
##   65.5    1    0    9    5    7   11   11    7    7    5    2    1    0    0
##   64.5    1    1    4    4    1    5    5    0    2    0    0    0    0    0
##   64      1    0    2    4    1    2    2    1    1    0    0    0    0    0

plot(galton)

That plot is not very helpful, because it does not show how many people are at each point.

Let use the ‘jitter’ command.

First set the random seed to a set value so the figures will agree

set.seed(42)   #cite Adams, 1979
plot(galton,pch=20,col="blue") #show the original data points
points(jitter(galton[,1]),jitter(galton[,2]))  #add a little jitter

#note we used 'points' to add to the plot

set.seed(42)   #cite Adams, 1979
plot(galton,pch=20,col="blue") #show the original data points
points(jitter(galton[,1],2),jitter(galton[,2],2))  #add a little more jitter

set.seed(42)   #cite Adams, 1979
plot(galton,pch=20,col="blue") #show the original data points
points(jitter(galton[,1],5),jitter(galton[,2],5))  #add a little more jitter

We can also display the means and error bars

error.bars.by(child ~ parent,data=galton,eyes=FALSE,v.labels=63:73,main="Galton's Height data")

scatterHist(child ~ parent,data=galton ) #normal formula imput

scatterHist(jitter(galton$child,5) ,jitter(galton$parent,5),ylab="Child",xlab="Parent",main="Galton height data")  #but if the variables jittered, you need this alternative style

#Yet one more display – the ‘pairs.panels’ function

pairs.panels(galton)

#but "jiggle" aka 'jitter" the points
pairs.panels(galton,jiggle=TRUE)

create a function

               # default values may be specified
small <- function(data=NULL,sample.size=20, n.iter=1000)  {
nsub <- nrow(data)    #this figures out the sampe size dynamically
result <- rep(NA,n.iter) #create this vector
#use a for loop to repeat the code inside the { }
for(i in 1:n.iter) { #repeat some code
   samp <- sample(nsub,sample.size,replace=TRUE) #boot strap resampling
    result[i] <- cor(data[samp,])[1,2]
         #find the correlation for this sample and save it
                      }    #end of the loop
return(result)   #return the value we find
}   #end of function

Use this function to generate some data

test <- small(galton) #this uses the default valuex describe(test)
hist(test,breaks=21) #draw a histogram of the results

## now do it for a bunch of cases

samp20 <- small(galton,20)
samp40 <- small(galton,40)
samp80 <- small(galton,80)
samp160 <- small(galton,160)
samp320 <- small(galton,320)
samp640<- small(galton,640)
sample.df <- data.frame(samp20 ,samp40, samp80, samp160,
           samp320,samp640)
describe(sample.df)

##         vars    n mean   sd median trimmed  mad   min  max range  skew kurtosis   se
## samp20     1 1000 0.44 0.20   0.47    0.45 0.19 -0.50 0.89  1.39 -0.64     0.54 0.01
## samp40     2 1000 0.46 0.13   0.47    0.46 0.13 -0.02 0.80  0.82 -0.46     0.28 0.00
## samp80     3 1000 0.45 0.09   0.46    0.46 0.09  0.12 0.72  0.60 -0.39     0.11 0.00
## samp160    4 1000 0.46 0.07   0.46    0.46 0.07  0.23 0.65  0.42 -0.11    -0.12 0.00
## samp320    5 1000 0.46 0.04   0.46    0.46 0.04  0.30 0.60  0.30 -0.11     0.00 0.00
## samp640    6 1000 0.46 0.03   0.46    0.46 0.03  0.37 0.57  0.20 -0.01    -0.07 0.00

Now, we try showing these results

We show them several different ways and slowly make the figure better.

violin(sample.df)

Add in error bars to the violin plot

violin(sample.df)
error.bars(sample.df,add=TRUE)

Just show the error bars. Note that the current version of `psych’ just shows 3 colors by default. This has been fixed in the most recent version. (Comimg soon)

error.bars(sample.df) #this only has 3 colors

error.bars(sample.df, col=rainbow(6)) #six colors

#combine the violin and error bars
violin(sample.df, main="Density of boot strap resamples from Galton")
error.bars(sample.df, col=rainbow(ncol(sample.df)),add=TRUE) #six colors