See the accompanying slides at https://personality-project.org/courses/350/350.manipulating.data.pdf and https://personality-project.org/courses/350/350.functions.pdf
The next lines (install.packages) are commented out, but you need to run them at least once.
#install.packages("psych",repos="http://personality-project.org/r", type="source")
#install.packages("psychTools",repos="http://personality-project.org/r", type="source")
library(psych) #make it active
library(psychTools)
sessionInfo() #psych should be 2.3.5 and psychTools should be 2.3.5
## R Under development (unstable) (2023-03-17 r83997)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.3.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
##
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/Chicago
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] psychTools_2.3.5 psych_2.3.5
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.31 R6_2.5.1 fastmap_1.1.1 xfun_0.38 lattice_0.20-45 cachem_1.0.7
## [7] parallel_4.3.0 knitr_1.42 foreign_0.8-84 htmltools_0.5.5 rmarkdown_2.21 cli_3.6.1
## [13] grid_4.3.0 sass_0.4.5 jquerylib_0.1.4 mnormt_2.1.1 compiler_4.3.0 rstudioapi_0.14
## [19] tools_4.3.0 nlme_3.1-162 evaluate_0.20 bslib_0.4.2 yaml_2.3.7 rlang_1.1.0
## [25] jsonlite_1.8.4
vJoin and recode
x <- matrix(1:30, ncol=5)
y <- matrix(1:40, ncol=8)
x
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 7 13 19 25
## [2,] 2 8 14 20 26
## [3,] 3 9 15 21 27
## [4,] 4 10 16 22 28
## [5,] 5 11 17 23 29
## [6,] 6 12 18 24 30
y
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 1 6 11 16 21 26 31 36
## [2,] 2 7 12 17 22 27 32 37
## [3,] 3 8 13 18 23 28 33 38
## [4,] 4 9 14 19 24 29 34 39
## [5,] 5 10 15 20 25 30 35 40
xy <- vJoin(x,y)
xy
## x1 x2 x3 x4 x5 y1 y2 y3 y4 y5 y6 y7 y8
## Sx1 1 7 13 19 25 NA NA NA NA NA NA NA NA
## Sx2 2 8 14 20 26 NA NA NA NA NA NA NA NA
## Sx3 3 9 15 21 27 NA NA NA NA NA NA NA NA
## Sx4 4 10 16 22 28 NA NA NA NA NA NA NA NA
## Sx5 5 11 17 23 29 NA NA NA NA NA NA NA NA
## Sx6 6 12 18 24 30 NA NA NA NA NA NA NA NA
## Sy1 NA NA NA NA NA 1 6 11 16 21 26 31 36
## Sy2 NA NA NA NA NA 2 7 12 17 22 27 32 37
## Sy3 NA NA NA NA NA 3 8 13 18 23 28 33 38
## Sy4 NA NA NA NA NA 4 9 14 19 24 29 34 39
## Sy5 NA NA NA NA NA 5 10 15 20 25 30 35 40
headTail(xy, top=2, bottom=2)
## x1 x2 x3 x4 x5 y1 y2 y3 y4 y5 y6 y7 y8
## Sx1 1 7 13 19 25 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## Sx2 2 8 14 20 26 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## ... ... ... ... ... ... ... ... ... ... ... ... ... ...
## Sy4 <NA> <NA> <NA> <NA> <NA> 4 9 14 19 24 29 34 39
## Sy5 <NA> <NA> <NA> <NA> <NA> 5 10 15 20 25 30 35 40
XY <- vJoin(x,y, cnames=FALSE)
XY
## x1 x2 x3 x4 x5 x6 x7 x8
## Sx1 1 7 13 19 25 NA NA NA
## Sx2 2 8 14 20 26 NA NA NA
## Sx3 3 9 15 21 27 NA NA NA
## Sx4 4 10 16 22 28 NA NA NA
## Sx5 5 11 17 23 29 NA NA NA
## Sx6 6 12 18 24 30 NA NA NA
## Sy1 1 6 11 16 21 26 31 36
## Sy2 2 7 12 17 22 27 32 37
## Sy3 3 8 13 18 23 28 33 38
## Sy4 4 9 14 19 24 29 34 39
## Sy5 5 10 15 20 25 30 35 40
vJoin is itself just R code, so we can read it to see how it works.
?vJoin #to see the help file
vJoin #to see the code
## function (x, y, rnames = TRUE, cnames = TRUE)
## {
## n.x <- NCOL(x)
## n.y <- NCOL(y)
## n.obsx <- NROW(x)
## n.obsy <- NROW(y)
## if (is.null(colnames(x)))
## colnames(x) <- paste0("x", 1:n.x)
## if (is.null(colnames(y)))
## if (cnames) {
## colnames(y) <- paste0("y", 1:n.y)
## }
## else {
## colnames(y) <- paste0("x", 1:n.y)
## }
## if (is.null(rownames(x)))
## rownames(x) <- paste0("Sx", 1:n.obsx)
## if (is.null(rownames(y)))
## rownames(y) <- paste0("Sy", 1:n.obsy)
## xy <- unique(c(colnames(x), colnames(y)))
## n.xy.col <- length(xy)
## if (!rnames) {
## new.names <- n.obsx + 1:n.obsy
## rownames(y) <- new.names
## }
## xy.rows <- unique(c(rownames(x), rownames(y)))
## n.xy.rows <- length(xy.rows)
## new <- data.frame(matrix(NA, ncol = n.xy.col, nrow = n.xy.rows))
## colnames(new) <- xy
## rownames(new) <- xy.rows
## new[rownames(x), colnames(x)] <- x[rownames(x), colnames(x)]
## new[rownames(y), colnames(y)] <- y[rownames(y), colnames(y)]
## return(new)
## }
## <bytecode: 0x13a35e068>
## <environment: namespace:psychTools>
temp <- bfi[1:10,1:6] #a small subset of bfi
temp2 <- bfi[5:15,4:8] #an overlapping subset
temp12 <- vJoin(temp,temp2)
temp12
## A1 A2 A3 A4 A5 C1 C2 C3
## 61617 2 4 3 4 4 2 NA NA
## 61618 2 4 5 2 5 5 NA NA
## 61620 5 4 5 4 4 4 NA NA
## 61621 4 4 6 5 5 4 NA NA
## 61622 2 3 3 4 5 4 4 5
## 61623 6 6 5 6 5 6 6 6
## 61624 2 5 5 3 5 5 4 4
## 61629 4 3 1 5 1 3 2 4
## 61630 4 3 6 3 3 6 6 3
## 61633 2 5 6 6 5 6 5 6
## 61634 NA NA NA 6 5 4 3 5
## 61636 NA NA NA 5 5 5 4 5
## 61637 NA NA NA 6 4 5 4 3
## 61639 NA NA NA 6 6 4 4 4
## 61640 NA NA NA 2 1 5 5 5
If some data are character values with levels that do not make sense in terms of some ordinal variable, we can recode them. First we create a vector with the levels of education that we think make sense.
education <- cs(high_school, some_college, college, in_graduate_school, graduate_degree)
temp <- recode(bfi,where=27, isvalue=1:5,newvalue=education)
bfi[50:70,27] #show the education levels
## [1] 3 3 4 4 3 4 5 3 5 2 5 1 1 4 3 5 3 4 3 5 2
temp[50:70,27] #they correspond
## [1] "college" "college" "in_graduate_school" "in_graduate_school" "college"
## [6] "in_graduate_school" "graduate_degree" "college" "graduate_degree" "some_college"
## [11] "graduate_degree" "high_school" "high_school" "in_graduate_school" "college"
## [16] "graduate_degree" "college" "in_graduate_school" "college" "graduate_degree"
## [21] "some_college"
table(bfi[27])
## education
## 1 2 3 4 5
## 224 292 1249 394 418
table(temp[27]) #but these are not ordered by the numeric level
## education
## college graduate_degree high_school in_graduate_school some_college
## 1249 418 224 394 292
#convert to numeric
describe(bfi[26:28])
## vars n mean sd median trimmed mad min max range skew kurtosis se
## gender 1 2800 1.67 0.47 2 1.71 0.00 1 2 1 -0.73 -1.47 0.01
## education 2 2577 3.19 1.11 3 3.22 1.48 1 5 4 -0.05 -0.32 0.02
## age 3 2800 28.78 11.13 26 27.43 10.38 3 86 83 1.02 0.56 0.21
describe(temp[26:28])
## vars n mean sd median trimmed mad min max range skew kurtosis se
## gender 1 2800 1.67 0.47 2 1.71 0.00 1 2 1 -0.73 -1.47 0.01
## education* 2 2577 2.25 1.46 2 2.06 1.48 1 5 4 0.73 -1.01 0.03
## age 3 2800 28.78 11.13 26 27.43 10.38 3 86 83 1.02 0.56 0.21
temp2 <- char2numeric(temp)
describe(temp2[26:28])
## vars n mean sd median trimmed mad min max range skew kurtosis se
## gender 1 2800 1.67 0.47 2 1.71 0.00 1 2 1 -0.73 -1.47 0.01
## education* 2 2577 2.25 1.46 2 2.06 1.48 1 5 4 0.73 -1.01 0.03
## age 3 2800 28.78 11.13 26 27.43 10.38 3 86 83 1.02 0.56 0.21
How can we recode these to make sense?
new.education <- c(3,5,1,4,2)
temp3 <- recode(temp2,27,isvalue=1:6, newvalue=new.education)
describe(temp3[26:28])
## vars n mean sd median trimmed mad min max range skew kurtosis se
## gender 1 2800 1.67 0.47 2 1.71 0.00 1 2 1 -0.73 -1.47 0.01
## education* 2 2577 3.19 1.11 3 3.22 1.48 1 5 4 -0.05 -0.32 0.02
## age 3 2800 28.78 11.13 26 27.43 10.38 3 86 83 1.02 0.56 0.21
Three basic approaches to using R:
1. Basic: step through your operations, using R like a desk calculator. Try a command (calling some functions from a package).
2. Keep a list of your most useful set of lines. Annotate them to remind you what you have done. Use RMarkdown to annotate your scripts and to show your output.
3. Source these functions when you do your work.

Some of the basic functions for finding and reading files:
fn <- file.choose() #dynamically opens your directory to find a file
file.choose(new=TRUE) #creates a new file, using the normal computer search functions. This might be available in regular R on a Mac, but not in RStudio.
read.table(filename) #reads tabular input from the file location
read.csv(filename) #reads a comma separated file
read.delim(filename) #reads a tab delimited file
load(filename) #"loads" objects from an .Rda/.RData file
read.file(filename) #from psychTools; combines file.choose with the appropriate read command (e.g., read.csv, read.table, read.spss, etc.)
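As a small, self-contained sketch of the base R readers listed above (the file used here is a temporary scratch file created only for the illustration, not one of the course data sets):
tmp <- tempfile(fileext=".csv") #a scratch .csv file for this example only
write.csv(data.frame(id=1:3, score=c(10,20,30)), tmp, row.names=FALSE)
small <- read.csv(tmp) #read a comma separated file
small2 <- read.table(tmp, header=TRUE, sep=",") #the more general reader gives the same data frame
all.equal(small, small2)
#fn <- file.choose() #interactive: browse for the file, then read.csv(fn), read.delim(fn), etc.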
One way to learn coding is to read functions and try to understand how they work. As an example of reading files and using functions, let's look at the read.file function. Note that default values are specified but can be changed. There are a number of different ways to read files, depending upon the specific type of file; read.file looks for many possible suffixes, and others can easily be added if we need them. It uses the file_ext function from the tools package to find the suffix (.csv, .sav, .txt, ...), which is then passed to the switch function.
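Before reading the full function, here is a minimal sketch of the file_ext/switch idiom it relies on (read.by.suffix is a made-up name for illustration, not part of psychTools):
library(tools) #for file_ext
"read.by.suffix" <- function(f) { #a toy dispatcher
  suffix <- file_ext(f) #e.g., "csv", "txt"
  switch(suffix,
    csv = read.csv(f),
    txt = read.table(f, header=TRUE),
    stop("I do not know how to read a .", suffix, " file")) #an unnamed last element is the default
}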
read.file #let's look at what it does
## function (file = NULL, header = TRUE, use.value.labels = FALSE,
## to.data.frame = TRUE, sep = ",", quote = "\"", widths = NULL,
## f = NULL, filetype = NULL, ...)
## {
## if (missing(f) && missing(file))
## f <- file.choose()
## if (missing(f) && !missing(file))
## f <- file
## suffix <- file_ext(f)
## if (!missing(filetype))
## suffix <- filetype
## if (suffix %in% c("txt", "TXT", "text", "data", "dat", "DAT"))
## suffix <- "txt"
## if (suffix %in% c("R", "r"))
## suffix <- "R"
## if (suffix %in% c("rds", "RDS", "Rds"))
## suffix <- "rds"
## if (suffix %in% c("Rda", "rda", "Rdata", "RData", "rdata"))
## suffix <- "Rda"
## if (suffix %in% c("SYD", "syd", "sys", "SYS"))
## suffix <- "SYD"
## if (!missing(widths)) {
## result <- read.fwf(f, widths, ...)
## message("The fixed width file ", f, "has been loaded.")
## }
## else {
## switch(suffix, sav = {
## result <- read.spss(f, use.value.labels = use.value.labels,
## to.data.frame = to.data.frame)
## message("Data from the SPSS sav file ", f, " has been loaded.")
## }, csv = {
## result <- read.table(f, header = header, sep = sep,
## quote = quote, ...)
## message("Data from the .csv file ", f, " has been loaded.")
## }, tab = {
## result <- read.table(f, header = header, sep = "\t",
## ...)
## message("Data from the .tab file ", f, " has been loaded.")
## }, txt = {
## result <- read.table(f, header = header, ...)
## message("Data from the .txt file ", f, " has been loaded.")
## }, rds = {
## result <- try(readRDS(f, ...), silent = TRUE)
## if (inherits(result, "try-error")) {
## result <- f
## message("I had problems reading this file, \ntry load('",
## f, "') instead. \nCaution, this might replace an object currently in your environment.")
## } else {
## message("File ", f, " has been loaded.")
## }
## }, R = {
## result <- dget(f, ...)
## message("File ", f, " has been loaded.")
## }, XPT = {
## result <- read.xport(f, ...)
## message("File ", f, " has been loaded.")
## }, xpt = {
## result <- read.xport(f, ...)
## message("File ", f, " has been loaded.")
## }, Rda = {
## result <- f
## message("To load this file (or these files) you need to load('",
## f, "') \nCaution, this might replace an object currently in your environment.")
## }, SYD = {
## result <- read.systat(f, to.data.frame = to.data.frame)
## message("Data from the systat SYD file ", f, " has been loaded.")
## }, jmp = {
## result <- f
## message("I am sorrry. To read this .jmp file, it must first be saved as either a \"txt\" or \"csv\" file. If you insist on using SAS formats, try .xpt or .XPT")
## }, sas7bdat = {
## result <- f
## message("I am sorry. To read this .sas7bdat file, it must first be saved as either a xpt, or XPT file in SAS, or as a \"txt\" or \"csv\" file. ?read.ssd in foreign for help.")
## }, {
## message("I am sorry. \nI can not tell from the suffix what file type is this. Rather than try to read it, I will let you specify a better format.\n You might try specifying the filetype")
## })
## }
## return(result)
## }
## <bytecode: 0x10cf00668>
## <environment: namespace:psychTools>
read.file can be used for local files or remote locations. file.choose will search your directory for a file, and file_ext then examines that file name to identify what it is. We do not run this here because it is dynamic and requires user input.
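For example (the URL below is a placeholder, not a real data set), read.file will dispatch on the suffix of a web address just as it does for a local path, and with no argument at all it calls file.choose for us:
#remote <- read.file("https://example.com/somedata.csv") #hypothetical URL: the .csv suffix dispatches to read.table
#picked <- read.file() #with no file argument, read.file calls file.choose() to open a search dialog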
library(tools)
file.name <- "/Users/WR/Box Sync/pmc_folder/Courses.18/350/datasets/cars.rda"
file.name2 <- "/Users/WR/Box Sync/pmc_folder/Courses.18/350/datasets/glbwarm.sav"
file.name3 <- "/Users/WR/Box Sync/pmc_folder/Courses.18/350/datasets/simulation.txt"
suffix <- file_ext(file.name)
suffix2 <- file_ext(file.name2)
suffix3 <- file_ext(file.name3)
suffix; suffix2; suffix3 #we can string several lines together using ;
## [1] "rda"
## [1] "sav"
## [1] "txt"
# now read these files
cars <- read.file(file.name)
## To load this file (or these files) you need to load('/Users/WR/Box Sync/pmc_folder/Courses.18/350/datasets/cars.rda')
## Caution, this might replace an object currently in your environment.
dim(cars)
## NULL
#glb <- read.file(file.name2)
#dim(glb)
ls() #show the objects in my workspace
## [1] "cars" "education" "file.name" "file.name2" "file.name3" "new.education"
## [7] "suffix" "suffix2" "suffix3" "temp" "temp12" "temp2"
## [13] "temp3" "x" "xy" "XY" "y"
#rm(list=ls()) #CAREFUL, this will remove all the objects in your workspace!
dim(sat.act) #how many rows and columns in the sat.act object
## [1] 700 6
names(sat.act)
## [1] "gender" "education" "age" "ACT" "SATV" "SATQ"
colnames(sat.act)
## [1] "gender" "education" "age" "ACT" "SATV" "SATQ"
head(sat.act) #show the first 6 lines
## gender education age ACT SATV SATQ
## 29442 2 3 19 24 500 500
## 29457 2 3 23 35 600 500
## 29498 2 3 20 21 480 470
## 29503 1 4 27 26 550 520
## 29504 1 2 33 31 600 550
## 29518 1 5 26 28 640 640
tail(sat.act) #show the last lines
## gender education age ACT SATV SATQ
## 39904 2 3 20 26 710 680
## 39915 1 3 25 30 500 500
## 39937 1 4 40 27 613 630
## 39951 2 3 24 31 700 630
## 39961 1 4 35 32 700 780
## 39985 1 5 25 25 600 600
headTail(sat.act) #combine head and tail
## gender education age ACT SATV SATQ
## 29442 2 3 19 24 500 500
## 29457 2 3 23 35 600 500
## 29498 2 3 20 21 480 470
## 29503 1 4 27 26 550 520
## ... ... ... ... ... ... ...
## 39937 1 4 40 27 613 630
## 39951 2 3 24 31 700 630
## 39961 1 4 35 32 700 780
## 39985 1 5 25 25 600 600
mod <- lm(SATV ~ gender + age + education, data =sat.act)
length(mod) #how many elements are in mod?
## [1] 12
names(mod) #what are their names
## [1] "coefficients" "residuals" "effects" "rank" "fitted.values" "assign"
## [7] "qr" "df.residual" "xlevels" "call" "terms" "model"
str(mod) #what is the structure of mod
## List of 12
## $ coefficients : Named num [1:4] 628.26 -7.08 -1.19 8.23
## ..- attr(*, "names")= chr [1:4] "(Intercept)" "gender" "age" "education"
## $ residuals : Named num [1:700] -116.22 -11.47 -135.03 -72.02 1.56 ...
## ..- attr(*, "names")= chr [1:700] "29442" "29457" "29498" "29503" ...
## $ effects : Named num [1:700] -16198.2 56.25 -127.63 -257.53 -1.68 ...
## ..- attr(*, "names")= chr [1:700] "(Intercept)" "gender" "age" "education" ...
## $ rank : int 4
## $ fitted.values: Named num [1:700] 616 611 615 622 598 ...
## ..- attr(*, "names")= chr [1:700] "29442" "29457" "29498" "29503" ...
## $ assign : int [1:4] 0 1 2 3
## $ qr :List of 5
## ..$ qr : num [1:700, 1:4] -26.4575 0.0378 0.0378 0.0378 0.0378 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:700] "29442" "29457" "29498" "29503" ...
## .. .. ..$ : chr [1:4] "(Intercept)" "gender" "age" "education"
## .. ..- attr(*, "assign")= int [1:4] 0 1 2 3
## ..$ qraux: num [1:4] 1.04 1.03 1.02 1.03
## ..$ pivot: int [1:4] 1 2 3 4
## ..$ tol : num 1e-07
## ..$ rank : int 4
## ..- attr(*, "class")= chr "qr"
## $ df.residual : int 696
## $ xlevels : Named list()
## $ call : language lm(formula = SATV ~ gender + age + education, data = sat.act)
## $ terms :Classes 'terms', 'formula' language SATV ~ gender + age + education
## .. ..- attr(*, "variables")= language list(SATV, gender, age, education)
## .. ..- attr(*, "factors")= int [1:4, 1:3] 0 1 0 0 0 0 1 0 0 0 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:4] "SATV" "gender" "age" "education"
## .. .. .. ..$ : chr [1:3] "gender" "age" "education"
## .. ..- attr(*, "term.labels")= chr [1:3] "gender" "age" "education"
## .. ..- attr(*, "order")= int [1:3] 1 1 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(SATV, gender, age, education)
## .. ..- attr(*, "dataClasses")= Named chr [1:4] "numeric" "numeric" "numeric" "numeric"
## .. .. ..- attr(*, "names")= chr [1:4] "SATV" "gender" "age" "education"
## $ model :'data.frame': 700 obs. of 4 variables:
## ..$ SATV : int [1:700] 500 600 480 550 600 640 610 520 400 730 ...
## ..$ gender : int [1:700] 2 2 2 1 1 1 2 1 2 2 ...
## ..$ age : int [1:700] 19 23 20 27 33 26 30 19 23 40 ...
## ..$ education: int [1:700] 3 3 3 4 2 5 5 3 4 5 ...
## ..- attr(*, "terms")=Classes 'terms', 'formula' language SATV ~ gender + age + education
## .. .. ..- attr(*, "variables")= language list(SATV, gender, age, education)
## .. .. ..- attr(*, "factors")= int [1:4, 1:3] 0 1 0 0 0 0 1 0 0 0 ...
## .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. .. ..$ : chr [1:4] "SATV" "gender" "age" "education"
## .. .. .. .. ..$ : chr [1:3] "gender" "age" "education"
## .. .. ..- attr(*, "term.labels")= chr [1:3] "gender" "age" "education"
## .. .. ..- attr(*, "order")= int [1:3] 1 1 1
## .. .. ..- attr(*, "intercept")= int 1
## .. .. ..- attr(*, "response")= int 1
## .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. .. ..- attr(*, "predvars")= language list(SATV, gender, age, education)
## .. .. ..- attr(*, "dataClasses")= Named chr [1:4] "numeric" "numeric" "numeric" "numeric"
## .. .. .. ..- attr(*, "names")= chr [1:4] "SATV" "gender" "age" "education"
## - attr(*, "class")= chr "lm"
summary(mod) #summary returns summary statistics appropriate for the object
##
## Call:
## lm(formula = SATV ~ gender + age + education, data = sat.act)
##
## Residuals:
## Min 1Q Median 3Q Max
## -411.51 -67.70 12.16 82.62 207.28
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 628.263 19.399 32.386 <2e-16 ***
## gender -7.082 8.971 -0.789 0.4302
## age -1.188 0.538 -2.208 0.0276 *
## education 8.229 3.598 2.287 0.0225 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 112.6 on 696 degrees of freedom
## Multiple R-squared: 0.009626, Adjusted R-squared: 0.005357
## F-statistic: 2.255 on 3 and 696 DF, p-value: 0.08076
summary(sat.act) #but they are not very useful summaries for data
## gender education age ACT SATV SATQ
## Min. :1.000 Min. :0.000 Min. :13.00 Min. : 3.00 Min. :200.0 Min. :200.0
## 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:19.00 1st Qu.:25.00 1st Qu.:550.0 1st Qu.:530.0
## Median :2.000 Median :3.000 Median :22.00 Median :29.00 Median :620.0 Median :620.0
## Mean :1.647 Mean :3.164 Mean :25.59 Mean :28.55 Mean :612.2 Mean :610.2
## 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:29.00 3rd Qu.:32.00 3rd Qu.:700.0 3rd Qu.:700.0
## Max. :2.000 Max. :5.000 Max. :65.00 Max. :36.00 Max. :800.0 Max. :800.0
## NA's :13
describe(sat.act) #more useful summary statistics
## vars n mean sd median trimmed mad min max range skew kurtosis se
## gender 1 700 1.65 0.48 2 1.68 0.00 1 2 1 -0.61 -1.62 0.02
## education 2 700 3.16 1.43 3 3.31 1.48 0 5 5 -0.68 -0.07 0.05
## age 3 700 25.59 9.50 22 23.86 5.93 13 65 52 1.64 2.42 0.36
## ACT 4 700 28.55 4.82 29 28.84 4.45 3 36 33 -0.66 0.53 0.18
## SATV 5 700 612.23 112.90 620 619.45 118.61 200 800 600 -0.64 0.33 4.27
## SATQ 6 687 610.22 115.64 620 617.25 118.61 200 800 600 -0.59 -0.02 4.41
table(sat.act$education) #break this down for one column
##
## 0 1 2 3 4 5
## 57 45 44 275 138 141
table(sat.act[c("education","gender")] ) #do the two way table
## gender
## education 1 2
## 0 27 30
## 1 20 25
## 2 23 21
## 3 80 195
## 4 51 87
## 5 46 95
t <- table(sat.act[c("education","gender")] ) #do it again, but save it as an object
chisq.test(t) #do the chi square on this table
##
## Pearson's Chi-squared test
##
## data: t
## X-squared = 16.085, df = 5, p-value = 0.006605
Two of the most useful data structures are matrices and data.frames. Matrices are all of one type (logical, character, numeric), while data frames can be mixtures of all data types.
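A quick illustration of that difference (the object names m and d are arbitrary):
m <- cbind(id=1:3, group=c("a","b","c")) #a matrix must be all one type,
class(m[,"id"]) #so the numbers are coerced to "character"
d <- data.frame(id=1:3, group=c("a","b","c")) #a data frame keeps each column's type
sapply(d, class) #"integer" for id, "character" for group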
Create a small matrix with the first 1,000 integers.
x <- matrix(1:1000,ncol=10)
y <- matrix(1:1000,nrow=100,byrow=TRUE) #specify the number of rows and fill by row
headTail(x)
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 1 101 201 301 401 501 601 701 801 901
## 2 2 102 202 302 402 502 602 702 802 902
## 3 3 103 203 303 403 503 603 703 803 903
## 4 4 104 204 304 404 504 604 704 804 904
## ... ... ... ... ... ... ... ... ... ... ...
## 97 97 197 297 397 497 597 697 797 897 997
## 98 98 198 298 398 498 598 698 798 898 998
## 99 99 199 299 399 499 599 699 799 899 999
## 100 100 200 300 400 500 600 700 800 900 1000
headTail(y) #note how the elements are in a different order if we specify byrow
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 1 2 3 4 5 6 7 8 9 10
## 2 11 12 13 14 15 16 17 18 19 20
## 3 21 22 23 24 25 26 27 28 29 30
## 4 31 32 33 34 35 36 37 38 39 40
## ... ... ... ... ... ... ... ... ... ... ...
## 97 961 962 963 964 965 966 967 968 969 970
## 98 971 972 973 974 975 976 977 978 979 980
## 99 981 982 983 984 985 986 987 988 989 990
## 100 991 992 993 994 995 996 997 998 999 1000
xy <- cbind(x,y) #column bind them
dim(xy) #what are the dimensions
## [1] 100 20
yx <- rbind(x,y) #row bind the data sets
dim(yx)
## [1] 200 10
headTail(xy)
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20
## 1 1 101 201 301 401 501 601 701 801 901 1 2 3 4 5 6 7 8 9 10
## 2 2 102 202 302 402 502 602 702 802 902 11 12 13 14 15 16 17 18 19 20
## 3 3 103 203 303 403 503 603 703 803 903 21 22 23 24 25 26 27 28 29 30
## 4 4 104 204 304 404 504 604 704 804 904 31 32 33 34 35 36 37 38 39 40
## ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
## 97 97 197 297 397 497 597 697 797 897 997 961 962 963 964 965 966 967 968 969 970
## 98 98 198 298 398 498 598 698 798 898 998 971 972 973 974 975 976 977 978 979 980
## 99 99 199 299 399 499 599 699 799 899 999 981 982 983 984 985 986 987 988 989 990
## 100 100 200 300 400 500 600 700 800 900 1000 991 992 993 994 995 996 997 998 999 1000
headTail(yx)
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 1 101 201 301 401 501 601 701 801 901
## 2 2 102 202 302 402 502 602 702 802 902
## 3 3 103 203 303 403 503 603 703 803 903
## 4 4 104 204 304 404 504 604 704 804 904
## ... ... ... ... ... ... ... ... ... ... ...
## 197 961 962 963 964 965 966 967 968 969 970
## 198 971 972 973 974 975 976 977 978 979 980
## 199 981 982 983 984 985 986 987 988 989 990
## 200 991 992 993 994 995 996 997 998 999 1000
Matrices can be turned into data frames if we want
x.df <- data.frame(x)
headTail(x.df)
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 1 101 201 301 401 501 601 701 801 901
## 2 2 102 202 302 402 502 602 702 802 902
## 3 3 103 203 303 403 503 603 703 803 903
## 4 4 104 204 304 404 504 604 704 804 904
## ... ... ... ... ... ... ... ... ... ... ...
## 97 97 197 297 397 497 597 697 797 897 997
## 98 98 198 298 398 498 598 698 798 898 998
## 99 99 199 299 399 499 599 699 799 899 999
## 100 100 200 300 400 500 600 700 800 900 1000
But we can do things with data frames that we cannot do with matrices.
condition <- gl(2, 8, labels = c("Control", "Treat"))
x.df <- data.frame(condition,results=rnorm(16))
x.df
## condition results
## 1 Control 0.21436178
## 2 Control 2.40046205
## 3 Control -0.53549466
## 4 Control -0.34526300
## 5 Control 0.76010132
## 6 Control 1.13454875
## 7 Control -0.35354696
## 8 Control -2.02310623
## 9 Treat -0.70711836
## 10 Treat -0.22865065
## 11 Treat -2.36506374
## 12 Treat 0.76960056
## 13 Treat 0.48947885
## 14 Treat -0.52356413
## 15 Treat -1.22151863
## 16 Treat 0.04299917
describe(x.df)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## condition* 1 16 1.50 0.52 1.50 1.50 0.74 1.00 2.0 1.00 0.00 -2.12 0.13
## results 2 16 -0.16 1.17 -0.29 -0.18 0.95 -2.37 2.4 4.77 0.08 -0.21 0.29
nrow(x.df) #how many rows
## [1] 16
ncol(x.df) #how many columns
## [1] 2
mean(x)
## [1] 500.5
mean(y)
## [1] 500.5
colMeans(x)
## [1] 50.5 150.5 250.5 350.5 450.5 550.5 650.5 750.5 850.5 950.5
colMeans(y)
## [1] 496 497 498 499 500 501 502 503 504 505
#use the apply function, applied to the columns of x
cm <- apply(x,2,mean) #same as column mean
cm
## [1] 50.5 150.5 250.5 350.5 450.5 550.5 650.5 750.5 850.5 950.5
ssd <- apply(x,2,sd) #column standard deviations
round(ssd,digits=2)
## [1] 29.01 29.01 29.01 29.01 29.01 29.01 29.01 29.01 29.01 29.01
smed <- apply(x,2,median)
smed #show the column medians
## [1] 50.5 150.5 250.5 350.5 450.5 550.5 650.5 750.5 850.5 950.5
What is the structure of the result of an apply operation?
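One quick way to answer this is to ask R directly: for the column means computed above, apply returns a plain numeric vector, with one element per column.
cm <- apply(x, 2, mean) #the same column means as before
str(cm) #num [1:10] 50.5 150.5 250.5 ... a simple vector
is.matrix(cm) #FALSE
length(cm) #10, one value per column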
x <- matrix(1:1000,ncol=10)
"dumbfunction" <- function(x) { #this one gives bad results? why?
nvar <- ncol(x)
nsub <- nrow(x)
mean <- rep(NA,nvar)
sum <- 0
for(j in 1:nvar) {
for (i in 1:nsub) {
sum <- sum+ x[i,j]
}
mean[j] <- sum/nsub
}
print(mean)
}
dumbfunction(x) #why are these results nonsensical?
## [1] 50.5 201.0 451.5 802.0 1252.5 1803.0 2453.5 3204.0 4054.5 5005.0
The previous results don't make any sense: how can the means exceed the largest values in the data? We need to move where we reset the sum to zero.
"notquitsodumb" <- function(x) { #this one gives better results? why?
nvar <- ncol(x)
nsub <- nrow(x)
mean <- rep(NA,nvar)
for(j in 1:nvar) {
sum <- 0 #we set the counter to 0 for each variable
for (i in 1:nsub) {
sum <- sum+ x[i,j]
}
mean[j] <- sum/nsub
}
print(mean)
}
notquitsodumb (x)
## [1] 50.5 150.5 250.5 350.5 450.5 550.5 650.5 750.5 850.5 950.5
We want to compare the cross validities of unit weighted scales (that is to say, just add the items up) versus factor weighted scales (which require a factor analysis and then use the factor loadings as weights to find the scores). We are interested in what happens if we apply the same weights to a different sample. We do this 100 times!
The data set (spi) has 4,000 observations with 10 criteria and 135 personality items. The Neuroticism scale of 14 items can be used as an example.
For each of 100 random samples of size 200 from the 4,000:
1. Form unit weighted scores.
2. Form factor scores (based upon a one dimensional factor analysis).
3. Find the validities of these scores for the 10 criteria.
4. Cross validate on the other 3,800 subjects.
Then pool these 100 samples and graph the results.
#use the spi data set and do it for Neuroticism
N.items <- selectFromKeys(spi.keys["Neuro"])
spi.N <- spi[N.items]
N.keys <- spi.keys["Neuro"]
#spi.ext.crit <- data.frame(spi[,1:10],spi.E)
n.obs <- 200
results <- list()
for(i in 1:100) {
ss <- sample (4000,n.obs,replace=FALSE)
f.ss <- fa(spi.N [ss,]) #find a one factor solution and score it
scores.ss <- data.frame(unit=scoreFast(N.keys,spi.N[ss,]),factor=f.ss$scores)
validities <- cor(scores.ss,spi[ss,1:10],use="pairwise")
cross <- data.frame(unit.cross=scoreFast(N.keys,spi.N[-ss,]),factor.cross = predict(f.ss,spi.N[-ss,]) )
cross.validities <- cor(cross,spi[-ss,1:10],use="pairwise" )
results[[i]]<-c(validities,cross.validities)
}
summary <- matrix(unlist(results),ncol=40,byrow=TRUE)
colnames(summary) <- c(paste0("unit",colnames(spi)[1:10]) , paste0("factor",colnames(spi)[1:10]),paste0("unit.cross",colnames(spi)[1:10]), paste0("factor.cross",colnames(spi)[1:10]))
describe(summary) #for textual output
## vars n mean sd median trimmed mad min max range skew kurtosis se
## unitage 1 100 -0.17 0.08 -0.16 -0.17 0.07 -0.35 0.01 0.36 -0.05 -0.36 0.01
## unitsex 2 100 -0.17 0.08 -0.17 -0.18 0.07 -0.36 0.00 0.36 0.01 -0.34 0.01
## unithealth 3 100 0.24 0.07 0.25 0.25 0.07 0.03 0.40 0.38 -0.39 0.10 0.01
## unitp1edu 4 100 0.25 0.07 0.26 0.26 0.07 0.03 0.40 0.37 -0.36 0.15 0.01
## unitp2edu 5 100 -0.34 0.07 -0.34 -0.34 0.06 -0.51 -0.18 0.34 0.04 -0.03 0.01
## uniteducation 6 100 -0.34 0.07 -0.33 -0.34 0.06 -0.51 -0.18 0.33 -0.04 -0.02 0.01
## unitwellness 7 100 -0.05 0.08 -0.04 -0.05 0.07 -0.27 0.11 0.37 -0.35 -0.21 0.01
## unitexer 8 100 -0.05 0.08 -0.04 -0.05 0.08 -0.27 0.10 0.38 -0.32 -0.32 0.01
## unitsmoke 9 100 -0.04 0.08 -0.03 -0.04 0.08 -0.25 0.12 0.37 -0.39 -0.37 0.01
## unitER 10 100 -0.04 0.08 -0.03 -0.04 0.08 -0.25 0.11 0.36 -0.43 -0.29 0.01
## factorage 11 100 -0.18 0.08 -0.18 -0.18 0.09 -0.40 0.11 0.51 0.18 0.24 0.01
## factorsex 12 100 -0.17 0.08 -0.17 -0.17 0.10 -0.38 0.11 0.49 0.17 0.16 0.01
## factorhealth 13 100 -0.02 0.07 -0.02 -0.02 0.09 -0.22 0.11 0.33 -0.30 -0.58 0.01
## factorp1edu 14 100 -0.02 0.07 -0.02 -0.02 0.08 -0.22 0.11 0.33 -0.22 -0.58 0.01
## factorp2edu 15 100 -0.17 0.07 -0.17 -0.17 0.08 -0.37 0.00 0.37 -0.23 -0.42 0.01
## factoreducation 16 100 -0.18 0.07 -0.17 -0.17 0.07 -0.36 -0.01 0.35 -0.27 -0.39 0.01
## factorwellness 17 100 0.06 0.07 0.06 0.06 0.06 -0.10 0.21 0.31 -0.16 -0.50 0.01
## factorexer 18 100 0.05 0.07 0.05 0.05 0.06 -0.10 0.20 0.30 -0.17 -0.43 0.01
## factorsmoke 19 100 0.12 0.08 0.12 0.13 0.08 -0.07 0.34 0.41 -0.02 -0.28 0.01
## factorER 20 100 0.12 0.08 0.13 0.12 0.08 -0.07 0.32 0.39 -0.08 -0.28 0.01
## unit.crossage 21 100 -0.17 0.00 -0.17 -0.17 0.00 -0.18 -0.16 0.02 0.15 -0.40 0.00
## unit.crosssex 22 100 -0.18 0.00 -0.18 -0.18 0.00 -0.19 -0.17 0.03 0.35 0.03 0.00
## unit.crosshealth 23 100 0.24 0.00 0.24 0.24 0.00 0.24 0.26 0.02 0.42 0.18 0.00
## unit.crossp1edu 24 100 0.25 0.01 0.25 0.25 0.01 0.24 0.27 0.03 0.29 -0.33 0.00
## unit.crossp2edu 25 100 -0.34 0.00 -0.34 -0.34 0.00 -0.35 -0.33 0.02 0.08 0.20 0.00
## unit.crosseducation 26 100 -0.33 0.00 -0.34 -0.33 0.00 -0.35 -0.32 0.02 0.24 -0.24 0.00
## unit.crosswellness 27 100 -0.05 0.00 -0.05 -0.05 0.00 -0.06 -0.04 0.02 0.38 -0.09 0.00
## unit.crossexer 28 100 -0.05 0.00 -0.05 -0.05 0.00 -0.06 -0.03 0.02 0.43 0.18 0.00
## unit.crosssmoke 29 100 -0.04 0.00 -0.04 -0.04 0.00 -0.05 -0.03 0.02 0.42 -0.17 0.00
## unit.crossER 30 100 -0.04 0.00 -0.04 -0.04 0.00 -0.05 -0.03 0.02 0.42 -0.24 0.00
## factor.crossage 31 100 -0.17 0.00 -0.17 -0.17 0.00 -0.19 -0.16 0.03 -0.14 0.09 0.00
## factor.crosssex 32 100 -0.17 0.00 -0.17 -0.17 0.00 -0.19 -0.16 0.03 -0.26 0.03 0.00
## factor.crosshealth 33 100 -0.02 0.00 -0.02 -0.02 0.00 -0.02 0.00 0.02 0.33 -0.39 0.00
## factor.crossp1edu 34 100 -0.01 0.00 -0.01 -0.01 0.00 -0.02 0.00 0.02 0.18 -0.47 0.00
## factor.crossp2edu 35 100 -0.18 0.00 -0.18 -0.18 0.00 -0.19 -0.17 0.02 0.24 -0.47 0.00
## factor.crosseducation 36 100 -0.18 0.00 -0.18 -0.18 0.00 -0.19 -0.17 0.02 0.13 0.22 0.00
## factor.crosswellness 37 100 0.06 0.00 0.06 0.06 0.00 0.05 0.06 0.02 0.05 -0.45 0.00
## factor.crossexer 38 100 0.05 0.00 0.05 0.05 0.00 0.04 0.06 0.02 0.03 -0.43 0.00
## factor.crosssmoke 39 100 0.12 0.00 0.12 0.12 0.00 0.11 0.13 0.02 -0.41 0.39 0.00
## factor.crossER 40 100 0.12 0.00 0.12 0.12 0.00 0.11 0.13 0.02 -0.37 0.47 0.00
error.dots(summary,head=20,tail=20,main ="Unit weighted and factor scores derivation and cross validation for Neuroticism (with 95% confidence )") #show it
abline(v=0)