See the accompanying slides at https://personality-project.org/courses/350/350.manipulating.data.pdf and https://personality-project.org/courses/350/350.functions.pdf

psych and psychTools have been updated (to 2.3.5). Get the most recent version.

The next lines (install.packages) are commented out, but you need to run them at least once.

#install.packages("psych",repos="http://personality-project.org/r", type="source")
#install.packages("psychTools",repos="http://personality-project.org/r", type="source")

library(psych) #make it active
library(psychTools)
sessionInfo()   #psych should be 2.3.5 and psychTools should be 2.3.5
## R Under development (unstable) (2023-03-17 r83997)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.3.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## 
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Chicago
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] psychTools_2.3.5 psych_2.3.5     
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.31   R6_2.5.1        fastmap_1.1.1   xfun_0.38       lattice_0.20-45 cachem_1.0.7   
##  [7] parallel_4.3.0  knitr_1.42      foreign_0.8-84  htmltools_0.5.5 rmarkdown_2.21  cli_3.6.1      
## [13] grid_4.3.0      sass_0.4.5      jquerylib_0.1.4 mnormt_2.1.1    compiler_4.3.0  rstudioapi_0.14
## [19] tools_4.3.0     nlme_3.1-162    evaluate_0.20   bslib_0.4.2     yaml_2.3.7      rlang_1.1.0    
## [25] jsonlite_1.8.4

Two functions have been added to psychTools

vJoin and recode

vJoin (to join two objects by variable names)

x <- matrix(1:30, ncol=5)
y <- matrix(1:40, ncol=8)
x
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    7   13   19   25
## [2,]    2    8   14   20   26
## [3,]    3    9   15   21   27
## [4,]    4   10   16   22   28
## [5,]    5   11   17   23   29
## [6,]    6   12   18   24   30
y
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,]    1    6   11   16   21   26   31   36
## [2,]    2    7   12   17   22   27   32   37
## [3,]    3    8   13   18   23   28   33   38
## [4,]    4    9   14   19   24   29   34   39
## [5,]    5   10   15   20   25   30   35   40
xy <- vJoin(x,y)
xy
##     x1 x2 x3 x4 x5 y1 y2 y3 y4 y5 y6 y7 y8
## Sx1  1  7 13 19 25 NA NA NA NA NA NA NA NA
## Sx2  2  8 14 20 26 NA NA NA NA NA NA NA NA
## Sx3  3  9 15 21 27 NA NA NA NA NA NA NA NA
## Sx4  4 10 16 22 28 NA NA NA NA NA NA NA NA
## Sx5  5 11 17 23 29 NA NA NA NA NA NA NA NA
## Sx6  6 12 18 24 30 NA NA NA NA NA NA NA NA
## Sy1 NA NA NA NA NA  1  6 11 16 21 26 31 36
## Sy2 NA NA NA NA NA  2  7 12 17 22 27 32 37
## Sy3 NA NA NA NA NA  3  8 13 18 23 28 33 38
## Sy4 NA NA NA NA NA  4  9 14 19 24 29 34 39
## Sy5 NA NA NA NA NA  5 10 15 20 25 30 35 40
headTail(xy, top=2, bottom=2)
##       x1   x2   x3   x4   x5   y1   y2   y3   y4   y5   y6   y7   y8
## Sx1    1    7   13   19   25 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## Sx2    2    8   14   20   26 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...
## Sy4 <NA> <NA> <NA> <NA> <NA>    4    9   14   19   24   29   34   39
## Sy5 <NA> <NA> <NA> <NA> <NA>    5   10   15   20   25   30   35   40
XY <- vJoin(x,y,  cnames=FALSE)
XY
##     x1 x2 x3 x4 x5 x6 x7 x8
## Sx1  1  7 13 19 25 NA NA NA
## Sx2  2  8 14 20 26 NA NA NA
## Sx3  3  9 15 21 27 NA NA NA
## Sx4  4 10 16 22 28 NA NA NA
## Sx5  5 11 17 23 29 NA NA NA
## Sx6  6 12 18 24 30 NA NA NA
## Sy1  1  6 11 16 21 26 31 36
## Sy2  2  7 12 17 22 27 32 37
## Sy3  3  8 13 18 23 28 33 38
## Sy4  4  9 14 19 24 29 34 39
## Sy5  5 10 15 20 25 30 35 40

Examine vJoin

?vJoin  #to see the help file
vJoin    #to see the code
## function (x, y, rnames = TRUE, cnames = TRUE) 
## {
##     n.x <- NCOL(x)
##     n.y <- NCOL(y)
##     n.obsx <- NROW(x)
##     n.obsy <- NROW(y)
##     if (is.null(colnames(x))) 
##         colnames(x) <- paste0("x", 1:n.x)
##     if (is.null(colnames(y))) 
##         if (cnames) {
##             colnames(y) <- paste0("y", 1:n.y)
##         }
##         else {
##             colnames(y) <- paste0("x", 1:n.y)
##         }
##     if (is.null(rownames(x))) 
##         rownames(x) <- paste0("Sx", 1:n.obsx)
##     if (is.null(rownames(y))) 
##         rownames(y) <- paste0("Sy", 1:n.obsy)
##     xy <- unique(c(colnames(x), colnames(y)))
##     n.xy.col <- length(xy)
##     if (!rnames) {
##         new.names <- n.obsx + 1:n.obsy
##         rownames(y) <- new.names
##     }
##     xy.rows <- unique(c(rownames(x), rownames(y)))
##     n.xy.rows <- length(xy.rows)
##     new <- data.frame(matrix(NA, ncol = n.xy.col, nrow = n.xy.rows))
##     colnames(new) <- xy
##     rownames(new) <- xy.rows
##     new[rownames(x), colnames(x)] <- x[rownames(x), colnames(x)]
##     new[rownames(y), colnames(y)] <- y[rownames(y), colnames(y)]
##     return(new)
## }
## <bytecode: 0x13a35e068>
## <environment: namespace:psychTools>

Another example (with variable names already)

temp <- bfi[1:10,1:6]  #a small subset of bfi
temp2 <- bfi[5:15,4:8] #an overlapping subset
temp12 <- vJoin(temp,temp2)
temp12
##       A1 A2 A3 A4 A5 C1 C2 C3
## 61617  2  4  3  4  4  2 NA NA
## 61618  2  4  5  2  5  5 NA NA
## 61620  5  4  5  4  4  4 NA NA
## 61621  4  4  6  5  5  4 NA NA
## 61622  2  3  3  4  5  4  4  5
## 61623  6  6  5  6  5  6  6  6
## 61624  2  5  5  3  5  5  4  4
## 61629  4  3  1  5  1  3  2  4
## 61630  4  3  6  3  3  6  6  3
## 61633  2  5  6  6  5  6  5  6
## 61634 NA NA NA  6  5  4  3  5
## 61636 NA NA NA  5  5  5  4  5
## 61637 NA NA NA  6  4  5  4  3
## 61639 NA NA NA  6  6  4  4  4
## 61640 NA NA NA  2  1  5  5  5

Recoding data

If some data are character with levels that do not make sense in terms of some ordinal variable, we can recode them.

First we create a vector with levels of education that we think make sense

education <- cs(high_school, some_college, college, in_graduate_school, graduate_degree)
temp <- recode(bfi,where=27, isvalue=1:5,newvalue=education)
bfi[50:70,27] #show the education levels
##  [1] 3 3 4 4 3 4 5 3 5 2 5 1 1 4 3 5 3 4 3 5 2
temp[50:70,27] #they correspond
##  [1] "college"            "college"            "in_graduate_school" "in_graduate_school" "college"           
##  [6] "in_graduate_school" "graduate_degree"    "college"            "graduate_degree"    "some_college"      
## [11] "graduate_degree"    "high_school"        "high_school"        "in_graduate_school" "college"           
## [16] "graduate_degree"    "college"            "in_graduate_school" "college"            "graduate_degree"   
## [21] "some_college"
table(bfi[27])
## education
##    1    2    3    4    5 
##  224  292 1249  394  418
table(temp[27])   #but these are not ordered by the numeric level
## education
##            college    graduate_degree        high_school in_graduate_school       some_college 
##               1249                418                224                394                292
#convert to numeric

When we describe the data, it is even worse: the character levels are treated as numbers in alphabetical order

describe(bfi[26:28])
##           vars    n  mean    sd median trimmed   mad min max range  skew kurtosis   se
## gender       1 2800  1.67  0.47      2    1.71  0.00   1   2     1 -0.73    -1.47 0.01
## education    2 2577  3.19  1.11      3    3.22  1.48   1   5     4 -0.05    -0.32 0.02
## age          3 2800 28.78 11.13     26   27.43 10.38   3  86    83  1.02     0.56 0.21
describe(temp[26:28])
##            vars    n  mean    sd median trimmed   mad min max range  skew kurtosis   se
## gender        1 2800  1.67  0.47      2    1.71  0.00   1   2     1 -0.73    -1.47 0.01
## education*    2 2577  2.25  1.46      2    2.06  1.48   1   5     4  0.73    -1.01 0.03
## age           3 2800 28.78 11.13     26   27.43 10.38   3  86    83  1.02     0.56 0.21
temp2 <- char2numeric(temp)
describe(temp2[26:28])
##            vars    n  mean    sd median trimmed   mad min max range  skew kurtosis   se
## gender        1 2800  1.67  0.47      2    1.71  0.00   1   2     1 -0.73    -1.47 0.01
## education*    2 2577  2.25  1.46      2    2.06  1.48   1   5     4  0.73    -1.01 0.03
## age           3 2800 28.78 11.13     26   27.43 10.38   3  86    83  1.02     0.56 0.21

How can we recode these to make sense?

new.education <- c(3,5,1,4,2)

temp3 <- recode(temp2,27,isvalue=1:6, newvalue=new.education)
describe(temp3[26:28])
##            vars    n  mean    sd median trimmed   mad min max range  skew kurtosis   se
## gender        1 2800  1.67  0.47      2    1.71  0.00   1   2     1 -0.73    -1.47 0.01
## education*    2 2577  3.19  1.11      3    3.22  1.48   1   5     4 -0.05    -0.32 0.02
## age           3 2800 28.78 11.13     26   27.43 10.38   3  86    83  1.02     0.56 0.21

Writing functions

Three basic approaches to using R

  1. Do everything from the console – run each line as you think about it.

Step through your operations one at a time, using R like a desk calculator. Try a command (calling some functions from a package).

  2. Use the text window and write out a few lines at a time and then run them (as we have been doing in class).

Keep a list of your most useful lines. Annotate them to remind you what you have done. Use RMarkdown to annotate your scripts and to show your output.

  3. Write little (initially) functions to which you can pass parameters, and build them into your own library of useful functions. You can then source these functions when you do your work; see the sketch below.
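For instance, a minimal sketch (the file name myFunctions.R and the little se() helper are only illustrations): save a function in its own file, then source that file whenever you need it.

"se" <- function(x, na.rm=TRUE) {  #standard error of the mean; save this in myFunctions.R (hypothetical name)
   sd(x, na.rm=na.rm)/sqrt(sum(!is.na(x)))
   }
#then, in any later session
#source("myFunctions.R")   #makes se available again
#se(sat.act$SATV)          #and apply it to a variable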

Some basic functions to help you get and move around your data

fn <- file.choose() #dynamically open your directory to find a file

file.choose(new=TRUE) creates a new file. It uses the normal computer search functions, but then creates a new file. This may only be available in regular R on a Mac, not in RStudio.

read.table(filename) reads tabular input from the file location.

read.csv(filename) reads a comma separated file

read.delim(filename) reads a tab delimited file

load(filename) “loads” objects from an .Rdata (or .rda) file

read.file(filename)* combines file.choose (if no file name is given) with the appropriate read command (e.g., read.table, read.spss, readRDS, etc.), chosen from the file suffix.
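A minimal sketch of how these are called (the file names here are hypothetical):

#my.data <- read.csv("study1.csv")      #comma separated, first row treated as the header
#my.data <- read.delim("study1.txt")    #tab delimited
#load("study1.Rdata")                   #restores the saved object(s) under their original names
#my.data <- read.file("study1.sav")     #read.file picks the reader from the suffix (here read.spss)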

Learn how functions work by examining the code in functions

One way to learn coding is to read functions. Try to understand how they work.

As an example of reading files and using functions, let's look at the read.file function. Note that default values are specified but could be changed.

Examine the effect of two functions (file_ext, switch)

There are a number of different ways to read files; these depend upon the specific type of file.

We look for many possible suffixes; others can easily be added if needed.

We use a function (file_ext) from the tools package. This finds out what the suffix (.csv, .sav, .txt, ...) is, and that suffix is then used in the switch function.
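A brief sketch of that logic (not the actual read.file code; myData.csv is a hypothetical file name):

library(tools)
demo.suffix <- file_ext("myData.csv")    #file_ext returns everything after the last '.', here "csv"
switch(demo.suffix,
       csv = "read it with read.table(sep=',')",
       sav = "read it with read.spss",
       txt = "read it with read.table",
       "I can not tell what type of file this is")   #the unnamed alternative is the default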

read.file #let's look at what it does
## function (file = NULL, header = TRUE, use.value.labels = FALSE, 
##     to.data.frame = TRUE, sep = ",", quote = "\"", widths = NULL, 
##     f = NULL, filetype = NULL, ...) 
## {
##     if (missing(f) && missing(file)) 
##         f <- file.choose()
##     if (missing(f) && !missing(file)) 
##         f <- file
##     suffix <- file_ext(f)
##     if (!missing(filetype)) 
##         suffix <- filetype
##     if (suffix %in% c("txt", "TXT", "text", "data", "dat", "DAT")) 
##         suffix <- "txt"
##     if (suffix %in% c("R", "r")) 
##         suffix <- "R"
##     if (suffix %in% c("rds", "RDS", "Rds")) 
##         suffix <- "rds"
##     if (suffix %in% c("Rda", "rda", "Rdata", "RData", "rdata")) 
##         suffix <- "Rda"
##     if (suffix %in% c("SYD", "syd", "sys", "SYS")) 
##         suffix <- "SYD"
##     if (!missing(widths)) {
##         result <- read.fwf(f, widths, ...)
##         message("The fixed width file ", f, "has been loaded.")
##     }
##     else {
##         switch(suffix, sav = {
##             result <- read.spss(f, use.value.labels = use.value.labels, 
##                 to.data.frame = to.data.frame)
##             message("Data from the SPSS sav file ", f, " has been loaded.")
##         }, csv = {
##             result <- read.table(f, header = header, sep = sep, 
##                 quote = quote, ...)
##             message("Data from the .csv file ", f, " has been loaded.")
##         }, tab = {
##             result <- read.table(f, header = header, sep = "\t", 
##                 ...)
##             message("Data from the .tab file ", f, " has been loaded.")
##         }, txt = {
##             result <- read.table(f, header = header, ...)
##             message("Data from the .txt file ", f, " has been loaded.")
##         }, rds = {
##             result <- try(readRDS(f, ...), silent = TRUE)
##             if (inherits(result, "try-error")) {
##                 result <- f
##                 message("I had problems reading this file, \ntry  load('", 
##                   f, "') instead. \nCaution, this might replace an object currently in your environment.")
##             } else {
##                 message("File ", f, " has been loaded.")
##             }
##         }, R = {
##             result <- dget(f, ...)
##             message("File ", f, " has been loaded.")
##         }, XPT = {
##             result <- read.xport(f, ...)
##             message("File ", f, " has been loaded.")
##         }, xpt = {
##             result <- read.xport(f, ...)
##             message("File ", f, " has been loaded.")
##         }, Rda = {
##             result <- f
##             message("To load this file (or these files) you need to load('", 
##                 f, "')   \nCaution, this might replace an object currently in your environment.")
##         }, SYD = {
##             result <- read.systat(f, to.data.frame = to.data.frame)
##             message("Data from the systat SYD file ", f, " has been loaded.")
##         }, jmp = {
##             result <- f
##             message("I am sorrry.  To read this .jmp file, it must first be saved as either a \"txt\" or \"csv\" file. If you insist on using SAS formats, try .xpt or .XPT")
##         }, sas7bdat = {
##             result <- f
##             message("I am sorry.  To read this .sas7bdat file, it must first be saved as either a xpt, or XPT file in SAS, or as a \"txt\" or \"csv\" file. ?read.ssd in foreign for help.")
##         }, {
##             message("I  am sorry. \nI can not tell from the suffix what file type is this.  Rather than try to read it, I will let you specify a better format.\n You might try specifying the filetype")
##         })
##     }
##     return(result)
## }
## <bytecode: 0x10cf00668>
## <environment: namespace:psychTools>

read.file can be used for local files or remote locations.

file.choose will search your directory for a file. file_ext then examines that file name to identify what it is.

We do not run file.choose here because it is dynamic and requires user input.

library(tools)
file.name <- "/Users/WR/Box Sync/pmc_folder/Courses.18/350/datasets/cars.rda"
file.name2 <- "/Users/WR/Box Sync/pmc_folder/Courses.18/350/datasets/glbwarm.sav"
file.name3 <- "/Users/WR/Box Sync/pmc_folder/Courses.18/350/datasets/simulation.txt"
suffix <- file_ext(file.name)
suffix2 <- file_ext(file.name2)
suffix3 <- file_ext(file.name3)
suffix;  suffix2; suffix3   #we can string several lines together using ; 
## [1] "rda"
## [1] "sav"
## [1] "txt"
# now read these files
cars <- read.file(file.name)
## To load this file (or these files) you need to load('/Users/WR/Box Sync/pmc_folder/Courses.18/350/datasets/cars.rda')   
## Caution, this might replace an object currently in your environment.
dim(cars)
## NULL
#glb  <- read.file(file.name2)
#dim(glb)
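To actually get the data out of the .rda file we follow that message and use load. A sketch (assuming the object saved in cars.rda was itself named cars; the real name depends upon what was saved):

#load(file.name)   #restores whatever objects were saved in cars.rda, under their original names
#dim(cars)         #if the saved object was called cars, this now gives its dimensions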

Basic descriptives of a file

ls()    #show the objects in my workspace
##  [1] "cars"          "education"     "file.name"     "file.name2"    "file.name3"    "new.education"
##  [7] "suffix"        "suffix2"       "suffix3"       "temp"          "temp12"        "temp2"        
## [13] "temp3"         "x"             "xy"            "XY"            "y"
#rm(list=ls())   #CAREFUL, this will remove all the objects in your workspace!
dim(sat.act)  #how many rows and columns in the sat.act object
## [1] 700   6
names(sat.act)
## [1] "gender"    "education" "age"       "ACT"       "SATV"      "SATQ"
colnames(sat.act)
## [1] "gender"    "education" "age"       "ACT"       "SATV"      "SATQ"
head(sat.act)  #show the first 6 lines (the default)
##       gender education age ACT SATV SATQ
## 29442      2         3  19  24  500  500
## 29457      2         3  23  35  600  500
## 29498      2         3  20  21  480  470
## 29503      1         4  27  26  550  520
## 29504      1         2  33  31  600  550
## 29518      1         5  26  28  640  640
tail(sat.act) #show the last lines
##       gender education age ACT SATV SATQ
## 39904      2         3  20  26  710  680
## 39915      1         3  25  30  500  500
## 39937      1         4  40  27  613  630
## 39951      2         3  24  31  700  630
## 39961      1         4  35  32  700  780
## 39985      1         5  25  25  600  600
headTail(sat.act)  #combine head and tail
##       gender education age ACT SATV SATQ
## 29442      2         3  19  24  500  500
## 29457      2         3  23  35  600  500
## 29498      2         3  20  21  480  470
## 29503      1         4  27  26  550  520
## ...      ...       ... ... ...  ...  ...
## 39937      1         4  40  27  613  630
## 39951      2         3  24  31  700  630
## 39961      1         4  35  32  700  780
## 39985      1         5  25  25  600  600
mod <- lm(SATV ~ gender + age + education, data =sat.act)
length(mod) #how many elements are in mod?
## [1] 12
names(mod)  #what are their names
##  [1] "coefficients"  "residuals"     "effects"       "rank"          "fitted.values" "assign"       
##  [7] "qr"            "df.residual"   "xlevels"       "call"          "terms"         "model"
str(mod)  #what is the structure of mod
## List of 12
##  $ coefficients : Named num [1:4] 628.26 -7.08 -1.19 8.23
##   ..- attr(*, "names")= chr [1:4] "(Intercept)" "gender" "age" "education"
##  $ residuals    : Named num [1:700] -116.22 -11.47 -135.03 -72.02 1.56 ...
##   ..- attr(*, "names")= chr [1:700] "29442" "29457" "29498" "29503" ...
##  $ effects      : Named num [1:700] -16198.2 56.25 -127.63 -257.53 -1.68 ...
##   ..- attr(*, "names")= chr [1:700] "(Intercept)" "gender" "age" "education" ...
##  $ rank         : int 4
##  $ fitted.values: Named num [1:700] 616 611 615 622 598 ...
##   ..- attr(*, "names")= chr [1:700] "29442" "29457" "29498" "29503" ...
##  $ assign       : int [1:4] 0 1 2 3
##  $ qr           :List of 5
##   ..$ qr   : num [1:700, 1:4] -26.4575 0.0378 0.0378 0.0378 0.0378 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:700] "29442" "29457" "29498" "29503" ...
##   .. .. ..$ : chr [1:4] "(Intercept)" "gender" "age" "education"
##   .. ..- attr(*, "assign")= int [1:4] 0 1 2 3
##   ..$ qraux: num [1:4] 1.04 1.03 1.02 1.03
##   ..$ pivot: int [1:4] 1 2 3 4
##   ..$ tol  : num 1e-07
##   ..$ rank : int 4
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 696
##  $ xlevels      : Named list()
##  $ call         : language lm(formula = SATV ~ gender + age + education, data = sat.act)
##  $ terms        :Classes 'terms', 'formula'  language SATV ~ gender + age + education
##   .. ..- attr(*, "variables")= language list(SATV, gender, age, education)
##   .. ..- attr(*, "factors")= int [1:4, 1:3] 0 1 0 0 0 0 1 0 0 0 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:4] "SATV" "gender" "age" "education"
##   .. .. .. ..$ : chr [1:3] "gender" "age" "education"
##   .. ..- attr(*, "term.labels")= chr [1:3] "gender" "age" "education"
##   .. ..- attr(*, "order")= int [1:3] 1 1 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(SATV, gender, age, education)
##   .. ..- attr(*, "dataClasses")= Named chr [1:4] "numeric" "numeric" "numeric" "numeric"
##   .. .. ..- attr(*, "names")= chr [1:4] "SATV" "gender" "age" "education"
##  $ model        :'data.frame':   700 obs. of  4 variables:
##   ..$ SATV     : int [1:700] 500 600 480 550 600 640 610 520 400 730 ...
##   ..$ gender   : int [1:700] 2 2 2 1 1 1 2 1 2 2 ...
##   ..$ age      : int [1:700] 19 23 20 27 33 26 30 19 23 40 ...
##   ..$ education: int [1:700] 3 3 3 4 2 5 5 3 4 5 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language SATV ~ gender + age + education
##   .. .. ..- attr(*, "variables")= language list(SATV, gender, age, education)
##   .. .. ..- attr(*, "factors")= int [1:4, 1:3] 0 1 0 0 0 0 1 0 0 0 ...
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:4] "SATV" "gender" "age" "education"
##   .. .. .. .. ..$ : chr [1:3] "gender" "age" "education"
##   .. .. ..- attr(*, "term.labels")= chr [1:3] "gender" "age" "education"
##   .. .. ..- attr(*, "order")= int [1:3] 1 1 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(SATV, gender, age, education)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:4] "numeric" "numeric" "numeric" "numeric"
##   .. .. .. ..- attr(*, "names")= chr [1:4] "SATV" "gender" "age" "education"
##  - attr(*, "class")= chr "lm"
summary(mod)   #summary returns summary statistics appropriate for the object
## 
## Call:
## lm(formula = SATV ~ gender + age + education, data = sat.act)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -411.51  -67.70   12.16   82.62  207.28 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  628.263     19.399  32.386   <2e-16 ***
## gender        -7.082      8.971  -0.789   0.4302    
## age           -1.188      0.538  -2.208   0.0276 *  
## education      8.229      3.598   2.287   0.0225 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 112.6 on 696 degrees of freedom
## Multiple R-squared:  0.009626,   Adjusted R-squared:  0.005357 
## F-statistic: 2.255 on 3 and 696 DF,  p-value: 0.08076
summary(sat.act) #but they are not very useful summaries for data
##      gender        education          age             ACT             SATV            SATQ      
##  Min.   :1.000   Min.   :0.000   Min.   :13.00   Min.   : 3.00   Min.   :200.0   Min.   :200.0  
##  1st Qu.:1.000   1st Qu.:3.000   1st Qu.:19.00   1st Qu.:25.00   1st Qu.:550.0   1st Qu.:530.0  
##  Median :2.000   Median :3.000   Median :22.00   Median :29.00   Median :620.0   Median :620.0  
##  Mean   :1.647   Mean   :3.164   Mean   :25.59   Mean   :28.55   Mean   :612.2   Mean   :610.2  
##  3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.:29.00   3rd Qu.:32.00   3rd Qu.:700.0   3rd Qu.:700.0  
##  Max.   :2.000   Max.   :5.000   Max.   :65.00   Max.   :36.00   Max.   :800.0   Max.   :800.0  
##                                                                                  NA's   :13
describe(sat.act) #more useful summary statistics
##           vars   n   mean     sd median trimmed    mad min max range  skew kurtosis   se
## gender       1 700   1.65   0.48      2    1.68   0.00   1   2     1 -0.61    -1.62 0.02
## education    2 700   3.16   1.43      3    3.31   1.48   0   5     5 -0.68    -0.07 0.05
## age          3 700  25.59   9.50     22   23.86   5.93  13  65    52  1.64     2.42 0.36
## ACT          4 700  28.55   4.82     29   28.84   4.45   3  36    33 -0.66     0.53 0.18
## SATV         5 700 612.23 112.90    620  619.45 118.61 200 800   600 -0.64     0.33 4.27
## SATQ         6 687 610.22 115.64    620  617.25 118.61 200 800   600 -0.59    -0.02 4.41
table(sat.act$education)  #break this down for one column
## 
##   0   1   2   3   4   5 
##  57  45  44 275 138 141
table(sat.act[c("education","gender")] )  #do the two way table
##          gender
## education   1   2
##         0  27  30
##         1  20  25
##         2  23  21
##         3  80 195
##         4  51  87
##         5  46  95
t <- table(sat.act[c("education","gender")] )  #do it again, but save it as an object
chisq.test(t)  #do the chi square on this table
## 
##  Pearson's Chi-squared test
## 
## data:  t
## X-squared = 16.085, df = 5, p-value = 0.006605

Creating matrices and data frames

Two of the most useful data structures are matrices and data.frames. Matrices are all of one type (logical, character, numeric), while data frames can be mixtures of all data types.
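A quick illustration of that difference (a brief sketch): binding a character vector onto numbers coerces the whole matrix to character, while a data frame keeps each column's own type.

nums  <- 1:3
chars <- c("a","b","c")
cbind(nums, chars)             #a character matrix -- the numbers are coerced to character
data.frame(nums, chars)        #a data frame keeps each column's own type
str(data.frame(nums, chars))   #check the column types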

Create a small matrix with the first 1,000 integers.

x <- matrix(1:1000,ncol=10)
y <- matrix(1:1000,nrow=100,byrow=TRUE)  #specify the number of rows and fill by rows
headTail(x)
##      X1  X2  X3  X4  X5  X6  X7  X8  X9  X10
## 1     1 101 201 301 401 501 601 701 801  901
## 2     2 102 202 302 402 502 602 702 802  902
## 3     3 103 203 303 403 503 603 703 803  903
## 4     4 104 204 304 404 504 604 704 804  904
## ... ... ... ... ... ... ... ... ... ...  ...
## 97   97 197 297 397 497 597 697 797 897  997
## 98   98 198 298 398 498 598 698 798 898  998
## 99   99 199 299 399 499 599 699 799 899  999
## 100 100 200 300 400 500 600 700 800 900 1000
headTail(y)  #note how the elements are in a different order if we specify byrow
##      X1  X2  X3  X4  X5  X6  X7  X8  X9  X10
## 1     1   2   3   4   5   6   7   8   9   10
## 2    11  12  13  14  15  16  17  18  19   20
## 3    21  22  23  24  25  26  27  28  29   30
## 4    31  32  33  34  35  36  37  38  39   40
## ... ... ... ... ... ... ... ... ... ...  ...
## 97  961 962 963 964 965 966 967 968 969  970
## 98  971 972 973 974 975 976 977 978 979  980
## 99  981 982 983 984 985 986 987 988 989  990
## 100 991 992 993 994 995 996 997 998 999 1000
xy <- cbind(x,y)  #column bind them
dim(xy)  #what are the dimensions
## [1] 100  20
yx <- rbind(x,y)  #row bind the data sets
dim(yx)
## [1] 200  10
headTail(xy)
##      X1  X2  X3  X4  X5  X6  X7  X8  X9  X10 X11 X12 X13 X14 X15 X16 X17 X18 X19  X20
## 1     1 101 201 301 401 501 601 701 801  901   1   2   3   4   5   6   7   8   9   10
## 2     2 102 202 302 402 502 602 702 802  902  11  12  13  14  15  16  17  18  19   20
## 3     3 103 203 303 403 503 603 703 803  903  21  22  23  24  25  26  27  28  29   30
## 4     4 104 204 304 404 504 604 704 804  904  31  32  33  34  35  36  37  38  39   40
## ... ... ... ... ... ... ... ... ... ...  ... ... ... ... ... ... ... ... ... ...  ...
## 97   97 197 297 397 497 597 697 797 897  997 961 962 963 964 965 966 967 968 969  970
## 98   98 198 298 398 498 598 698 798 898  998 971 972 973 974 975 976 977 978 979  980
## 99   99 199 299 399 499 599 699 799 899  999 981 982 983 984 985 986 987 988 989  990
## 100 100 200 300 400 500 600 700 800 900 1000 991 992 993 994 995 996 997 998 999 1000
headTail(yx)
##      X1  X2  X3  X4  X5  X6  X7  X8  X9  X10
## 1     1 101 201 301 401 501 601 701 801  901
## 2     2 102 202 302 402 502 602 702 802  902
## 3     3 103 203 303 403 503 603 703 803  903
## 4     4 104 204 304 404 504 604 704 804  904
## ... ... ... ... ... ... ... ... ... ...  ...
## 197 961 962 963 964 965 966 967 968 969  970
## 198 971 972 973 974 975 976 977 978 979  980
## 199 981 982 983 984 985 986 987 988 989  990
## 200 991 992 993 994 995 996 997 998 999 1000

Matrices can be turned into data frames if we want

x.df <- data.frame(x)  
headTail(x.df)
##      X1  X2  X3  X4  X5  X6  X7  X8  X9  X10
## 1     1 101 201 301 401 501 601 701 801  901
## 2     2 102 202 302 402 502 602 702 802  902
## 3     3 103 203 303 403 503 603 703 803  903
## 4     4 104 204 304 404 504 604 704 804  904
## ... ... ... ... ... ... ... ... ... ...  ...
## 97   97 197 297 397 497 597 697 797 897  997
## 98   98 198 298 398 498 598 698 798 898  998
## 99   99 199 299 399 499 599 699 799 899  999
## 100 100 200 300 400 500 600 700 800 900 1000

But we can do things to data frames that we cannot do to matrices

 condition <- gl(2, 8, labels = c("Control", "Treat"))
x.df <- data.frame(condition,results=rnorm(16))
x.df 
##    condition     results
## 1    Control  0.21436178
## 2    Control  2.40046205
## 3    Control -0.53549466
## 4    Control -0.34526300
## 5    Control  0.76010132
## 6    Control  1.13454875
## 7    Control -0.35354696
## 8    Control -2.02310623
## 9      Treat -0.70711836
## 10     Treat -0.22865065
## 11     Treat -2.36506374
## 12     Treat  0.76960056
## 13     Treat  0.48947885
## 14     Treat -0.52356413
## 15     Treat -1.22151863
## 16     Treat  0.04299917
describe(x.df)
##            vars  n  mean   sd median trimmed  mad   min max range skew kurtosis   se
## condition*    1 16  1.50 0.52   1.50    1.50 0.74  1.00 2.0  1.00 0.00    -2.12 0.13
## results       2 16 -0.16 1.17  -0.29   -0.18 0.95 -2.37 2.4  4.77 0.08    -0.21 0.29
nrow(x.df)  #how many rows
## [1] 16
ncol(x.df) #how many columns
## [1] 2

Some basic statistics

mean(x)
## [1] 500.5
mean(y)
## [1] 500.5
colMeans(x)
##  [1]  50.5 150.5 250.5 350.5 450.5 550.5 650.5 750.5 850.5 950.5
colMeans(y)
##  [1] 496 497 498 499 500 501 502 503 504 505
#use the apply function, applied to the columns of the x
cm <- apply(x,2,mean)   #same as column mean
cm
##  [1]  50.5 150.5 250.5 350.5 450.5 550.5 650.5 750.5 850.5 950.5
ssd <- apply(x,2,sd)  #column standard deviations
round(ssd,digits=2)
##  [1] 29.01 29.01 29.01 29.01 29.01 29.01 29.01 29.01 29.01 29.01
smed <- apply(x,2,median)
smed  #show the column medians
##  [1]  50.5 150.5 250.5 350.5 450.5 550.5 650.5 750.5 850.5 950.5

What is the structure of the result of an apply operation?
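We can check directly (a brief sketch): when the function returns one value per column, apply returns a vector; when it returns several values per column, apply returns a matrix.

str(cm)                   #the column means found above with apply are just a numeric vector
str(apply(x, 2, range))   #a function returning two values per column gives a 2 x 10 matrix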

x <- matrix(1:1000,ncol=10)

"dumbfunction" <- function(x) {  #this one gives bad results? why?
  nvar <- ncol(x)
  nsub <- nrow(x)
  mean <- rep(NA,nvar)
  sum <- 0
  for(j in 1:nvar) {
    
     for (i in 1:nsub) {
       sum <- sum+ x[i,j]
     }
    
    mean[j] <- sum/nsub
  }
print(mean)
}

dumbfunction(x)  #why are these results non-sensical?
##  [1]   50.5  201.0  451.5  802.0 1252.5 1803.0 2453.5 3204.0 4054.5 5005.0

Examine the results. Do they pass a sanity test?

The previous results don’t make any sense: how can a mean exceed the largest value in the data? The problem is where we zero out the running sum; it needs to be reset inside the loop over variables.

"notquitsodumb" <- function(x) {  #this one gives better results? why?
  nvar <- ncol(x)
  nsub <- nrow(x)
  mean <- rep(NA,nvar)
 
  for(j in 1:nvar) {
    sum <- 0     #we set the counter to 0 for each variable
     for (i in 1:nsub) {
       sum <- sum+ x[i,j]
     }
    
    mean[j] <- sum/nsub
  }
print(mean)
}
notquitsodumb (x)
##  [1]  50.5 150.5 250.5 350.5 450.5 550.5 650.5 750.5 850.5 950.5

Simulation study

We want to compare the cross validities of unit weighted scales (that is to say, just add up the items) versus factor weighted scales (which require a factor analysis and then use the weights from the factor loadings to find the scores). We are interested in what happens if we apply the same weights to a different sample.

We do this 100 times!

  1. The data set has 4,000 observations with 10 criteria and 135 personality items

  2. The Neuroticism scale of 14 items can be used as an example

  3. For each of 100 random samples of size 200 from the 4000,

Form unit weighted scores

Form factor scores (based upon a one dimensional factor analysis)

Find the validities of these scores for the 10 criteria

Cross validate on the other 3800 subjects

  4. Pool these 100 samples

  5. Graph the results

Simulation of 100 replications of scoring

 #use the spi data set and do it for Neuroticism
N.items <- selectFromKeys(spi.keys["Neuro"])
spi.N <- spi[N.items]
N.keys <- spi.keys["Neuro"]
#spi.ext.crit <- data.frame(spi[,1:10],spi.E)
n.obs  <- 200
results <- list()
for(i in 1:100) {
ss <- sample (4000,n.obs,replace=FALSE)
f.ss <- fa(spi.N [ss,])  #find a one factor solution and score it
scores.ss <- data.frame(unit=scoreFast(N.keys,spi.N[ss,]),factor=f.ss$scores)
validities <- cor(scores.ss,spi[ss,1:10],use="pairwise")
cross <- data.frame(unit.cross=scoreFast(N.keys,spi.N[-ss,]),factor.cross = predict(f.ss,spi.N[-ss,]) )
cross.validities <- cor(cross,spi[-ss,1:10],use="pairwise" )
results[[i]]<-c(validities,cross.validities)
}

summary <- matrix(unlist(results),ncol=40,byrow=TRUE)
colnames(summary) <- c(paste0("unit",colnames(spi)[1:10]) , paste0("factor",colnames(spi)[1:10]),paste0("unit.cross",colnames(spi)[1:10]), paste0("factor.cross",colnames(spi)[1:10]))

describe(summary)  #for textual output
##                       vars   n  mean   sd median trimmed  mad   min   max range  skew kurtosis   se
## unitage                  1 100 -0.17 0.08  -0.16   -0.17 0.07 -0.35  0.01  0.36 -0.05    -0.36 0.01
## unitsex                  2 100 -0.17 0.08  -0.17   -0.18 0.07 -0.36  0.00  0.36  0.01    -0.34 0.01
## unithealth               3 100  0.24 0.07   0.25    0.25 0.07  0.03  0.40  0.38 -0.39     0.10 0.01
## unitp1edu                4 100  0.25 0.07   0.26    0.26 0.07  0.03  0.40  0.37 -0.36     0.15 0.01
## unitp2edu                5 100 -0.34 0.07  -0.34   -0.34 0.06 -0.51 -0.18  0.34  0.04    -0.03 0.01
## uniteducation            6 100 -0.34 0.07  -0.33   -0.34 0.06 -0.51 -0.18  0.33 -0.04    -0.02 0.01
## unitwellness             7 100 -0.05 0.08  -0.04   -0.05 0.07 -0.27  0.11  0.37 -0.35    -0.21 0.01
## unitexer                 8 100 -0.05 0.08  -0.04   -0.05 0.08 -0.27  0.10  0.38 -0.32    -0.32 0.01
## unitsmoke                9 100 -0.04 0.08  -0.03   -0.04 0.08 -0.25  0.12  0.37 -0.39    -0.37 0.01
## unitER                  10 100 -0.04 0.08  -0.03   -0.04 0.08 -0.25  0.11  0.36 -0.43    -0.29 0.01
## factorage               11 100 -0.18 0.08  -0.18   -0.18 0.09 -0.40  0.11  0.51  0.18     0.24 0.01
## factorsex               12 100 -0.17 0.08  -0.17   -0.17 0.10 -0.38  0.11  0.49  0.17     0.16 0.01
## factorhealth            13 100 -0.02 0.07  -0.02   -0.02 0.09 -0.22  0.11  0.33 -0.30    -0.58 0.01
## factorp1edu             14 100 -0.02 0.07  -0.02   -0.02 0.08 -0.22  0.11  0.33 -0.22    -0.58 0.01
## factorp2edu             15 100 -0.17 0.07  -0.17   -0.17 0.08 -0.37  0.00  0.37 -0.23    -0.42 0.01
## factoreducation         16 100 -0.18 0.07  -0.17   -0.17 0.07 -0.36 -0.01  0.35 -0.27    -0.39 0.01
## factorwellness          17 100  0.06 0.07   0.06    0.06 0.06 -0.10  0.21  0.31 -0.16    -0.50 0.01
## factorexer              18 100  0.05 0.07   0.05    0.05 0.06 -0.10  0.20  0.30 -0.17    -0.43 0.01
## factorsmoke             19 100  0.12 0.08   0.12    0.13 0.08 -0.07  0.34  0.41 -0.02    -0.28 0.01
## factorER                20 100  0.12 0.08   0.13    0.12 0.08 -0.07  0.32  0.39 -0.08    -0.28 0.01
## unit.crossage           21 100 -0.17 0.00  -0.17   -0.17 0.00 -0.18 -0.16  0.02  0.15    -0.40 0.00
## unit.crosssex           22 100 -0.18 0.00  -0.18   -0.18 0.00 -0.19 -0.17  0.03  0.35     0.03 0.00
## unit.crosshealth        23 100  0.24 0.00   0.24    0.24 0.00  0.24  0.26  0.02  0.42     0.18 0.00
## unit.crossp1edu         24 100  0.25 0.01   0.25    0.25 0.01  0.24  0.27  0.03  0.29    -0.33 0.00
## unit.crossp2edu         25 100 -0.34 0.00  -0.34   -0.34 0.00 -0.35 -0.33  0.02  0.08     0.20 0.00
## unit.crosseducation     26 100 -0.33 0.00  -0.34   -0.33 0.00 -0.35 -0.32  0.02  0.24    -0.24 0.00
## unit.crosswellness      27 100 -0.05 0.00  -0.05   -0.05 0.00 -0.06 -0.04  0.02  0.38    -0.09 0.00
## unit.crossexer          28 100 -0.05 0.00  -0.05   -0.05 0.00 -0.06 -0.03  0.02  0.43     0.18 0.00
## unit.crosssmoke         29 100 -0.04 0.00  -0.04   -0.04 0.00 -0.05 -0.03  0.02  0.42    -0.17 0.00
## unit.crossER            30 100 -0.04 0.00  -0.04   -0.04 0.00 -0.05 -0.03  0.02  0.42    -0.24 0.00
## factor.crossage         31 100 -0.17 0.00  -0.17   -0.17 0.00 -0.19 -0.16  0.03 -0.14     0.09 0.00
## factor.crosssex         32 100 -0.17 0.00  -0.17   -0.17 0.00 -0.19 -0.16  0.03 -0.26     0.03 0.00
## factor.crosshealth      33 100 -0.02 0.00  -0.02   -0.02 0.00 -0.02  0.00  0.02  0.33    -0.39 0.00
## factor.crossp1edu       34 100 -0.01 0.00  -0.01   -0.01 0.00 -0.02  0.00  0.02  0.18    -0.47 0.00
## factor.crossp2edu       35 100 -0.18 0.00  -0.18   -0.18 0.00 -0.19 -0.17  0.02  0.24    -0.47 0.00
## factor.crosseducation   36 100 -0.18 0.00  -0.18   -0.18 0.00 -0.19 -0.17  0.02  0.13     0.22 0.00
## factor.crosswellness    37 100  0.06 0.00   0.06    0.06 0.00  0.05  0.06  0.02  0.05    -0.45 0.00
## factor.crossexer        38 100  0.05 0.00   0.05    0.05 0.00  0.04  0.06  0.02  0.03    -0.43 0.00
## factor.crosssmoke       39 100  0.12 0.00   0.12    0.12 0.00  0.11  0.13  0.02 -0.41     0.39 0.00
## factor.crossER          40 100  0.12 0.00   0.12    0.12 0.00  0.11  0.13  0.02 -0.37     0.47 0.00
error.dots(summary,head=20,tail=20,main ="Unit weighted and factor scores derivation and cross validation for Neuroticism (with 95% confidence )")   #show it 
abline(v=0)