% \VignetteIndexEntry{Introduction to the psych package}
% \VignettePackage{psych}
% \VignetteKeywords{multivariate}
% \VignetteKeyword{models}
% \VignetteKeyword{Hplot}
%\VignetteDepends{psych}
%\documentclass[doc]{apa}
%\VignetteEncoding{UTF-8}
\documentclass[11pt]{article}
%\documentclass[11pt]{amsart}
\usepackage{geometry} % See geometry.pdf to learn the layout options. There are lots.
\geometry{letterpaper} % ... or a4paper or a5paper or ...
%\geometry{landscape} % Activate for rotated page geometry
\usepackage[parfill]{parskip} % Activate to begin paragraphs with an empty line rather than an indent
\usepackage{graphicx}
\usepackage{amssymb}
\usepackage{epstopdf}
\usepackage{mathptmx}
\usepackage{helvet}
\usepackage{courier}
\usepackage{makeidx} % allows index generation
\usepackage[authoryear,round]{natbib}
%\usepackage{gensymb}
%\usepackage{longtable}
%\usepackage{geometry}
\usepackage{amsmath}
%\usepackage{siunitx}
%\DeclareGraphicsRule{.tif}{png}{.png}{`convert #1 `dirname #1`/`basename #1 .tif`.png}
\usepackage{Sweave} % (replaced with knitR code)
%\usepackage{/Volumes/'Macintosh HD'/Library/Frameworks/R.framework/Versions/2.13/Resources/share/texmf/tex/latex/Sweave}
%\usepackage[ae]{Rd}
%\usepackage[usenames]{color}
%\usepackage{setspace}
\bibstyle{apacite}
\bibliographystyle{apa} %this one plus author year seems to work?
%\usepackage{hyperref}
\usepackage[colorlinks=true,citecolor=blue]{hyperref} %this makes reference links hyperlinks in pdf!
\usepackage{fancyvrb} %allows us to define environments
\DeclareGraphicsRule{.tif}{png}{.png}{`convert #1 `dirname #1`/`basename #1 .tif`.png}
\usepackage{multicol} % used for the two-column index
\usepackage[bottom]{footmisc}% places footnotes at page bottom
\let\proglang=\textsf
\newcommand{\R}{\proglang{R}}
%\newcommand{\pkg}[1]{{\normalfont\fontseries{b}\selectfont #1}}
\newcommand{\Rfunction}[1]{{\texttt{#1}}}
\newcommand{\fun}[1]{{\texttt{#1}\index{#1}\index{R function!#1}}}
\newcommand{\pfun}[1]{{\texttt{#1}\index{#1}\index{R function!#1}\index{R function!psych package!#1}}}
\newcommand{\Rc}[1]{{\texttt{#1}}} %R command same as Robject
\newcommand{\Robject}[1]{{\texttt{#1}}}
\newcommand{\Rpkg}[1]{{\textit{#1}\index{#1}\index{R package!#1}}} %different from pkg - which is better?
\newcommand{\iemph}[1]{{\emph{#1}\index{#1}}}
\newcommand{\wrc}[1]{\marginpar{\textcolor{blue}{#1}}} %bill's comments
\newcommand{\wra}[1]{\textcolor{blue}{#1}} %bill's comments
\newcommand{\ve}[1]{{\textbf{#1}}} %trying to get a vector command
\DefineVerbatimEnvironment{Sinput}{Verbatim}
{fontseries=b, fontsize=\scriptsize, frame=single, label=\fbox{R code}, framesep=5mm}
\DefineVerbatimEnvironment{Rinput}{Verbatim}
{fontseries=b, fontsize=\scriptsize, frame=single, label=\fbox{R code}, framesep=5mm}
\makeindex % used for the subject index
\title{An introduction to the psych package: Part I: \\ data entry and data description}
\author{William Revelle\\Department of Psychology\\Northwestern University}
%\affiliation{Northwestern University}
%\acknowledgements{Written to accompany the psych package.
Comments should be directed to William Revelle \\ \url{revelle@northwestern.edu}}
%\date{} % Activate to display a given date or no date
\begin{document}
%\SweaveOpts{concordance=TRUE,(prompt=" ",continue=" "}
%\SweaveOpts{(prompt=" ",continue=" ")
\maketitle
\tableofcontents
\newpage
\subsection{Jump starting the \Rpkg{psych} package--a guide for the impatient}
You have installed \Rpkg{psych} (section \ref{sect:starting}) and you want to use it without reading much more. What should you do?
\begin{enumerate}
\item Activate the \Rpkg{psych} package and the \Rpkg{psychTools} package:
\begin{Rinput}
library(psych)
library(psychTools)
\end{Rinput}
\item Input your data (section \ref{sect:read}). There are two ways to do this:
\begin{itemize}
\item Find and read standard files using \pfun{read.file}. This will open a search window for your operating system which you can use to find the file. If the file has a suffix of .text, .txt, .TXT, .csv, .dat, .data, .sav, .xpt, .XPT, .r, .R, .rds, .Rds, .rda, .Rda, .rdata, .Rdata, or .RData, then the file will be opened and the data will be read in (or loaded in the case of .Rda files)
\begin{scriptsize}
%%\begin{Schunk}
\begin{Sinput}
myData <- read.file()   #find the appropriate file using
                        #your normal operating system
\end{Sinput}
%%\end{Schunk}
\end{scriptsize}
\item Alternatively, go to your friendly text editor or data manipulation program (e.g., Excel) and copy the data to the clipboard. Include a first line that has the variable labels. Paste it into \Rpkg{psych} using the \pfun{read.clipboard.tab} command:
\begin{scriptsize}
%%\begin{Schunk}
\begin{Sinput}
myData <- read.clipboard.tab()  # if on the clipboard
\end{Sinput}
%\end{Schunk}
\end{scriptsize}
Note that there are a number of options for \pfun{read.clipboard} for reading in Excel based files, lower triangular files, etc.
\end{itemize}
\item Make sure that what you just read is right. Describe it (section~\ref{sect:describe}) and perhaps look at the first and last few lines. If you have multiple groups, try \pfun{describeBy}.
\begin{scriptsize}
%%\begin{Schunk}
\begin{Sinput}
dim(myData)  #What are the dimensions of the data?
describe(myData)  # or
describeBy(myData,group="mygroups") #for descriptive statistics by groups
headTail(myData) #show the first and last n lines of a file
\end{Sinput}
%\end{Schunk}
\end{scriptsize}
\item Look at the patterns in the data. If you have fewer than about 12 variables, look at the SPLOM (Scatter Plot Matrix) of the data using \pfun{pairs.panels} (section~\ref{sect:pairs}). Then, use the \pfun{outlier} function to detect outliers.
\begin{scriptsize}
%%\begin{Schunk}
\begin{Sinput}
pairs.panels(myData)
outlier(myData)
\end{Sinput}
%\end{Schunk}
\end{scriptsize}
\item Note that you might have some weird subjects, probably due to data entry errors. Either edit the data by hand (use the \fun{edit} command) or just \pfun{scrub} the data (section \ref{sect:scrub}).
\begin{scriptsize}
%%\begin{Schunk}
\begin{Sinput}
cleaned <- scrub(myData, max=9) #e.g., change anything greater than 9 to NA
\end{Sinput}
%\end{Schunk}
\end{scriptsize}
\item Graph the data with error bars for each variable (section \ref{sect:errorbars}).
\begin{scriptsize}
%%\begin{Schunk}
\begin{Sinput}
error.bars(myData)
\end{Sinput}
%\end{Schunk}
\end{scriptsize}
\item Find the correlations of all of your data. \pfun{lowerCor} will by default find the pairwise correlations, round them to 2 decimals, and display the lower off diagonal matrix.
\begin{itemize}
\item Descriptively (just the values) (section \ref{sect:lowerCor})
\begin{scriptsize}
%%\begin{Schunk}
\begin{Sinput}
r <- lowerCor(myData) #The correlation matrix, rounded to 2 decimals
\end{Sinput}
%\end{Schunk}
\end{scriptsize}
\item Graphically (section \ref{sect:corplot}). Another way is to show a heat map of the correlations with the correlation values included.
\begin{scriptsize}
%%\begin{Schunk}
\begin{Sinput}
corPlot(r) #examine the many options for this function.
\end{Sinput}
%\end{Schunk}
\end{scriptsize}
\item Inferentially (the values, the ns, and the p values) (section \ref{sect:corr.test})
\begin{scriptsize}
%%\begin{Schunk}
\begin{Sinput}
corr.test(myData)
\end{Sinput}
%\end{Schunk}
\end{scriptsize}
\end{itemize}
\item Apply various regression models. Several functions are meant to do multiple regressions, either from the raw data or from a variance/covariance matrix or a correlation matrix. This is discussed in more detail in the ``How To use \pfun{mediate} and \pfun{lmCor} to do \href{https://personality-project.org/r/psych/HowTo/mediation.pdf}{mediation, moderation and regression analysis}'' tutorial.
\begin{itemize}
\item \pfun{lmCor} will take raw data or a correlation matrix and find (and graph the path diagram for) regressions of multiple y variables on multiple x variables. If we have the raw data, we can also find the interaction term (x1 * x2). Although we can find the regressions from just a correlation matrix, we cannot find the interaction (moderation) effect unless given raw data.
\begin{scriptsize}
%%\begin{Schunk}
\begin{Sinput}
myData <- sat.act
colnames(myData) <- c("mod1","med1","x1","x2","y1","y2")
lmCor(y1 + y2 ~ x1 + x2 + x1*x2, data = myData)
\end{Sinput}
%\end{Schunk}
\end{scriptsize}
\item \pfun{mediate} will take raw data or a correlation matrix and find (and graph the path diagram for) regressions of multiple y variables on multiple x variables mediated through a mediation variable. It then tests the mediation effect using a bootstrap. We specify the mediation variable by enclosing it in parentheses, and show the moderation by the standard multiplication. For the purpose of this demonstration, we do the bootstrap with just 50 iterations. The default is 5,000. We use the data from \cite{talor:10} which were downloaded from the supplementary material for Hayes (2013) at \href{https://www.afhayes.com/public/hayes2013data.zip}{https://www.afhayes.com/public/hayes2013data.zip}.
\begin{scriptsize}
%%\begin{Schunk}
\begin{Sinput}
mediate(reaction ~ cond + (import) + (pmi), data = Tal_Or, n.iter = 50)
\end{Sinput}
%\end{Schunk}
\end{scriptsize}
We can also find the moderation effect by adding in a product term.
\item \pfun{mediate} will take raw data and find (and graph the path diagram for) a moderated multiple regression model for multiple y variables depending upon multiple x variables mediated through a mediation variable. It then tests the mediation effect using a bootstrap. By default, we find the raw regressions and mean center the data. If we specify zero=FALSE, we do not mean center the data. If we specify std=TRUE, we find the standardized regressions.
\begin{scriptsize}
%%\begin{Schunk}
\begin{Sinput}
mediate(respappr ~ prot * sexism + (sexism), data = Garcia, zero = FALSE,
        n.iter = 50, main = "Moderated mediation (not mean centered)")
\end{Sinput}
%\end{Schunk}
\end{scriptsize}
\end{itemize}
\subsection{Psychometric functions are summarized in the second vignette}
Many additional functions, particularly designed for basic and advanced psychometrics, are discussed more fully in the \emph{Overview Vignette}, which may be downloaded from \url{https://personality-project.org/r/psych/vignettes/overview.pdf}. A brief review of the functions available is included here. In addition, there are helpful tutorials for \emph{Finding omega}, \emph{How to score scales and find reliability}, and for \emph{Using psych for factor analysis} at \url{https://personality-project.org/r}.
\begin{itemize}
\item Test for the number of factors in your data using parallel analysis (\pfun{fa.parallel}) or Very Simple Structure (\pfun{vss}). Perhaps even easier to use is the \pfun{nfactors} function.
\begin{scriptsize}
%%\begin{Schunk}
\begin{Sinput}
fa.parallel(myData)
vss(myData)
nfactors(myData)
\end{Sinput}
%\end{Schunk}
\end{scriptsize}
\item Factor analyze (see section 4.1) the data with a specified number of factors (the default is 1); the default method is minimum residual and the default rotation for more than one factor is oblimin. There are many more possibilities such as minres (section 4.1.1), alpha factoring, and wls. Compare the solution to a hierarchical cluster analysis using the ICLUST algorithm \citep{revelle:iclust} (see section 4.1.6). Also consider a hierarchical factor solution to find coefficient $\omega$.
\begin{scriptsize}
%%\begin{Schunk}
\begin{Sinput}
fa(myData)
iclust(myData)
omega(myData)
\end{Sinput}
%\end{Schunk}
\end{scriptsize}
If you prefer to do a principal components analysis you may use the \pfun{principal} function. The default is one component.
\begin{scriptsize}
%%\begin{Schunk}
\begin{Sinput}
principal(myData)
\end{Sinput}
%\end{Schunk}
\end{scriptsize}
\item Some people like to find coefficient $\alpha$ as an estimate of reliability. This may be done for a single scale using the \pfun{alpha} function. Perhaps more useful is the ability to create several scales as unweighted averages of specified items using the \pfun{scoreItems} function and to find various estimates of internal consistency for these scales, find their intercorrelations, and find scores for all the subjects.
\begin{scriptsize}
%%\begin{Schunk}
\begin{Sinput}
alpha(myData) #score all of the items as part of one scale.
myKeys <- make.keys(nvar=20,list(first = c(1,-3,5,-7,8:10),
                                 second=c(2,4,-6,11:15,-16)))
my.scores <- scoreItems(myKeys,myData) #form several scales
my.scores  #show the highlights of the results
\end{Sinput}
%\end{Schunk}
\end{scriptsize}
\end{itemize}
\end{enumerate}

At this point you have had a chance to see the highlights of the \Rpkg{psych} package and to do some basic (and advanced) data analysis. You might find reading this entire vignette as well as the Overview Vignette to be helpful to get a broader understanding of what can be done in \R{} using the \Rpkg{psych} package. Remember that the help command (?) is available for every function. Try running the examples for each help page.
\newpage
\section{Overview of this and related documents}
The \Rpkg{psych} package \citep{psych} has been developed at Northwestern University since 2005 to include functions most useful for personality, psychometric, and psychological research.
The package is also meant to supplement a text on psychometric theory \citep{revelle:intro}, a draft of which is available at \url{https://personality-project.org/r/book/}.

Some of the functions (e.g., \pfun{read.file}, \pfun{read.clipboard}, \pfun{describe}, \pfun{pairs.panels}, \pfun{scatter.hist}, \pfun{error.bars}, \pfun{multi.hist}, \pfun{bi.bars}) are useful for basic data entry and descriptive analyses.

Psychometric applications emphasize techniques for dimension reduction including factor analysis, cluster analysis, and principal components analysis. The \pfun{fa} function includes six methods of \iemph{factor analysis} (\iemph{minimum residual}, \iemph{principal axis}, \iemph{alpha factoring}, \iemph{weighted least squares}, \iemph{generalized least squares} and \iemph{maximum likelihood} factor analysis). Principal Components Analysis (PCA) is also available through the use of the \pfun{principal} or \pfun{pca} functions. Rotations and transformations of these solutions are done by calling the many rotations available in the \Rpkg{GPArotation} package \citep{gpa.rotate}. Determining the number of factors or components to extract may be done by using the Very Simple Structure \citep{revelle:vss} (\pfun{vss}), Minimum Average Partial correlation \citep{velicer:76} (\pfun{MAP}) or parallel analysis (\pfun{fa.parallel}) criteria. These and several other criteria are included in the \pfun{nfactors} function. Two parameter Item Response Theory (IRT) models for dichotomous or polytomous items may be found by factoring \pfun{tetrachoric} or \pfun{polychoric} correlation matrices and expressing the resulting parameters in terms of location and discrimination using \pfun{irt.fa}.

Bifactor and hierarchical factor structures may be estimated by using Schmid Leiman transformations \citep{schmid:57} (\pfun{schmid}) to transform a hierarchical factor structure into a \iemph{bifactor} solution \citep{holzinger:37}. Higher order models can also be found using \pfun{fa.multi}.

Scale construction can be done using the Item Cluster Analysis \citep{revelle:iclust} (\pfun{iclust}) function to determine the structure and to calculate reliability coefficients $\alpha$ \citep{cronbach:51} (\pfun{alpha}, \pfun{scoreItems}, \pfun{score.multiple.choice}), $\beta$ \citep{revelle:iclust,rz:09} (\pfun{iclust}) and McDonald's $\omega_h$ and $\omega_t$ \citep{mcdonald:tt} (\pfun{omega}). Guttman's six estimates of internal consistency reliability \citep{guttman:45}, as well as additional estimates \citep{rz:09,rc:pa}, are in the \pfun{guttman} function. The six measures of intraclass correlation coefficients (\pfun{ICC}) discussed by \cite{shrout:79} are also available.

For data with a multilevel structure (e.g., items within subjects across time, or items within subjects across groups), the \pfun{describeBy} and \pfun{statsBy} functions will give basic descriptives by group. \pfun{statsBy} will also find within group (or subject) correlations as well as the between group correlation. \pfun{multilevel.reliability} (\pfun{mlr}) will find various generalizability statistics for subjects over time and items. \pfun{mlPlot} will graph items over time for each subject, and \pfun{mlArrange} converts wide data frames to long data frames suitable for multilevel modeling.
Graphical displays include Scatter Plot Matrix (SPLOM) plots using \pfun{pairs.panels}, correlation ``heat maps'' (\pfun{corPlot}), factor, cluster, and structural diagrams using \pfun{fa.diagram}, \pfun{iclust.diagram}, \pfun{structure.diagram} and \pfun{het.diagram}, as well as item response characteristics and item and test information characteristic curves using \pfun{plot.irt} and \pfun{plot.poly}.

This vignette is meant to give an overview of the \Rpkg{psych} package. That is, it is meant to give a summary of the main functions in the \Rpkg{psych} package with examples of how they are used for data description, dimension reduction, and scale construction. The extended user manual at \href{https://personality-project.org/r/psych_manual.pdf}{\url{psych_manual.pdf}} includes examples of graphic output and more extensive demonstrations than are found in the help menus. (Also available at \url{https://personality-project.org/r/psych_manual.pdf}). The vignette, psych for sem, at \url{https://personality-project.org/r/psych_for_sem.pdf}, discusses how to use psych as a front end to the \Rpkg{sem} package of John Fox \citep{sem}. (The vignette is also available at \href{https://personality-project.org/r/psych/vignettes/psych_for_sem.pdf}{\url{https://personality-project.org/r/psych/vignettes/psych_for_sem.pdf}}).

In addition, there are a growing number of ``HowTo''s at the personality project. Currently these include:
\begin{enumerate}
\item An \href{https://personality-project.org/r/psych/intro.pdf}{introduction} (vignette) of the \Rpkg{psych} package
\item An \href{https://personality-project.org/r/psych/overview.pdf}{overview} (vignette) of the \Rpkg{psych} package
\item \href{https://personality-project.org/r/psych/HowTo/getting_started.pdf}{Installing} \R{} and some useful packages
\item Using \R{} and the \Rpkg{psych} package to find \href{https://personality-project.org/r/psych/HowTo/omega.pdf}{$\omega_h$} and $\omega_t$.
\item Using \R{} and the \Rpkg{psych} package for \href{https://personality-project.org/r/psych/HowTo/factor.pdf}{factor analysis} and principal components analysis.
\item Using the \pfun{scoreItems} function to find \href{https://personality-project.org/r/psych/HowTo/scoring.pdf}{scale scores and scale statistics}.
\item Using \pfun{mediate} and \pfun{lmCor} to do \href{https://personality-project.org/r/psych/HowTo/mediation.pdf}{mediation, moderation and regression analysis}.
\end{enumerate}

For a step by step tutorial in the use of the psych package and the base functions in \R{} for basic personality research, see the guide for using \R{} for personality research at \url{https://personalitytheory.org/r/r.short.html}. For an \iemph{introduction to psychometric theory with applications in \R{}}, see the draft chapters at \url{https://personality-project.org/r/book}.

\section{Getting started}
\label{sect:starting}
Some of the functions described in the Overview Vignette require other packages. This is not the case for the functions listed in this Introduction. Particularly useful for rotating the results of factor analyses (from e.g., \pfun{fa}, \pfun{factor.minres}, \pfun{factor.pa}, \pfun{factor.wls}, or \pfun{principal}) or hierarchical factor models using \pfun{omega} or \pfun{schmid}, is the \Rpkg{GPArotation} package. These and other useful packages may be installed by first installing the task views (\Rpkg{ctv}) package and then using it to install the ``Psychometrics'' task view, but doing it this way is not necessary.
The ``Psychometrics'' task view will install a large number of useful packages. To install the bare minimum for the examples in this vignette, it is necessary to install just two packages:
%\begin{Schunk}
\begin{Sinput}
install.packages(c("GPArotation","mnormt"))
\end{Sinput}
%%\end{Schunk}
Alternatively, many packages for psychometrics can be downloaded at once using the ``Psychometrics'' task view:
%\begin{Schunk}
\begin{Sinput}
install.packages("ctv")
library(ctv)
install.views("Psychometrics")
\end{Sinput}
%\end{Schunk}
Because of the difficulty of installing the package \Rpkg{Rgraphviz}, alternative graphics have been developed and are available as \iemph{diagram} functions. If \Rpkg{Rgraphviz} is available, some functions will take advantage of it. An alternative is to use ``dot'' output of commands for any external graphics package that uses the dot language.

\section{Basic data analysis}
A number of \Rpkg{psych} functions facilitate the entry of data and finding basic descriptive statistics.

Remember, to run any of the \Rpkg{psych} functions, it is necessary to make the package active by using the \fun{library} command:
%\begin{Schunk}
\begin{Sinput}
library(psych)
library(psychTools)
\end{Sinput}
%\end{Schunk}
The other packages, once installed, will be called automatically by \Rpkg{psych}.

It is possible to automatically load \Rpkg{psych} and other functions by creating and then saving a ``.First'' function: e.g.,
%\begin{Schunk}
\begin{Sinput}
.First <- function() {library(psych)
                      library(psychTools)}
\end{Sinput}
%\end{Schunk}

\subsection{Getting the data by using read.file}
\label{sect:read}
Although many find it convenient to copy the data to the clipboard and then use the \pfun{read.clipboard} functions (see below), a helpful alternative is to read the data in directly. This can be done using the \pfun{read.file} function which calls \fun{file.choose} to find the file and then, based upon the suffix of the file, chooses the appropriate way to read it. For files with suffixes of .text, .txt, .TXT, .csv, .dat, .data, .sav, .xpt, .XPT, .r, .R, .rds, .Rds, .rda, .Rda, .rdata, .Rdata, or .RData, the file will be read correctly.
%\begin{Schunk}
\begin{Sinput}
my.data <- read.file()
\end{Sinput}
%\end{Schunk}
If the file contains Fixed Width Format (fwf) data, the column information can be specified with the widths argument.
%\begin{Schunk}
\begin{Sinput}
my.data <- read.file(widths = c(4,rep(1,35))) #will read in a file without a header row
# and 36 fields, the first of which is 4 columns, the rest of which are 1 column each.
\end{Sinput}
%\end{Schunk}
If the file is a .RData file (with suffix of .RData, .Rda, .rda, .Rdata, or .rdata) the object will be loaded. Depending upon what was stored, this might be several objects. If the file is a .sav file from SPSS, it will be read with the most useful default options (converting the file to a data.frame and converting character fields to numeric). Alternative options may be specified. If it is an export file from SAS (.xpt or .XPT) it will be read. .csv files (comma separated files), normal .txt or .text files, .data, or .dat files will be read as well. These are assumed to have a header row of variable labels (header=TRUE). If the data do not have a header row, you must specify read.file(header=FALSE).

To read SPSS files and to keep the value labels, specify use.value.labels=TRUE.
%\begin{Schunk}
\begin{Sinput}
#this will keep the value labels for .sav files
my.spss <- read.file(use.value.labels=TRUE)
\end{Sinput}
%\end{Schunk}

\subsection{Data input from the clipboard}
There are of course many ways to enter data into \R. Reading from a local file using \fun{read.table} is perhaps the most preferred. However, many users will enter their data in a text editor or spreadsheet program and then want to copy and paste into \R{}. This may be done by using \fun{read.table} and specifying the input file as ``clipboard'' (PCs) or ``pipe(pbpaste)'' (Macs). Alternatively, the \pfun{read.clipboard} set of functions are perhaps more user friendly:
\begin{description}
\item [\pfun{read.clipboard}] is the base function for reading data from the clipboard.
\item [\pfun{read.clipboard.csv}] for reading text that is comma delimited.
\item [\pfun{read.clipboard.tab}] for reading text that is tab delimited (e.g., copied directly from an Excel file).
\item [\pfun{read.clipboard.lower}] for reading input of a lower triangular matrix with or without a diagonal. The resulting object is a square matrix.
\item [\pfun{read.clipboard.upper}] for reading input of an upper triangular matrix.
\item [\pfun{read.clipboard.fwf}] for reading in fixed width fields (some very old data sets)
\end{description}
For example, given a data set copied to the clipboard from a spreadsheet, just enter the command
%\begin{Schunk}
\begin{Sinput}
my.data <- read.clipboard()
\end{Sinput}
%\end{Schunk}
This will work if every data field has a value, that is, if even missing data are given some value (e.g., NA or -999). If the data were entered in a spreadsheet and the missing values were just empty cells, then the data should be read in as tab delimited or by using the \pfun{read.clipboard.tab} function.
%\begin{Schunk}
\begin{Sinput}
my.data <- read.clipboard(sep="\t")  #define the tab option, or
my.tab.data <- read.clipboard.tab()  #just use the alternative function
\end{Sinput}
%\end{Schunk}
For the case of data in fixed width fields (some old data sets tend to have this format), copy to the clipboard and then specify the width of each field (in the example below, the first variable is 5 columns, the second is 2 columns, the next 5 are 1 column each, and the last 4 are 3 columns each).
%\begin{Schunk}
\begin{Sinput}
my.data <- read.clipboard.fwf(widths=c(5,2,rep(1,5),rep(3,4)))
\end{Sinput}
%\end{Schunk}

\subsection{Basic descriptive statistics}
\label{sect:describe}
Once the data are read in, then \pfun{describe} or \pfun{describeBy} will provide basic descriptive statistics arranged in a data frame format. Consider the data set \pfun{sat.act} which includes data from 700 web based participants on 3 demographic variables and 3 ability measures.
\begin{description}
\item[\pfun{describe}] reports means, standard deviations, medians, min, max, range, skew, kurtosis and standard errors for integer or real data. Non-numeric data will be treated as if numeric (based upon the categorical coding of the data) and will be flagged with an *, although the resulting statistics are meaningless.
\item[\pfun{describeBy}] reports descriptive statistics broken down by some categorizing variable (e.g., gender, age, etc.)
\end{description}
<<>>=
options(width=100)
@
\begin{scriptsize}
<<>>=
library(psych) #need to make psych active the first time you call it
library(psychTools) #additional tools and data are here
data(sat.act)
describe(sat.act) #basic descriptive statistics
@
\end{scriptsize}

These data may then be analyzed by groups defined in a logical statement or by some other variable. E.g., break down the descriptive data for males or females. These descriptive data can also be seen graphically using the \pfun{error.bars.by} function (Figure~\ref{fig:error.bars}). By setting skew=FALSE and ranges=FALSE, the output is limited to the most basic statistics. Here we use formula mode.
\begin{scriptsize}
<<>>=
#basic descriptive statistics by a grouping variable.
describeBy(sat.act ~ gender,skew=FALSE,ranges=FALSE)
@
\end{scriptsize}
The output from the \pfun{describeBy} function can be forced into a matrix form for easy analysis by other programs. In addition, \pfun{describeBy} can group by several grouping variables at the same time.
\begin{scriptsize}
<<>>=
sa.mat <- describeBy(sat.act ~ gender + education,
            skew=FALSE,ranges=FALSE,mat=TRUE)
headTail(sa.mat)
@
\end{scriptsize}
If some of the data are in character mode, \pfun{describe} and \pfun{describeBy} will automatically call \pfun{char2numeric} which will convert all fields to numeric. Note that this will cause problems if the character order is not meaningful. In that case, the \pfun{recode} function should be used to make the \pfun{char2numeric} coding make sense.

\subsubsection{Outlier detection using \pfun{outlier}}
One way to detect unusual data is to consider how far each data point is from the multivariate centroid of the data. That is, find the squared Mahalanobis distance for each data point and then compare these to the expected values of $\chi^{2}$. This produces a Q-Q (quantile-quantile) plot with the n most extreme data points labeled (Figure~\ref{fig:outlier}). The outlier values are in the vector d2.
\begin{figure}[htbp]
\begin{center}
\begin{scriptsize}
<<>>=
png( 'outlier.png' )
d2 <- outlier(sat.act,cex=.8)
dev.off()
@
\end{scriptsize}
\includegraphics{outlier}
\caption{Using the \pfun{outlier} function to graphically show outliers. The y axis is the Mahalanobis $D^{2}$, the X axis is the distribution of $\chi^{2}$ for the same number of degrees of freedom. The outliers detected here may be shown graphically using \pfun{pairs.panels} (see Figure~\ref{fig:pairs.panels}) and may be found by sorting d2. }
\label{fig:outlier}
\end{center}
\end{figure}

\subsubsection{Basic data cleaning using \pfun{scrub}}
\label{sect:scrub}
If, after describing the data, it is apparent that there were data entry errors that need to be globally replaced with NA, or only certain ranges of data will be analyzed, the data can be ``cleaned'' using the \pfun{scrub} function.

Consider a data set of 12 rows of 10 columns with values from 1 to 120. All values of columns 3 - 5 that are less than 30, 40, or 50 respectively, or greater than 70 in any of the three columns, will be replaced with NA. In addition, any value exactly equal to 45 will be set to NA. (max and isvalue are set to one value here, but they could be a different value for every column).
\begin{scriptsize}
<<>>=
x <- matrix(1:120,ncol=10,byrow=TRUE)
colnames(x) <- paste('V',1:10,sep='')
new.x <- scrub(x,3:5,min=c(30,40,50),max=70,isvalue=45,newvalue=NA)
new.x
@
\end{scriptsize}
Note that the number of subjects for those columns has decreased, and the minimums have gone up while the maximums have gone down.
Data cleaning and examination for outliers should be a routine part of any data analysis.

\subsubsection{Recoding categorical variables into dummy coded variables}
Sometimes categorical variables (e.g., college major, occupation, ethnicity) are to be analyzed using correlation or regression. To do this, one can form ``dummy codes'' which are merely binary variables for each category. This may be done using \pfun{dummy.code}. Subsequent analyses using these dummy coded variables may use \pfun{biserial} or point biserial (regular Pearson r) correlations to show effect sizes and may be plotted in e.g., \pfun{spider} plots.

Alternatively, sometimes data were coded originally as categorical (Male/Female, High School, some College, in college, etc.) and you want to convert these columns of data to numeric. This is done by \pfun{char2numeric}. Values can be recoded into a different order, or converted to character form, by using the \pfun{recode} function.

\subsubsection{Joining data sets using \pfun{vJoin}}
The \pfun{vJoin} function can be used to combine data sets that might or might not have overlapping subjects or overlapping items. Unlike \fun{cbind} or \fun{rbind}, which require the same number of rows (\fun{cbind}) or columns (\fun{rbind}), \pfun{vJoin} will work with any two matrices or data frames of arbitrary dimensions. Non-matching cases are assigned values of NA.
\begin{scriptsize}
<<>>=
x <- matrix(1:40,ncol=10,byrow=TRUE)
y <- matrix(1:20,ncol=4)
xy <- vJoin(x,y)
xy
XY <- vJoin(x,y,cnames=FALSE)
XY
#match on ids and columns
x <- bfi[1:5,1:10]
y <- bfi[3:8,2:6]
xy <- vJoin(x,y)
xy #the merged data
@
\end{scriptsize}

\subsection{Simple descriptive graphics}
Graphic descriptions of data are very helpful both for understanding the data as well as communicating important results. Scatter Plot Matrices (SPLOMS) using the \pfun{pairs.panels} function are useful ways to look for strange effects involving outliers and non-linearities. \pfun{error.bars.by} will show group means with 95\% confidence boundaries. By default, \pfun{error.bars.by} and \pfun{error.bars} will show ``cats eyes'' to graphically show the confidence limits (Figure~\ref{fig:error.bars}). This may be turned off by specifying eyes=FALSE. \pfun{densityBy} or \pfun{violinBy} may be used to show the distribution of the data in ``violin'' plots (Figure~\ref{fig:violin}). (These are sometimes called ``lava-lamp'' plots.)

\subsubsection{Scatter Plot Matrices}
Scatter Plot Matrices (SPLOMS) are very useful for describing the data. The \pfun{pairs.panels} function, adapted from the help menu for the \fun{pairs} function, produces xy scatter plots of each pair of variables below the diagonal, shows the histogram of each variable on the diagonal, and shows the \iemph{lowess} locally fit regression line as well. An ellipse around the mean with the axis length reflecting one standard deviation of the x and y variables is also drawn. The x axis in each scatter plot represents the column variable, the y axis the row variable (Figure~\ref{fig:pairs.panels}). When plotting many subjects, it is both faster and cleaner to set the plot character (pch) to be '.'. (See Figure~\ref{fig:pairs.panels} for an example.)
\begin{description}
\label{sect:pairs}
\item[\pfun{pairs.panels}] will show the pairwise scatter plots of all the variables as well as histograms, locally smoothed regressions, and the Pearson correlation.
When plotting many data points (as in the case of the sat.act data), it is possible to specify that the plot character is a period to get a somewhat cleaner graphic. However, in this figure, to show the outliers, we use colors and a larger plot character. If we want to indicate `significance' of the correlations by the conventional use of `magic asterisks' we can set the \texttt{stars=TRUE} option.
\end{description}
\begin{figure}[htbp]
\begin{center}
\begin{scriptsize}
<<>>=
png( 'pairspanels.png' )
sat.d2 <- data.frame(sat.act,d2) #combine the d2 statistics from before with the sat.act data.frame
pairs.panels(sat.d2,bg=c("yellow","blue")[(d2 > 25)+1],pch=21,stars=TRUE)
dev.off()
@
\end{scriptsize}
\includegraphics{pairspanels}
\caption{Using the \pfun{pairs.panels} function to graphically show relationships. The x axis in each scatter plot represents the column variable, the y axis the row variable. Note the extreme outlier for the ACT. If the plot character were set to a period (pch='.') it would make a cleaner graphic, but in order to show the outliers in color we use plot character 21. }
\label{fig:pairs.panels}
\end{center}
\end{figure}

Another example of \pfun{pairs.panels} is to show differences between experimental groups. Consider the data in the \pfun{affect} data set. The scores reflect post test scores on positive and negative affect and energetic and tense arousal. The colors show the results for four movie conditions: depressing, frightening, neutral, and comedy.
\begin{figure}[htbp]
\begin{center}
\begin{scriptsize}
<<>>=
png('affect.png')
pairs.panels(affect[14:17],bg=c("red","black","white","blue")[affect$Film],pch=21,
    main="Affect varies by movies ")
dev.off()
@
\end{scriptsize}
\includegraphics{affect}
\caption{Using the \pfun{pairs.panels} function to graphically show relationships. The x axis in each scatter plot represents the column variable, the y axis the row variable. The coloring represents four different movie conditions. }
\label{fig:pairs.panels2}
\end{center}
\end{figure}

Yet another demonstration of \pfun{pairs.panels} is useful when you have many subjects and want to show the density of the distributions. To do this we will use the \pfun{make.keys} and \pfun{scoreItems} functions (discussed in the second vignette) to create scales measuring Energetic Arousal, Tense Arousal, Positive Affect, and Negative Affect (see the \pfun{msq} help file). We then show a \pfun{pairs.panels} scatter plot matrix where we smooth the data points and show the density of the distribution by color.
%\begin{figure}[htbp]
%\begin{center}
\begin{scriptsize}
<<>>=
keys <- list(
EA = c("active", "energetic", "vigorous", "wakeful", "wide.awake", "full.of.pep",
       "lively", "-sleepy", "-tired", "-drowsy"),
TA = c("intense", "jittery", "fearful", "tense", "clutched.up", "-quiet", "-still",
       "-placid", "-calm", "-at.rest"),
PA = c("active", "excited", "strong", "inspired", "determined", "attentive",
       "interested", "enthusiastic", "proud", "alert"),
NAf = c("jittery", "nervous", "scared", "afraid", "guilty", "ashamed", "distressed",
       "upset", "hostile", "irritable"))
scores <- scoreItems(keys,psychTools::msq[,1:75])
#png('msq.png')
# pairs.panels(scores$scores,smoother=TRUE,
#    main ="Density distributions of four measures of affect" )
#dev.off()
@
\end{scriptsize}
%\includegraphics{msq}
Using the \pfun{pairs.panels} function to graphically show relationships. (Not shown in the interests of space.) The x axis in each scatter plot represents the column variable, the y axis the row variable.
The variables are four measures of motivational state for 3896 participants. Each scale is the average score of 10 items measuring motivational state. Compare this to a plot with smoother set to FALSE.
%\label{fig:pairs.panels3}
%\end{center}
%\end{figure}

\subsubsection{Density or violin plots}
Graphical presentation of data may be shown using box plots to show the median and 25th and 75th percentiles. A powerful alternative is to show the density distribution using the \pfun{violinBy} function (Figure~\ref{fig:violin}) or the more conventional density plot for multiple groups (Figure~\ref{fig:histo}).
\begin{figure}[htbp]
\begin{center}
\begin{scriptsize}
<<>>=
png('violin.png')
data(sat.act)
violinBy(SATV+SATQ ~ gender, data=sat.act,grp.name=cs(Verbal.M,Verbal.F,Quant.M,Quant.F),
   main="Density Plot by gender for SAT V and Q")
dev.off()
@
\end{scriptsize}
\includegraphics{violin}
\caption{Using the \pfun{violinBy} function to show the distribution of SAT V and Q for males and females. The plot shows the medians, and 25th and 75th percentiles, as well as the entire range and the density distribution. }
\label{fig:violin}
\end{center}
\end{figure}
\clearpage

\subsubsection{Means and error bars}
\label{sect:errorbars}
Additional descriptive graphics include the ability to draw \iemph{error bars} on sets of data, as well as to draw error bars in both the x and y directions for paired data. These are the functions \pfun{error.bars}, \pfun{error.bars.by}, \pfun{error.bars.tab}, and \pfun{error.crosses}.
\begin{description}
\item [\pfun{error.bars}] shows the 95\% confidence intervals for each variable in a data frame or matrix. These errors are based upon normal theory and the standard errors of the mean. Alternative options include +/- one standard deviation or one standard error. If the data are repeated measures, the error bars will reflect the between variable correlations. By default, the confidence intervals are displayed using a ``cats eyes'' plot which emphasizes the distribution of confidence within the confidence interval.
\item [\pfun{error.bars.by}] does the same, but grouping the data by some condition.
\item [\pfun{error.bars.tab}] draws bar graphs from tabular data with error bars based upon the standard error of proportion ($\sigma_{p} = \sqrt{pq/N}$)
\item [\pfun{error.crosses}] draws the confidence intervals for an x set and a y set of the same size.
\end{description}

The use of the \pfun{error.bars.by} function allows for graphic comparisons of different groups (see Figure~\ref{fig:error.bars}). Five personality measures are shown as a function of high versus low scores on a ``lie'' scale. People with higher lie scores tend to report being more agreeable and conscientious and less neurotic than people with lower lie scores. The error bars are based upon normal theory and thus are symmetric rather than reflecting any skewing in the data.
\begin{figure}[htbp]
\begin{center}
\begin{scriptsize}
<<>>=
data(epi.bfi)
error.bars.by(epi.bfi[,6:10],epi.bfi$epilie<4)
@
\end{scriptsize}
\caption{Using the \pfun{error.bars.by} function shows that self reported personality scales on the Big Five Inventory vary as a function of the Lie scale on the EPI. The ``cats eyes'' show the distribution of the confidence interval. }
\label{fig:error.bars}
\end{center}
\end{figure}

Although not recommended, it is possible to use the \pfun{error.bars} function to draw bar graphs with associated error bars. This kind of \iemph{dynamite plot} (Figure~\ref{fig:dynamite}) can be very misleading in that the scale is arbitrary.
Go to a discussion of the problems in presenting data this way at \url{https://emdbolker.wikidot.com/blog:dynamite}. In the example shown, note that the graph starts at 0, even though 0 is far outside the range of the data. This is a function of using bars, which are always assumed to start at zero. Consider other ways of showing your data.
\begin{figure}[htbp]
\begin{center}
\begin{scriptsize}
<<>>=
error.bars.by(sat.act[5:6],sat.act$gender,bars=TRUE,
     labels=c("Male","Female"),ylab="SAT score",xlab="")
@
\end{scriptsize}
\caption{A ``Dynamite plot'' of SAT scores as a function of gender is one way of misleading the reader. By using a bar graph, the range of scores is ignored. Bar graphs start from 0. }
\label{fig:dynamite}
\end{center}
\end{figure}

\subsubsection{Error bars for tabular data}
However, it is sometimes useful to show error bars for tabular data, either found by the \fun{table} function or just directly input. These may be found using the \pfun{error.bars.tab} function.
\begin{figure}[htbp]
\begin{center}
\begin{scriptsize}
<<>>=
T <- with(sat.act,table(gender,education))
rownames(T) <- c("M","F")
error.bars.tab(T,way="both",ylab="Proportion of Education Level",xlab="Level of Education",
   main="Proportion of sample by education level")
@
\end{scriptsize}
\caption{The proportion of each education level that is Male or Female. By using the way="both" option, the percentages and errors are based upon the grand total. Alternatively, way="columns" finds column wise percentages and way="rows" finds row wise percentages. The data can be shown as percentages (as here) or as raw counts (raw=TRUE). The function invisibly returns the probabilities and standard errors. See the help menu for an example of entering the data as a data.frame. }
\label{fig:error.bars.tab}
\end{center}
\end{figure}
\clearpage

\subsubsection{Two dimensional displays of means and errors}
Yet another way to display data for different conditions is to use the \pfun{errorCircles} function. For instance, the effect of various movies on both ``Energetic Arousal'' and ``Tense Arousal'' can be seen in one graph and compared to the same movie manipulations on ``Positive Affect'' and ``Negative Affect''. Note how Energetic Arousal is increased by three of the movie manipulations, but that Positive Affect increases following the Happy movie only.
\begin{figure}[htbp]
\begin{center}
\begin{scriptsize}
<<>>=
op <- par(mfrow=c(1,2))
data(affect)
colors <- c("black","red","white","blue")
films <- c("Sad","Horror","Neutral","Happy")
affect.stats <- errorCircles("EA2","TA2",data=affect[-c(1,20)],group="Film",labels=films,
  xlab="Energetic Arousal", ylab="Tense Arousal",ylim=c(10,22),xlim=c(8,20),pch=16,
  cex=2,colors=colors, main ='Movies effect on arousal')
errorCircles("PA2","NA2",data=affect.stats,labels=films,xlab="Positive Affect",
  ylab="Negative Affect", pch=16,cex=2,colors=colors, main ="Movies effect on affect")
op <- par(mfrow=c(1,1))
@
\end{scriptsize}
\caption{The use of the \pfun{errorCircles} function allows for two dimensional displays of means and error bars. The first call to \pfun{errorCircles} finds descriptive statistics for the \iemph{affect} data.frame based upon the grouping variable of Film. These data are returned and then used by the second call which examines the effect of the same grouping variable upon different measures. The size of the circles represents the relative sample sizes for each group.
The data are from the PMC lab and reported in \cite{smillie:jpsp}.}
\label{fig:errorCircles}
\end{center}
\end{figure}
\clearpage

\subsubsection{Back to back histograms}
The \pfun{bi.bars} function summarizes the characteristics of two groups (e.g., males and females) on a second variable (e.g., age) by drawing back to back histograms (see Figure~\ref{fig:bibars}).
\begin{figure}[!ht]
\begin{center}
\begin{scriptsize}
% <<>>=
<<>>=
data(bfi)
png( 'bibars.png' )
bi.bars(bfi,"age","gender",ylab="Age",main="Age by males and females")
dev.off()
@
\end{scriptsize}
\includegraphics{bibars.png}
\caption{A bar plot of the age distribution for males and females shows the use of \pfun{bi.bars}. The data are males and females from 2800 cases collected using the \iemph{SAPA} procedure and are available as part of the \pfun{bfi} data set. An alternative way of displaying these data is the \pfun{densityBy} plot in the next figure.}
\label{fig:bibars}
\end{center}
\end{figure}
\begin{figure}[!ht]
\begin{center}
\begin{scriptsize}
<<>>=
png('histo.png')
densityBy(bfi,"age",grp="gender")
dev.off()
@
\end{scriptsize}
\includegraphics{histo}
\caption{Using the \pfun{densityBy} function to show the age distribution for males and females. The plot is a conventional density diagram for the two groups. Compare this to the \pfun{bi.bars} plot in the previous figure. By plotting densities, we can see that the males are slightly over represented in the younger ranges.}
\label{fig:histo}
\end{center}
\end{figure}
\clearpage

\subsubsection{Scatter Plot Histograms}
The \pfun{scatterHist} function shows scatter plots for two variables and includes histograms by a grouping variable (see Figure~\ref{fig:scatterHist}). The data shown in the figure are from \cite{gruber:20} and the results are discussed by \cite{eagly:revelle}. The data are in the GERAS data set in the \Rpkg{psychTools} package.
\begin{figure}[!ht]
\begin{center}
\begin{scriptsize}
% <<>>=
<<>>=
data(GERAS)
png( 'scatterHist.png' )
psych::scatterHist(F ~ M + gender, data=GERAS.scales, cex.point=.3,smooth=FALSE,
   xlab="Masculine Scale",ylab="Feminine Scale",correl=FALSE,
   d.arrow=TRUE,col=c("red","blue"), bg=c("red","blue"), lwd=4,
   title="Combined M and F scales",cex.cor=2,cex.arrow=1.25)
dev.off()
@
\end{scriptsize}
\includegraphics{scatterHist}
\caption{A scatter plot with histograms for males and females on a scale developed by \cite{gruber:20} and used in an article by \cite{eagly:revelle}. Two scales (Masculine and Feminine) show univariate sex differences as well as Mahalanobis distances. }
\label{fig:scatterHist}
\end{center}
\end{figure}
\clearpage

\subsubsection{Correlational structure}
\label{sect:lowerCor}
There are many ways to display correlations. Tabular displays are probably the most common. The output from the \fun{cor} function in core \R{} is a rectangular matrix. \pfun{lowerMat} will round this to 2 digits and then display it as a lower off diagonal matrix. \pfun{lowerCor} calls \fun{cor} with \emph{use=`pairwise', method=`pearson'} as default values and returns (invisibly) the full correlation matrix and displays the lower off diagonal matrix.
\begin{scriptsize}
<<>>=
lowerCor(sat.act)
@
\end{scriptsize}
When comparing results from two different groups, it is convenient to display them as one matrix, with the results from one group below the diagonal, and the other group above the diagonal.
Use \pfun{lowerUpper} to do this:
\begin{scriptsize}
<<>>=
female <- subset(sat.act,sat.act$gender==2)
male <- subset(sat.act,sat.act$gender==1)
lower <- lowerCor(male[-1])
upper <- lowerCor(female[-1])
both <- lowerUpper(lower,upper)
round(both,2)
@
\end{scriptsize}
It is also possible to compare two matrices by taking their differences, displaying one matrix below the diagonal and the differences of the second from the first above the diagonal:
\begin{scriptsize}
<<>>=
diffs <- lowerUpper(lower,upper,diff=TRUE)
round(diffs,2)
@
\end{scriptsize}

\subsubsection{Heatmap displays of correlational structure}
\label{sect:corplot}
Perhaps a better way to see the structure in a correlation matrix is to display a \emph{heat map} of the correlations. This is just a matrix color coded to represent the magnitude of the correlation. This is useful when considering the number of factors in a data set. Consider the \pfun{Thurstone} data set which has a clear 3 factor solution (Figure~\ref{fig:cor.plot}) or a simulated data set of 24 variables with a circumplex structure (Figure~\ref{fig:cor.plot.circ}). The color coding represents a ``heat map'' of the correlations, with darker shades of red representing stronger negative and darker shades of blue stronger positive correlations. As an option, the value of the correlation can be shown.
\begin{figure}[htbp]
\begin{center}
\begin{scriptsize}
<<>>=
png('corplot.png')
corPlot(Thurstone,numbers=TRUE,upper=FALSE,diag=FALSE,cex=.7,
   main="9 cognitive variables from Thurstone")
dev.off()
@
\end{scriptsize}
\includegraphics{corplot.png}
\caption{The structure of a correlation matrix can be seen more clearly if the variables are grouped by factor and then the correlations are shown by color. By using the 'numbers' option, the values are displayed as well. By default, the complete matrix is shown. Setting upper=FALSE and diag=FALSE shows a cleaner figure. The cex parameter specifies the character size. }
\label{fig:cor.plot}
\end{center}
\end{figure}
\begin{figure}[htbp]
\begin{center}
\begin{scriptsize}
<<>>=
png('circplot.png')
circ <- sim.circ(24)
r.circ <- cor(circ)
corPlot(r.circ,main='24 variables in a circumplex')
dev.off()
@
\end{scriptsize}
\includegraphics{circplot.png}
\caption{Using the corPlot function to show the correlations in a circumplex. Correlations are highest near the diagonal, diminish to zero further from the diagonal, and then increase again towards the corners of the matrix. Circumplex structures are common in the study of affect. For circumplex structures, it is perhaps useful to show the complete matrix.}
\label{fig:cor.plot.circ}
\end{center}
\end{figure}

Yet another way to show structure is to use ``spider'' plots. Particularly if variables are ordered in some meaningful way (e.g., in a circumplex), a spider plot will show this structure easily. This is just a plot of the magnitude of the correlation as a radial line, with length ranging from 0 (for a correlation of -1) to 1 (for a correlation of 1). (See Figure~\ref{fig:cor.plot.spider}).
\begin{figure}[htbp]
\begin{center}
\begin{scriptsize}
<<>>=
png('spider.png')
op <- par(mfrow=c(2,2))
spider(y=c(1,6,12,18),x=1:24,data=r.circ,fill=TRUE,main="Spider plot of 24 circumplex variables")
op <- par(mfrow=c(1,1))
dev.off()
@
\end{scriptsize}
\includegraphics{spider.png}
\caption{A spider plot can show circumplex structure very clearly.
Circumplex structures are common in the study of affect.}
\label{fig:cor.plot.spider}
\end{center}
\end{figure}

\subsection{Testing correlations}
\label{sect:corr.test}
Correlations are wonderful descriptive statistics of the data but some people like to test whether these correlations differ from zero, or differ from each other. The \fun{cor.test} function (in the \Rpkg{stats} package) will test the significance of a single correlation, and the \fun{rcorr} function in the \Rpkg{Hmisc} package will do this for many correlations. In the \Rpkg{psych} package, the \pfun{corr.test} function reports the correlation (Pearson, Spearman, or Kendall) between all variables in either one or two data frames or matrices, as well as the number of observations for each case, and the (two-tailed) probability for each correlation. Unfortunately, these probability values have not been corrected for multiple comparisons and so should be taken with a great deal of salt. Thus, in \pfun{corr.test} and \pfun{corr.p} the raw probabilities are reported below the diagonal and the probabilities adjusted for multiple comparisons using (by default) the Holm correction are reported above the diagonal (Table~\ref{tab:corr.test}). (See the \fun{p.adjust} function for a discussion of \cite{holm:79} and other corrections.)
\begin{table}[htpb]
\caption{The \pfun{corr.test} function reports correlations, cell sizes, and raw and adjusted probability values. \pfun{corr.p} reports the probability values for a correlation matrix. By default, the adjustment used is that of \cite{holm:79}.}
\begin{scriptsize}
<<>>=
corr.test(sat.act)
@
\end{scriptsize}
\label{tab:corr.test}
\end{table}%

Testing the difference between any two correlations can be done using the \pfun{r.test} function. The function actually does four different tests (based upon an article by \cite{steiger:80b}), depending upon the input:

1) For a sample size n, find the t and p value for a single correlation as well as the confidence interval.
\begin{scriptsize}
<<>>=
r.test(50,.3)
@
\end{scriptsize}
2) For sample sizes of n and n2 (n2 = n if not specified) find the z of the difference between the z transformed correlations divided by the standard error of the difference of two z scores.
\begin{scriptsize}
<<>>=
r.test(30,.4,.6)
@
\end{scriptsize}
3) For sample size n, and correlations ra = r12, rb = r23 and r13 specified, test for the difference of two dependent correlations (Steiger case A).
\begin{scriptsize}
<<>>=
r.test(103,.4,.5,.1)
@
\end{scriptsize}
4) For sample size n, test for the difference between two dependent correlations involving different variables (Steiger case B).
\begin{scriptsize}
<<>>=
r.test(103,.5,.6,.7,.5,.5,.8) #Steiger Case B
@
\end{scriptsize}

To test whether a matrix of correlations differs from what would be expected if the population correlations were all zero, the function \pfun{cortest} follows \cite{steiger:80b} who pointed out that the sum of the squared elements of a correlation matrix, or the Fisher z score equivalents, is distributed as chi square under the null hypothesis that the values are zero (i.e., elements of the identity matrix). This is particularly useful for examining whether correlations in a single matrix differ from zero or for comparing two matrices. Although obvious, \pfun{cortest} can be used to test whether the \pfun{sat.act} data matrix produces non-zero correlations (it does). This is a much more appropriate test when testing whether a residual matrix differs from zero.
\begin{scriptsize}
<<>>=
cortest(sat.act)
@
\end{scriptsize}

\subsection{Polychoric, tetrachoric, polyserial, and biserial correlations}
The Pearson correlation of dichotomous data is also known as the $\phi$ coefficient. If the data, e.g., ability items, are thought to represent an underlying continuous although latent variable, the $\phi$ will underestimate the value of the Pearson applied to these latent variables. One solution to this problem is to use the \pfun{tetrachoric} correlation which is based upon the assumption of a bivariate normal distribution that has been cut at certain points. The \pfun{draw.tetra} function demonstrates the process (Figure~\ref{fig:tetra}). This is also shown in terms of dichotomizing the bivariate normal density function using the \pfun{draw.cor} function. A simple generalization of this to the case of multiple cuts is the \pfun{polychoric} correlation.
\begin{figure}[htbp]
\begin{center}
\begin{scriptsize}
<<>>=
png('tetrar.png')
draw.tetra()
dev.off()
@
\end{scriptsize}
\includegraphics{tetrar.png}
\caption{The tetrachoric correlation estimates what a Pearson correlation would be given a two by two table of observed values assumed to be sampled from a bivariate normal distribution. The $\phi$ correlation is just a Pearson r performed on the observed values.}
\label{fig:tetra}
\end{center}
\end{figure}
The tetrachoric correlation is found (laboriously) by optimizing the fit of the bivariate normal for various values of the correlation to the observed cell frequencies. In the interests of space, we do not show the next figure but it can be created by
\texttt{draw.cor(expand=20,cuts=c(0,0))}

Other estimated correlations based upon the assumption of bivariate normality with cut points include the \pfun{biserial} and \pfun{polyserial} correlations. If the data are a mix of continuous, polytomous and dichotomous variables, the \pfun{mixed.cor} function will calculate the appropriate mixture of Pearson, polychoric, tetrachoric, biserial, and polyserial correlations.

A correlation matrix formed from a number of tetrachoric or polychoric correlations sometimes will not be positive semi-definite. This will sometimes happen if the correlation matrix is formed by using pair-wise deletion of cases. The \pfun{cor.smooth} function will adjust the smallest eigenvalues of the correlation matrix to make them positive, rescale all of them to sum to the number of variables, and produce a ``smoothed'' correlation matrix. An example of this problem is the \pfun{burt} data set which probably had a typo in the original correlation matrix. Smoothing the matrix corrects this problem.

\section{Multilevel modeling}
Correlations between individuals who belong to different natural groups (based upon e.g., ethnicity, age, gender, college major, or country) reflect an unknown mixture of the pooled correlation within each group as well as the correlation of the means of these groups. These two correlations are independent and do not allow inferences from one level (the group) to the other level (the individual).
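The \pfun{statsBy} function, discussed below, separates these two levels. As a minimal sketch of this decomposition (not run here; we assume the \pfun{sat.act} data set grouped by education, and the rwg and rbg elements of the \pfun{statsBy} output, which hold the pooled within group and the between group correlations):
\begin{scriptsize}
%\begin{Schunk}
\begin{Sinput}
sb <- statsBy(sat.act, group="education", cors=TRUE)
sb$rwg  #the pooled within group correlations
sb$rbg  #the between group correlations (of the group means)
\end{Sinput}
%\end{Schunk}
\end{scriptsize}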
When examining data at two levels (e.g., the individual and some grouping variable), it is useful to find basic descriptive statistics (means, sds, ns per group, within group correlations) as well as between group statistics (overall descriptive statistics and overall between group correlations). Of particular use is the ability to decompose a matrix of correlations at the individual level into correlations within group and correlations between groups.

\subsection{Decomposing data into within and between level correlations using \pfun{statsBy}}

There are at least two very powerful packages (\Rpkg{nlme} and \Rpkg{multilevel}) which allow for complex analysis of hierarchical (multilevel) data structures. \pfun{statsBy} is a much simpler function to give some of the basic descriptive statistics for two level models. (\Rpkg{nlme} and \Rpkg{multilevel} allow for statistical inference, but the descriptives of \pfun{statsBy} are useful.)

This follows the decomposition of an observed correlation into the pooled correlation within groups (rwg) and the weighted correlation of the means between groups, which is discussed by \cite{pedhazur:97} and by \cite{bliese:09} in the multilevel package.

\begin{equation}
r_{xy} = \eta_{x_{wg}} * \eta_{y_{wg}} * r_{xy_{wg}} + \eta_{x_{bg}} * \eta_{y_{bg}} * r_{xy_{bg}}
\end{equation}

where $r_{xy}$ is the normal correlation which may be decomposed into the within group and between group correlations $r_{xy_{wg}}$ and $r_{xy_{bg}}$, and $\eta$ (eta) is the correlation of the data with the within group values, or the group means.

\subsection{Generating and displaying multilevel data}

\pfun{withinBetween} is an example data set of the mixture of within and between group correlations. The within group correlations between 9 variables are set to be 1, 0, and -1 while those between groups are also set to be 1, 0, -1. These two sets of correlations are crossed such that V1, V4, and V7 have within group correlations of 1, as do V2, V5 and V8, and V3, V6 and V9. V1 has a within group correlation of 0 with V2, V5, and V8, and a -1 within group correlation with V3, V6 and V9. V1, V2, and V3 share a between group correlation of 1, as do V4, V5 and V6, and V7, V8 and V9. The first set has a 0 between group correlation with the second set and a -1 between group correlation with the third set. See the help file for \pfun{withinBetween} to display these data.

\pfun{sim.multilevel} will generate simulated data with a multilevel structure.

The \pfun{statsBy.boot} function will randomize the grouping variable ntrials times and find the statsBy output. This can take a long time and will produce a great deal of output. This output can then be summarized for relevant variables using the \pfun{statsBy.boot.summary} function specifying the variable of interest.

Consider the case of the relationship between various tests of ability when the data are grouped by level of education (statsBy(sat.act, "education")) or when affect data are analyzed within and between an affect manipulation (statsBy(affect, "Film")).

\subsection{Factor analysis by groups}

Confirmatory factor analysis comparing the structures in multiple groups can be done in the \Rpkg{lavaan} package. However, for exploratory analyses of the structure within each of multiple groups, the \pfun{faBy} function may be used in combination with the \pfun{statsBy} function. First run \pfun{statsBy} with the correlation option set to TRUE, and then run \pfun{faBy} on the resulting output.
\begin{scriptsize}
%\begin{Schunk}
\begin{Sinput}
sb <- statsBy(bfi[c(1:25,27)], group="education",cors=TRUE)
faBy(sb,nfactors=5) #find the 5 factor solution for each education level
\end{Sinput}
%\end{Schunk}
\end{scriptsize}

\section{Multiple regression, mediation, moderation, and set correlations}

The typical application of the \fun{lm} function is to do a linear model of one Y variable as a function of multiple X variables. Because \fun{lm} is designed to analyze complex interactions, it requires raw data as input. It is, however, sometimes convenient to do \iemph{multiple regression} from a correlation or covariance matrix. This is done using the \pfun{lmCor} function, which will work with either raw data, covariance matrices, or correlation matrices.

\subsection{Multiple regression from data or correlation matrices}

The \pfun{lmCor} function will take a set of y variables predicted from a set of x variables, perhaps with a set of z covariates removed from both x and y. Consider the \iemph{Thurstone} correlation matrix and find the multiple correlation of the last five variables as a function of the first four.

\begin{scriptsize}
<<>>=
lmCor(y = 5:9,x=1:4,data=Thurstone)
@
\end{scriptsize}

By specifying the number of subjects in the correlation matrix, appropriate estimates of standard errors, t-values, and probabilities are also found. The next example finds the regressions with variables 1 and 2 used as covariates. The $\hat{\beta}$ weights for variables 3 and 4 do not change, but the multiple correlation is much less. It also shows how to find the residual correlations between variables 5-9 with variables 1-4 removed.

\begin{scriptsize}
<<>>=
sc <- lmCor(y = 5:9,x=3:4,data=Thurstone,z=1:2)
round(sc$residual,2)
@
\end{scriptsize}

\subsection{Mediation and moderation analysis}

Although multiple regression is a straightforward method for determining the effect of multiple predictors ($x_{1}, x_{2}, \ldots, x_{i}$) on a criterion variable, y, some prefer to think of the effect of one predictor, x, as mediated by another variable, m \citep{preacher:04}. Thus, we may find the indirect path from x to m, and then from m to y as well as the direct path from x to y. Call these paths a, b, and c, respectively. Then the indirect effect of x on y through m is just ab and the direct effect is c. Statistical tests of the ab effect are best done by bootstrapping. This is discussed in detail in the ``How To use \pfun{mediate} and \pfun{lmCor} to do \href{https://personality-project.org/r/psych/HowTo/mediation.pdf}{mediation, moderation and regression analysis}'' tutorial.

Consider the example from \cite{preacher:04} as analyzed using the \pfun{mediate} function and the subsequent graphic from \pfun{mediate.diagram}. The data are found in the example for \pfun{mediate}.
\begin{scriptsize}
<<>>=
#data from Preacher and Hayes (2004)
sobel <- structure(list(SATIS = c(-0.59, 1.3, 0.02, 0.01, 0.79, -0.35,
    -0.03, 1.75, -0.8, -1.2, -1.27, 0.7, -1.59, 0.68, -0.39, 1.33,
    -1.59, 1.34, 0.1, 0.05, 0.66, 0.56, 0.85, 0.88, 0.14, -0.72,
    0.84, -1.13, -0.13, 0.2),
    THERAPY = structure(c(0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
    1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0),
    value.labels = structure(c(1, 0), .Names = c("cognitive", "standard"))),
    ATTRIB = c(-1.17, 0.04, 0.58, -0.23, 0.62, -0.26, -0.28, 0.52,
    0.34, -0.09, -1.09, 1.05, -1.84, -0.95, 0.15, 0.07, -0.1, 2.35,
    0.75, 0.49, 0.67, 1.21, 0.31, 1.97, -0.94, 0.11, -0.54, -0.23,
    0.05, -1.07)),
    .Names = c("SATIS", "THERAPY", "ATTRIB"), row.names = c(NA, -30L),
    class = "data.frame",
    variable.labels = structure(c("Satisfaction", "Therapy",
    "Attributional Positivity"),
    .Names = c("SATIS", "THERAPY", "ATTRIB")))
@

<<>>=
preacher <- mediate(SATIS ~ THERAPY + (ATTRIB),data=sobel)  #The example in Preacher and Hayes
@
\end{scriptsize}

\begin{figure}[htbp]
\begin{center}
\begin{scriptsize}
<<>>=
png('mediate.png')
mediate.diagram(preacher)
dev.off()
@
\end{scriptsize}
\includegraphics{mediate.png}
\caption{A mediated model taken from Preacher and Hayes, 2004 and solved using the \pfun{mediate} function. The direct path from Therapy to Satisfaction has an effect of .76, while the indirect path through Attribution has an effect of .33. Compare this to the conventional regression graphic in the next figure.}
\label{fig:mediate}
\end{center}
\end{figure}

\begin{figure}[htbp]
\begin{center}
\begin{scriptsize}
<<>>=
preacher.lm <- lmCor(SATIS ~ THERAPY + ATTRIB, data=sobel) #The example in Preacher and Hayes
@
<<>>=
png('preacherlm.png')
diagram(preacher.lm)
dev.off()
@
\end{scriptsize}
\includegraphics{preacherlm.png}
\caption{The conventional regression model for the Preacher and Hayes, 2004 data set solved using the \pfun{lmCor} function. Compare this to the previous figure.}
\label{fig:preacherlm}
\end{center}
\end{figure}

\begin{itemize}
\item \pfun{lmCor} will take raw data or a correlation matrix and find (and graph the path diagram for) multiple y variables depending upon multiple x variables.
\begin{scriptsize}
%\begin{Schunk}
\begin{Sinput}
lmCor(SATV + SATQ ~ education + age, data = sat.act, std=TRUE)
\end{Sinput}
%\end{Schunk}
\end{scriptsize}

\item \pfun{mediate} will take raw data or a correlation matrix and find (and graph the path diagram for) multiple y variables depending upon multiple x variables mediated through a mediating variable. It then tests the mediation effect using a bootstrap.
\begin{scriptsize}
%\begin{Schunk}
\begin{Sinput}
mediate( SATV ~ education + age + (ACT), data = sat.act,std=TRUE,n.iter=50)
\end{Sinput}
%\end{Schunk}
\end{scriptsize}

\item \pfun{mediate} will also take raw data and find (and graph the path diagram for) a moderated multiple regression model for multiple y variables depending upon multiple x variables mediated through a mediating variable. It will form the product term either from the mean centered data or from the raw data. It then tests the mediation effect using a bootstrap (a sketch of the call appears below). The data set is taken from \cite{garcia:10}. The number of iterations for the bootstrap was set to 50 for speed. The default number of bootstraps is 5000. See the help page for the \pfun{mediate} function for more details.
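A minimal sketch of such a call (using the \pfun{Garcia} data set supplied with \Rpkg{psych}; this is the same call that produces Figure~\ref{fig:garcia}):
\begin{scriptsize}
%\begin{Schunk}
\begin{Sinput}
mediate(respappr ~ prot2 * sexism + (sexism), data=Garcia, n.iter=50,
    main="Moderated mediation (mean centered)")
#see ?mediate for the argument that switches between mean centered
#and raw data when forming the product term
\end{Sinput}
%\end{Schunk}
\end{scriptsize}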
For a much longer discussion of how to use the \pfun{mediate} function, see the ``HowTo'' Using \pfun{mediate} and \pfun{lmCor} to do \href{https://personality-project.org/r/psych/HowTo/mediation.pdf}{mediation, moderation and regression analysis}.

\begin{figure}[htbp]
\begin{center}
\begin{scriptsize}
<<>>=
png('garcia.png')
mediate(respappr ~ prot2 * sexism +(sexism),data=Garcia,n.iter=50,
   main="Moderated mediation (mean centered)")
dev.off()
@
\end{scriptsize}
\includegraphics{garcia.png}
\caption{Moderated multiple regression requires the raw data. By default, the data are mean centered before finding the product term.}
\label{fig:garcia}
\end{center}
\end{figure}

\end{itemize}

\subsection{Set correlation using \pfun{lmCor}}

An important generalization of multiple regression and multiple correlation is \iemph{set correlation} developed by \cite{cohen:set} and discussed by \cite{cohen:03}. Set correlation is a multivariate generalization of multiple regression and estimates the amount of variance shared between two sets of variables. Set correlation also allows for examining the relationship between two sets when controlling for a third set. This is implemented in the \pfun{lmCor} function. Set correlation is
$$R^{2} = 1 - \prod_{i=1}^n(1-\lambda_{i})$$
where $\lambda_{i}$ is the ith eigen value of the eigen value decomposition of the matrix
$$R = R_{xx}^{-1}R_{xy}R_{yy}^{-1}R_{yx}.$$
Unfortunately, there are several cases where set correlation will give results that are much too high. This will happen if some variables from the first set are highly related to those in the second set, even though most are not. In that case, although the set correlation can be very high, the degree of relationship between the sets is not as high and an alternative statistic, based upon the average canonical correlation, might be more appropriate.

\pfun{lmCor} has the additional feature that it will calculate multiple and partial correlations from the correlation or covariance matrix rather than the original data.

Consider the correlations of the 6 variables in the \pfun{sat.act} data set. First do the normal multiple regression, and then compare it with the results using \pfun{lmCor}. There are two things to notice: \pfun{lmCor} works on the \emph{correlation} or \emph{covariance} or \emph{raw data} matrix, and thus will report standardized $\hat{\beta}$ weights when given a correlation matrix and unstandardized $\hat{\beta}$ weights when given a covariance matrix. Secondly, it is possible to do several multiple regressions simultaneously. If the number of observations is specified, or if the analysis is done on raw data, statistical tests of significance are applied. For this example, the analysis is done on the covariance matrix rather than the raw data.

\begin{scriptsize}
<<>>=
C <- cov(sat.act,use="pairwise")
model1 <- lm(ACT~ gender + education + age, data=sat.act)
summary(model1)
@

Compare this with the output from \pfun{lmCor}.

<<>>=
#compare the lm model with lmCor
lmCor(y = 4:6, x = 1:3, data = C, n.obs=700)
@
\end{scriptsize}

Note that the \pfun{lmCor} analysis also reports the amount of shared variance between the predictor set and the criterion (dependent) set. This set correlation is symmetric. That is, the $R^{2}$ is the same independent of the direction of the relationship.

\section{Converting output to APA style tables using \LaTeX}

Although for most purposes, using the \Rpkg{Sweave} or \Rpkg{knitr} packages produces clean output, some prefer output pre-formatted for APA style tables.
This can be done using the \Rpkg{xtable} package for almost anything, but there are a few simple functions in \Rpkg{psych} for the most common tables. \pfun{fa2latex} will convert a factor analysis or components analysis output to a \LaTeX{} table, \pfun{cor2latex} will take a correlation matrix and show the lower (or upper) diagonal, \pfun{irt2latex} converts the item statistics from the \pfun{irt.fa} function to more convenient \LaTeX{} output, and finally, \pfun{df2latex} converts a generic data frame to \LaTeX. An example of converting the output from \pfun{fa} to \LaTeX{} appears in Table~\ref{falatex}.

\begin{scriptsize}
\begin{table}[htpb]
\caption{fa2latex}
\begin{center}
\begin{tabular} {l r r r r r r }
 \multicolumn{ 6 }{l}{ A factor analysis table from the psych package in R } \cr
  \hline Variable  &   MR1  &  MR2  &  MR3  &  h2  &  u2  &  com \cr
  \hline
Sentences   &  0.91  & -0.04  &  0.04  & 0.82 & 0.18 & 1.01 \cr
Vocabulary  &  0.89  &  0.06  & -0.03  & 0.84 & 0.16 & 1.01 \cr
Sent.Completion  &  0.83  &  0.04  &  0.00  & 0.73 & 0.27 & 1.00 \cr
First.Letters  &  0.00  &  0.86  &  0.00  & 0.73 & 0.27 & 1.00 \cr
4.Letter.Words  & -0.01  &  0.74  &  0.10  & 0.63 & 0.37 & 1.04 \cr
Suffixes  &  0.18  &  0.63  & -0.08  & 0.50 & 0.50 & 1.20 \cr
Letter.Series  &  0.03  & -0.01  &  0.84  & 0.72 & 0.28 & 1.00 \cr
Pedigrees  &  0.37  & -0.05  &  0.47  & 0.50 & 0.50 & 1.93 \cr
Letter.Group  & -0.06  &  0.21  &  0.64  & 0.53 & 0.47 & 1.23 \cr
\hline \cr
SS loadings & 2.64 & 1.86 & 1.50 & \cr
\cr
 \hline \cr
MR1  &  1.00 & 0.59 & 0.54 \cr
MR2  &  0.59 & 1.00 & 0.52 \cr
MR3  &  0.54 & 0.52 & 1.00 \cr
\hline
\end{tabular}
\end{center}
\label{falatex}
\end{table}
\end{scriptsize}

\newpage

\section{Miscellaneous functions}

A number of functions have been developed for some very specific problems that don't fit into any other category. The following is an incomplete list. Look at the \iemph{Index} for \Rpkg{psych} for a list of all of the functions.

\begin{description}
\item [\pfun{block.random}] Creates a block randomized structure for n independent variables. Useful for teaching block randomization for experimental design.
\item [\pfun{df2latex}] is useful for taking tabular output (such as a correlation matrix or that of \pfun{describe}) and converting it to a \LaTeX{} table. May be used when Sweave is not convenient.
\item [\pfun{cor2latex}] Will format a correlation matrix in APA style in a \LaTeX{} table. See also \pfun{fa2latex} and \pfun{irt2latex}.
\item [\pfun{cosinor}] One of several functions for doing \iemph{circular statistics}. This is important when studying mood effects over the day, which show a diurnal pattern. See also \pfun{circadian.mean}, \pfun{circadian.cor} and \pfun{circadian.linear.cor} for finding circular means, circular correlations, and correlations of circular with linear data.
\item[\pfun{fisherz}] Convert a correlation to the corresponding Fisher z score.
\item [\pfun{geometric.mean}] and \pfun{harmonic.mean} find the appropriate mean for working with different kinds of data.
\item [\pfun{ICC}] and \pfun{cohen.kappa} are typically used to find the reliability for raters.
\item [\pfun{headTail}] combines the \fun{head} and \fun{tail} functions to show the first and last lines of a data set or output.
\item [\pfun{topBottom}] Same as \pfun{headTail}. Combines the \fun{head} and \fun{tail} functions to show the first and last lines of a data set or output, but does not add an ellipsis between them.
\item [\pfun{mardia}] calculates univariate or multivariate (Mardia's test) skew and kurtosis for a vector, matrix, or data.frame.
\item [\pfun{p.rep}] finds the probability of replication for an F, t, or r and estimates the effect size.
\item [\pfun{partial.r}] partials a y set of variables out of an x set and finds the resulting partial correlations. (See also \pfun{lmCor}.)
\item [\pfun{rangeCorrection}] will correct correlations for restriction of range.
\item [\pfun{reverse.code}] will reverse code specified items. Done more conveniently in most \Rpkg{psych} functions, but supplied here as a helper function when using other packages.
\item [\pfun{superMatrix}] Takes two or more matrices, e.g., A and B, and combines them into a ``Super matrix'' with A on the top left, B on the lower right, and 0s for the other two quadrants. A useful trick when forming complex keys, or when forming example problems.
\end{description}

\section{Data sets}

A number of data sets for demonstrating psychometric techniques are included in the \Rpkg{psych} package. These include six data sets showing a hierarchical factor structure (five cognitive examples: \pfun{Thurstone}, \pfun{Thurstone.33}, \pfun{Holzinger}, \pfun{Bechtoldt.1}, and \pfun{Bechtoldt.2}; and one from health psychology, \pfun{Reise}). One of these (\pfun{Thurstone}) is used as an example in the \Rpkg{sem} package as well as \cite{mcdonald:tt}. The original data are from \cite{thurstone:41} and were reanalyzed by \cite{bechtoldt:61}. Also included are personality item data representing five personality factors on 25 items (\pfun{bfi}), 135 items for 4,000 participants (\pfun{spi}), 13 personality inventory scores (\pfun{epi.bfi}), and 16 multiple choice IQ items (\pfun{iqitems}, \pfun{ability}). The \pfun{vegetables} example has paired comparison preferences for 9 vegetables. This is an example of Thurstonian scaling used by \cite{guilford:54} and \cite{nunnally:67}. Other data sets include \pfun{cubits}, \pfun{peas}, and \pfun{heights} from Galton.

\begin{description}
\item[Thurstone] \cite{holzinger:37} introduced the bifactor model of a general factor and uncorrelated group factors. The Holzinger correlation matrix is a 14 x 14 matrix from their paper. The Thurstone correlation matrix is a 9 x 9 matrix of correlations of ability items. The Reise data set is a 16 x 16 correlation matrix of mental health items. The Bechtoldt data sets are both 17 x 17 correlation matrices of ability tests.
\item [bfi] 25 personality self report items taken from the International Personality Item Pool (ipip.ori.org) were included as part of the Synthetic Aperture Personality Assessment (\iemph{SAPA}) web based personality assessment project. The data from 2,800 subjects are included here as a demonstration set for scale construction, factor analysis and Item Response Theory analyses.
\item [spi] 135 personality items and 10 demographic items for 4,000 subjects are taken from the Synthetic Aperture Personality Assessment (\iemph{SAPA}) web based personality assessment project \citep{sapa:16}. These 135 items form part of the SAPA Personality Inventory \citep{condon:spi}.
\item [sat.act] Self reported scores on the SAT Verbal, SAT Quantitative and ACT were collected as part of the Synthetic Aperture Personality Assessment (\iemph{SAPA}) web based personality assessment project. Age, gender, and education are also reported. The data from 700 subjects are included here as a demonstration set for correlation and analysis.
\item [epi.bfi] A small data set of 5 scales from the Eysenck Personality Inventory, 5 from a Big 5 inventory, a Beck Depression Inventory, and State and Trait Anxiety measures. Used for demonstrations of correlations, regressions, and graphic displays.
\item [iqitems] 16 multiple choice ability items were included as part of the Synthetic Aperture Personality Assessment (\iemph{SAPA}) web based personality assessment project. The data from 1,525 subjects are included here as a demonstration set for scoring multiple choice inventories and doing basic item statistics.
\item [ability] The same 16 items, converted to 0,1 scores, are used for examples of various IRT procedures. These data are from the \emph{International Cognitive Ability Resource} (ICAR) \citep{condon:icar:14} and were collected as part of the SAPA web based assessment project (\url{https://sapa-project.org}) \citep{sapa:16}.
\item [galton] Two of the earliest examples of the correlation coefficient were Francis Galton's data sets on the relationship between mid parent and child height and the similarity of parent generation peas with child peas. \pfun{galton} is the data set of the Galton heights. \pfun{peas} is the data set Francis Galton used to introduce the correlation coefficient with an analysis of the similarities of the parent and child generation of 700 sweet peas.
\item[Dwyer] \cite{dwyer:37} introduced a method for \emph{factor extension} (see \pfun{fa.extension}) that finds loadings on factors from an original data set for additional (extended) variables. This data set includes his example.
\item [miscellaneous] \pfun{cities} is a matrix of airline distances between 11 US cities and may be used for demonstrating multiple dimensional scaling. \pfun{vegetables} is a classic data set for demonstrating Thurstonian scaling and is the preference matrix of 9 vegetables from \cite{guilford:54}. Used by \cite{guilford:54,nunnally:67,nunnally:bernstein:84}, this data set allows for examples of basic scaling techniques.
\end{description}

\section{Development version and a users guide}

The most recent development version is available as a source file at the repository maintained at \url{https://personality-project.org/r}. That version will have removed the most recently discovered bugs (but perhaps introduced other, yet to be discovered ones). To download that version, go to the repository \url{https://personality-project.org/r/src/contrib/} and wander around. For both Macs and PCs, this version can be installed directly using the ``other repository'' option in the package installer. Make sure to specify \texttt{type="source"}:

%\begin{Schunk}
\begin{Sinput}
> install.packages("psych", repos="https://personality-project.org/r",
   type="source")
\end{Sinput}
%\end{Schunk}

For a PC, the zip file for the most recent release has been created using the win-builder facility at CRAN. The development release for the Mac is usually several weeks ahead of the PC development version.

Although the individual help pages for the \Rpkg{psych} package are available as part of \R{} and may be accessed directly (e.g., ?psych), the full manual for the \Rpkg{psych} package is also available as a pdf at \url{https://personality-project.org/r/psych_manual.pdf}.

News and a history of changes are available in the NEWS and CHANGES files in the source files.
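To check which version is currently installed (and thus whether the development release is newer), the base \R{} \fun{packageVersion} function is convenient:

\begin{scriptsize}
%\begin{Schunk}
\begin{Sinput}
packageVersion("psych") #which version is currently installed?
\end{Sinput}
%\end{Schunk}
\end{scriptsize}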
To view the most recent news,
%\begin{Schunk}
\begin{Sinput}
news(Version >= "2.3.5",package="psych")
\end{Sinput}
%\end{Schunk}

\section{Psychometric Theory}

The \Rpkg{psych} package has been developed to help psychologists do basic research. Many of the functions were developed to supplement a book, \emph{An introduction to Psychometric Theory with Applications in \R{}} \citep{revelle:intro} (\url{https://personality-project.org/r/book}). More information about the use of some of the functions may be found in the book. For more extensive discussion of the use of \Rpkg{psych} in particular and \R{} in general, consult \emph{A short guide to R} (\url{https://personality-project.org/r/r.guide.html}).

\section{SessionInfo}

This document was prepared using the following settings.

\begin{tiny}
<<>>=
sessionInfo()
@
\end{tiny}

\newpage
%\bibliography{/Volumes/WR/Documents/Active/book/all}
%\bibliography{all}
\begin{thebibliography}{}

\bibitem[\protect\astroncite{Bechtoldt}{1961}]{bechtoldt:61}
Bechtoldt, H. (1961).
\newblock An empirical study of the factor analysis stability hypothesis.
\newblock {\em Psychometrika}, 26(4):405--432.

\bibitem[\protect\astroncite{Blashfield}{1980}]{blashfield:80}
Blashfield, R.~K. (1980).
\newblock The growth of cluster analysis: {Tryon, Ward, and Johnson}.
\newblock {\em Multivariate Behavioral Research}, 15(4):439--458.

\bibitem[\protect\astroncite{Blashfield and Aldenderfer}{1988}]{blashfield:88}
Blashfield, R.~K. and Aldenderfer, M.~S. (1988).
\newblock The methods and problems of cluster analysis.
\newblock In Nesselroade, J.~R. and Cattell, R.~B., editors, {\em Handbook of multivariate experimental psychology (2nd ed.)}, pages 447--473. Plenum Press, New York, NY.

\bibitem[\protect\astroncite{Bliese}{2009}]{bliese:09}
Bliese, P.~D. (2009).
\newblock Multilevel modeling in {R} (2.3): a brief introduction to {R}, the multilevel package and the nlme package.

\bibitem[\protect\astroncite{Cattell}{1966}]{cattell:scree}
Cattell, R.~B. (1966).
\newblock The scree test for the number of factors.
\newblock {\em Multivariate Behavioral Research}, 1(2):245--276.

\bibitem[\protect\astroncite{Cattell}{1978}]{cattell:fa78}
Cattell, R.~B. (1978).
\newblock {\em The scientific use of factor analysis}.
\newblock Plenum Press, New York.

\bibitem[\protect\citeauthoryear{Bernaards \& Jennrich}{Bernaards \& Jennrich}{2005}]{gpa.rotate}
Bernaards, C.~A. and Jennrich, R.~I. (2005).
\newblock Gradient projection algorithms and software for arbitrary rotation criteria in factor analysis.
\newblock {\em Educational and Psychological Measurement}, 65:676--696.

\bibitem[\protect\astroncite{Cohen}{1982}]{cohen:set}
Cohen, J. (1982).
\newblock Set correlation as a general multivariate data-analytic method.
\newblock {\em Multivariate Behavioral Research}, 17(3).

\bibitem[\protect\astroncite{Cohen et~al.}{2003}]{cohen:03}
Cohen, J., Cohen, P., West, S.~G., and Aiken, L.~S. (2003).
\newblock {\em Applied multiple regression/correlation analysis for the behavioral sciences}.
\newblock L. Erlbaum Associates, Mahwah, N.J., 3rd edition.

\bibitem[\protect\citeauthoryear{Condon \& Revelle}{Condon \& Revelle}{2014}]{condon:icar:14}
Condon, D.~M. \& Revelle, W. (2014).
\newblock The {International Cognitive Ability Resource}: Development and initial validation of a public-domain measure.
\newblock {\em Intelligence}, {\em 43}, 52--64.

\bibitem[\protect\astroncite{Condon}{2018}]{condon:spi}
Condon, D.~M. (2018).
\newblock The {SAPA Personality Inventory}: an empirically-derived, hierarchically-organized self-report personality assessment model.
\newblock {PsyArXiv}.

\bibitem[\protect\astroncite{Cooksey and Soutar}{2006}]{cooksey:06}
Cooksey, R. and Soutar, G. (2006).
\newblock Coefficient beta and hierarchical item clustering - an analytical procedure for establishing and displaying the dimensionality and homogeneity of summated scales.
\newblock {\em Organizational Research Methods}, 9:78--98.

\bibitem[\protect\astroncite{Cronbach}{1951}]{cronbach:51}
Cronbach, L.~J. (1951).
\newblock Coefficient alpha and the internal structure of tests.
\newblock {\em Psychometrika}, 16:297--334.

\bibitem[\protect\astroncite{Dwyer}{1937}]{dwyer:37}
Dwyer, P.~S. (1937).
\newblock The determination of the factor loadings of a given test from the known factor loadings of other tests.
\newblock {\em Psychometrika}, 2(3):173--178.

\bibitem[\protect\astroncite{Eagly and Revelle}{2022}]{eagly:revelle}
Eagly, A.~H. and Revelle, W. (2022).
\newblock Understanding the magnitude of psychological differences between women and men requires seeing the forest and the trees (in press).
\newblock {\em Perspectives on Psychological Science}.

\bibitem[\protect\astroncite{Everitt}{1974}]{everitt:74}
Everitt, B. (1974).
\newblock {\em Cluster analysis}.
\newblock John Wiley \& Sons, Oxford, England. 122 pp.

\bibitem[\protect\astroncite{Fox et~al.}{2012}]{sem}
Fox, J., Nie, Z., and Byrnes, J. (2012).
\newblock {\em {sem: Structural Equation Models}}.

\bibitem[\protect\astroncite{Garcia et~al.}{2010}]{garcia:10}
Garcia, D.~M., Schmitt, M.~T., Branscombe, N.~R., and Ellemers, N. (2010).
\newblock Women's reactions to ingroup members who protest discriminatory treatment: The importance of beliefs about inequality and response appropriateness.
\newblock {\em European Journal of Social Psychology}, 40(5):733--745.

\bibitem[\protect\astroncite{Grice}{2001}]{grice:01}
Grice, J.~W. (2001).
\newblock Computing and evaluating factor scores.
\newblock {\em Psychological Methods}, 6(4):430--450.

\bibitem[\protect\astroncite{Gruber et~al.}{2020}]{gruber:20}
Gruber, F.~M., Distlberger, E., Scherndl, T., Ortner, T.~M., and Pletzer, B. (2020).
\newblock Psychometric properties of the multifaceted Gender-Related Attributes Survey ({GERAS}).
\newblock {\em European Journal of Psychological Assessment}, 36(4):612--623.

\bibitem[\protect\astroncite{Guilford}{1954}]{guilford:54}
Guilford, J.~P. (1954).
\newblock {\em Psychometric Methods}.
\newblock McGraw-Hill, New York, 2nd edition.

\bibitem[\protect\astroncite{Guttman}{1945}]{guttman:45}
Guttman, L. (1945).
\newblock A basis for analyzing test-retest reliability.
\newblock {\em Psychometrika}, 10(4):255--282.

\bibitem[\protect\astroncite{Hartigan}{1975}]{hartigan:75}
Hartigan, J.~A. (1975).
\newblock {\em Clustering Algorithms}.
\newblock John Wiley \& Sons, Inc., New York, NY, USA.

\bibitem[\protect\astroncite{Hayes}{2013}]{hayes:13}
Hayes, A.~F. (2013).
\newblock {\em Introduction to mediation, moderation, and conditional process analysis: A regression-based approach}.
\newblock Guilford Press, New York.

\bibitem[\protect\astroncite{Henry et~al.}{2005}]{henry:05}
Henry, D.~B., Tolan, P.~H., and Gorman-Smith, D. (2005).
\newblock Cluster analysis in family psychology research.
\newblock {\em Journal of Family Psychology}, 19(1):121--132.

\bibitem[\protect\astroncite{Holm}{1979}]{holm:79}
Holm, S. (1979).
\newblock A simple sequentially rejective multiple test procedure.
\newblock {\em Scandinavian Journal of Statistics}, 6(2):65--70.

\bibitem[\protect\astroncite{Holzinger and Swineford}{1937}]{holzinger:37}
Holzinger, K. and Swineford, F. (1937).
\newblock The bi-factor method.
\newblock {\em Psychometrika}, 2(1):41--54.
\bibitem[\protect\astroncite{Horn}{1965}]{horn:65}
Horn, J. (1965).
\newblock A rationale and test for the number of factors in factor analysis.
\newblock {\em Psychometrika}, 30(2):179--185.

\bibitem[\protect\astroncite{Horn and Engstrom}{1979}]{horn:79}
Horn, J.~L. and Engstrom, R. (1979).
\newblock Cattell's scree test in relation to {Bartlett's} chi-square test and other observations on the number of factors problem.
\newblock {\em Multivariate Behavioral Research}, 14(3):283--300.

\bibitem[\protect\astroncite{Jennrich and Bentler}{2011}]{jennrich:11}
Jennrich, R. and Bentler, P. (2011).
\newblock Exploratory bi-factor analysis.
\newblock {\em Psychometrika}, pages 1--13.
\newblock DOI: 10.1007/s11336-011-9218-4.

\bibitem[\protect\astroncite{Jensen and Weng}{1994}]{jensen:weng}
Jensen, A.~R. and Weng, L.-J. (1994).
\newblock What is a good g?
\newblock {\em Intelligence}, 18(3):231--258.

\bibitem[\protect\astroncite{Loevinger et~al.}{1953}]{loevinger:53}
Loevinger, J., Gleser, G., and DuBois, P. (1953).
\newblock Maximizing the discriminating power of a multiple-score test.
\newblock {\em Psychometrika}, 18(4):309--317.

\bibitem[\protect\astroncite{MacCallum et~al.}{2007}]{maccallum:07}
MacCallum, R.~C., Browne, M.~W., and Cai, L. (2007).
\newblock Factor analysis models as approximations.
\newblock In Cudeck, R. and MacCallum, R.~C., editors, {\em Factor analysis at 100: Historical developments and future directions}, pages 153--175. Lawrence Erlbaum Associates Publishers, Mahwah, NJ.

\bibitem[\protect\astroncite{Martinent and Ferrand}{2007}]{martinent:07}
Martinent, G. and Ferrand, C. (2007).
\newblock A cluster analysis of precompetitive anxiety: Relationship with perfectionism and trait anxiety.
\newblock {\em Personality and Individual Differences}, 43(7):1676--1686.

\bibitem[\protect\astroncite{McDonald}{1999}]{mcdonald:tt}
McDonald, R.~P. (1999).
\newblock {\em Test theory: {A} unified treatment}.
\newblock L. Erlbaum Associates, Mahwah, N.J.

\bibitem[\protect\astroncite{Mun et~al.}{2008}]{mun:08}
Mun, E.~Y., von Eye, A., Bates, M.~E., and Vaschillo, E.~G. (2008).
\newblock Finding groups using model-based cluster analysis: Heterogeneous emotional self-regulatory processes and heavy alcohol use risk.
\newblock {\em Developmental Psychology}, 44(2):481--495.

\bibitem[\protect\astroncite{Nunnally}{1967}]{nunnally:67}
Nunnally, J.~C. (1967).
\newblock {\em Psychometric theory}.
\newblock McGraw-Hill, New York.

\bibitem[\protect\astroncite{Nunnally and Bernstein}{1984}]{nunnally:bernstein:84}
Nunnally, J.~C. and Bernstein, I.~H. (1984).
\newblock {\em Psychometric theory}.
\newblock McGraw-Hill, New York, 3rd edition.

\bibitem[\protect\astroncite{Pedhazur}{1997}]{pedhazur:97}
Pedhazur, E. (1997).
\newblock {\em Multiple regression in behavioral research: explanation and prediction}.
\newblock Harcourt Brace College Publishers.

\bibitem[Preacher and Hayes, 2004]{preacher:04}
Preacher, K.~J. and Hayes, A.~F. (2004).
\newblock {SPSS and SAS} procedures for estimating indirect effects in simple mediation models.
\newblock {\em Behavior Research Methods, Instruments, \& Computers}, 36(4):717--731.

\bibitem[\protect\astroncite{Revelle}{1979}]{revelle:iclust}
Revelle, W. (1979).
\newblock Hierarchical cluster-analysis and the internal structure of tests.
\newblock {\em Multivariate Behavioral Research}, 14(1):57--74.

\bibitem[\protect\astroncite{Revelle}{2023}]{psych}
Revelle, W. (2023).
\newblock {\em psych: Procedures for Personality and Psychological Research}.
\newblock Northwestern University, Evanston.
\newblock R package version 2.3.5.

\bibitem[\protect\astroncite{Revelle}{prep}]{revelle:intro}
Revelle, W. ({in prep}).
\newblock {\em An introduction to psychometric theory with applications in {R}}.
\newblock Springer.

\bibitem[Revelle and Condon, 2018]{rc:reliability}
Revelle, W. and Condon, D.~M. (2018).
\newblock Reliability.
\newblock In Irwing, P., Booth, T., and Hughes, D., editors, {\em Wiley-Blackwell Handbook of Psychometric Testing}. Wiley-Blackwell.

\bibitem[Revelle and Condon, 2019]{rc:pa}
Revelle, W. and Condon, D.~M. (2019).
\newblock Reliability from alpha to omega: A tutorial.
\newblock {\em Psychological Assessment}, 31(12):1395--1411. \url{https://doi.org/10.1037/pas0000754}. Preprint available at \url{https://psyarxiv.com/2y3w9/}.

\bibitem[\protect\astroncite{Revelle et~al.}{2011}]{rcw:methods}
Revelle, W., Condon, D., and Wilt, J. (2011).
\newblock Methodological advances in differential psychology.
\newblock In Chamorro-Premuzic, T., Furnham, A., and von Stumm, S., editors, {\em Handbook of Individual Differences}, chapter~2, pages 39--73. Wiley-Blackwell.

\bibitem[\protect\citeauthoryear{Revelle, Condon, Wilt, French, Brown \& Elleman}{Revelle et~al.}{2016}]{sapa:16}
Revelle, W., Condon, D.~M., Wilt, J., French, J.~A., Brown, A., \& Elleman, L.~G. (2016).
\newblock Web and phone based data collection using planned missing designs.
\newblock In N.~G. Fielding, R.~M. Lee, \& G.~Blank (Eds.), {\em SAGE Handbook of Online Research Methods\/} (2nd ed.), chapter~37, (pp.\ 578--595). Sage Publications, Inc.

\bibitem[\protect\astroncite{Revelle and Rocklin}{1979}]{revelle:vss}
Revelle, W. and Rocklin, T. (1979).
\newblock {Very Simple Structure}: an alternative procedure for estimating the optimal number of interpretable factors.
\newblock {\em Multivariate Behavioral Research}, 14(4):403--414.

\bibitem[\protect\astroncite{Revelle et~al.}{2010}]{rwr:sapa}
Revelle, W., Wilt, J., and Rosenthal, A. (2010).
\newblock Personality and cognition: The personality-cognition link.
\newblock In Gruszka, A., Matthews, G., and Szymura, B., editors, {\em Handbook of Individual Differences in Cognition: Attention, Memory and Executive Control}, chapter~2, pages 27--49. Springer.

\bibitem[\protect\astroncite{Revelle and Zinbarg}{2009}]{rz:09}
Revelle, W. and Zinbarg, R.~E. (2009).
\newblock Coefficients alpha, beta, omega and the glb: comments on {Sijtsma}.
\newblock {\em Psychometrika}, 74(1):145--154.

\bibitem[\protect\astroncite{Schmid and Leiman}{1957}]{schmid:57}
Schmid, J.~J. and Leiman, J.~M. (1957).
\newblock The development of hierarchical factor solutions.
\newblock {\em Psychometrika}, 22(1):83--90.

\bibitem[\protect\astroncite{Shrout and Fleiss}{1979}]{shrout:79}
Shrout, P.~E. and Fleiss, J.~L. (1979).
\newblock Intraclass correlations: Uses in assessing rater reliability.
\newblock {\em Psychological Bulletin}, 86(2):420--428.

\bibitem[\protect\astroncite{Smillie et~al.}{2012}]{smillie:jpsp}
Smillie, L.~D., Cooper, A., Wilt, J., and Revelle, W. (2012).
\newblock Do extraverts get more bang for the buck? {Refining} the affective-reactivity hypothesis of extraversion.
\newblock {\em Journal of Personality and Social Psychology}, 103(2):306--326.

\bibitem[\protect\astroncite{Sneath and Sokal}{1973}]{sneath:73}
Sneath, P. H.~A. and Sokal, R.~R. (1973).
\newblock {\em Numerical taxonomy: the principles and practice of numerical classification}.
\newblock A Series of books in biology. W. H. Freeman, San Francisco.
\bibitem[\protect\astroncite{Sokal and Sneath}{1963}]{sokal:63}
Sokal, R.~R. and Sneath, P. H.~A. (1963).
\newblock {\em Principles of numerical taxonomy}.
\newblock A Series of books in biology. W. H. Freeman, San Francisco.

\bibitem[\protect\astroncite{Spearman}{1904}]{spearman:rho}
Spearman, C. (1904).
\newblock The proof and measurement of association between two things.
\newblock {\em The American Journal of Psychology}, 15(1):72--101.

\bibitem[\protect\astroncite{Steiger}{1980}]{steiger:80b}
Steiger, J.~H. (1980).
\newblock Tests for comparing elements of a correlation matrix.
\newblock {\em Psychological Bulletin}, 87(2):245--251.

\bibitem[\protect\astroncite{Tal-Or et~al.}{2010}]{talor:10}
Tal-Or, N., Cohen, J., Tsfati, Y., and Gunther, A.~C. (2010).
\newblock Testing causal direction in the influence of presumed media influence.
\newblock {\em Communication Research}, 37(6):801--824.

\bibitem[\protect\astroncite{Thorburn}{1918}]{thornburn:1918}
Thorburn, W.~M. (1918).
\newblock The myth of {Occam's} razor.
\newblock {\em Mind}, 27:345--353.

\bibitem[\protect\astroncite{Thurstone and Thurstone}{1941}]{thurstone:41}
Thurstone, L.~L. and Thurstone, T.~G. (1941).
\newblock {\em Factorial studies of intelligence}.
\newblock The University of Chicago Press, Chicago, Ill.

\bibitem[\protect\astroncite{Tryon}{1935}]{tryon:35}
Tryon, R.~C. (1935).
\newblock A theory of psychological components--an alternative to ``mathematical factors.''
\newblock {\em Psychological Review}, 42(5):425--454.

\bibitem[\protect\astroncite{Tryon}{1939}]{tryon:39}
Tryon, R.~C. (1939).
\newblock {\em Cluster analysis}.
\newblock Edwards Brothers, Ann Arbor, Michigan.

\bibitem[\protect\astroncite{Velicer}{1976}]{velicer:76}
Velicer, W. (1976).
\newblock Determining the number of components from the matrix of partial correlations.
\newblock {\em Psychometrika}, 41(3):321--327.

\bibitem[\protect\astroncite{Zinbarg et~al.}{2005}]{zinbarg:pm:05}
Zinbarg, R.~E., Revelle, W., Yovel, I., and Li, W. (2005).
\newblock Cronbach's {$\alpha$}, {Revelle's} {$\beta$}, and {McDonald's} {$\omega_H$}: Their relations with each other and two alternative conceptualizations of reliability.
\newblock {\em Psychometrika}, 70(1):123--133.

\bibitem[\protect\astroncite{Zinbarg et~al.}{2006}]{zinbarg:apm:06}
Zinbarg, R.~E., Yovel, I., Revelle, W., and McDonald, R.~P. (2006).
\newblock Estimating generalizability to a latent variable common to all of a scale's indicators: A comparison of estimators for {$\omega_h$}.
\newblock {\em Applied Psychological Measurement}, 30(2):121--144.

\end{thebibliography}

\printindex

\end{document}