#Running code and debugging it First we show some simple functions and step through them using the debug and browser functions.

#Dimension reduction through multivariate analysis

library(psych)
library(psychTools)

First, look at the Factor analysis handout. This discusses the problem of iterative fit as well as the basic logic of factor analysis.

###Iterative Fit

The basic issue in fitting is to try approximations and then adjust the fit to get a better approximation.

Consider an iterative solution to the square root. (Clearly not as powerful as just asking for `sqrt`)

#' Approximate sqrt(X) by the Babylonian (Heron's) method.
#'
#' Each step replaces the current guess with the mean of the guess and
#' X/guess; this converges quadratically to sqrt(X) for positive X.
#'
#' @param X      the non-negative number whose square root is sought
#' @param guess  starting value for the iteration (defaults to 1)
#' @param n.iter number of iterations to record (defaults to 10, the
#'               original hard-coded value, so existing calls are unchanged)
#' @return a matrix with one row per iteration and columns
#'   "Guess", "X/Guess", and "Error" (change in the guess at that step)
iterative.sqrt <- function(X, guess, n.iter = 10) {
  if (missing(guess)) guess <- 1  # a dumb but serviceable starting value
  # One Babylonian update: average the current guess with X/guess.
  iterate <- function(X, guess) {
    low <- guess
    high <- X / guess
    (high + low) / 2
  }
  Iter <- matrix(NA, ncol = 3, nrow = n.iter)  # preallocate the trace
  colnames(Iter) <- c("Guess", "X/Guess", "Error")
  for (i in seq_len(n.iter)) {   # seq_len is safe even if n.iter is 0
    Iter[i, 1] <- guess
    Iter[i, 2] <- guess <- iterate(X, guess)  # update and record in one step
    Iter[i, 3] <- Iter[i, 1] - Iter[i, 2]
  }
  Iter  # return the iteration history
}

# Run the iterative square-root demo on 47, starting from a guess of 1,
# then plot the successive columns of the trace with matplot.
result <- iterative.sqrt(X = 47, guess = 1)

matplot(result, type = "b")  # each column of the result matrix becomes one line

Do this function again, with a different starting value

result <- iterative.sqrt(7228)  #omit the guess; the function defaults to 1
result  #show the numbers
##            Guess    X/Guess         Error
##  [1,]    1.00000 3614.50000 -3.613500e+03
##  [2,] 3614.50000 1808.24986  1.806250e+03
##  [3,] 1808.24986  906.12355  9.021263e+02
##  [4,]  906.12355  457.05019  4.490734e+02
##  [5,]  457.05019  236.43232  2.206179e+02
##  [6,]  236.43232  133.50172  1.029306e+02
##  [7,]  133.50172   93.82167  3.968005e+01
##  [8,]   93.82167   85.43072  8.390951e+00
##  [9,]   85.43072   85.01864  4.120770e-01
## [10,]   85.01864   85.01765  9.986482e-04
matplot(result,type="b")  #plot the convergence of all three columns

Multidimensional analysis

Many procedures use this concept of iterative fitting

Some, such as principal components, are "closed form" and can just be solved directly

Others, such as "factor analysis", need to find solutions by successive approximations

The issue of factor or component rotation is an iterative procedure.

Principal Components and Factor analysis are all the result of some basic matrix equations to approximate a matrix.

A brief digression about matrices

Matrices have been used to solve simultaneous equations since at least the time of the Babylonians and the early Chinese. They were first introduced into psychology in the early 1930s. They are used in almost all statistical tests. Sometimes the use is hidden from the user, sometimes it is made explicit.

As we know, a matrix is just a two dimensional representation of our data. All elements of the matrix must be of the same type. For data analysis, the elements of a matrix are numbers. Matrices have rows and columns. Their dimensionality is found by using dim.

Correlations as matrix operations

For a data matrix X, if we convert the observations to deviation scores (i.e., subtract all scores from the mean), then find the matrix product of X with itself, and divide by the number of subjects, we have a Covariance matrix. The diagonal of that matrix is a vector of variances.

\[C = X'X/(N-1) \]

Show this with a small data set

We use the attitude data set

library(psych)  #already loaded above; repeated here so the chunk is self-contained
X <- attitude  #the built-in attitude data set (30 departments, 7 ratings)
head(X) #show the first six lines (head defaults to 6 rows)
##   rating complaints privileges learning raises critical advance
## 1     43         51         30       39     61       92      45
## 2     63         64         51       54     63       73      47
## 3     71         70         68       69     76       86      48
## 4     61         63         45       47     54       84      35
## 5     81         78         56       66     71       83      47
## 6     43         55         49       44     54       49      34
dim(X) #30 rows (observations) by 7 columns (variables)
## [1] 30  7
x <-scale(X,scale=FALSE) # find the deviation scores using the scale function
head(x)  #each column now has mean 0
##          rating complaints privileges   learning     raises   critical   advance
## [1,] -21.633333      -15.6 -23.133333 -17.366667  -3.633333  17.233333  2.066667
## [2,]  -1.633333       -2.6  -2.133333  -2.366667  -1.633333  -1.766667  4.066667
## [3,]   6.366667        3.4  14.866667  12.633333  11.366667  11.233333  5.066667
## [4,]  -3.633333       -3.6  -8.133333  -9.366667 -10.633333   9.233333 -7.933333
## [5,]  16.366667       11.4   2.866667   9.633333   6.366667   8.233333  4.066667
## [6,] -21.633333      -11.6  -4.133333 -12.366667 -10.633333 -25.766667 -8.933333
z <- scale(X) #or find the standard scores (mean 0, sd 1)
head(z)
##          rating complaints privileges   learning     raises   critical    advance
## [1,] -1.7772211 -1.1716323 -1.8906841 -1.4796496 -0.3494522  1.7416366  0.2008675
## [2,] -0.1341816 -0.1952721 -0.1743570 -0.2016413 -0.1570932 -0.1785430  0.3952554
## [3,]  0.5230342  0.2553558  1.2150506  1.0763670  1.0932404  1.1352641  0.4924494
## [4,] -0.2984855 -0.2703767 -0.6647362 -0.7980452 -1.0227087  0.9331399 -0.7710720
## [5,]  1.3445540  0.8561929  0.2342923  0.8207653  0.6123428  0.8320778  0.3952554
## [6,] -1.7772211 -0.8712138 -0.3378168 -1.0536469 -1.0227087 -2.6040331 -0.8682660
C <- t(x) %*%x/(nrow(x)-1) #matrix multiplication  gives the covariances
round(C,2)  #C = X'X/(N-1) on the deviation scores
##            rating complaints privileges learning raises critical advance
## rating     148.17     133.78      63.46    89.10  74.69    18.84   19.42
## complaints 133.78     177.28      90.95    93.26  92.64    24.73   30.77
## privileges  63.46      90.95     149.71    70.85  56.67    17.83   43.22
## learning    89.10      93.26      70.85   137.76  78.14    13.47   64.20
## raises      74.69      92.64      56.67    78.14 108.10    38.77   61.42
## critical    18.84      24.73      17.83    13.47  38.77    97.91   28.85
## advance     19.42      30.77      43.22    64.20  61.42    28.85  105.86
V <- diag(C)  #the diagonal of the covariance matrix holds the variances
sdi <- diag(1/sqrt(V))  #divide by the standard deviation
R <- sdi %*% C %*% t(sdi)  #pre- and post-multiply to standardize the covariances
round(R,2)  #this is the correlation matrix (names are lost by the matrix ops)
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 1.00 0.83 0.43 0.62 0.59 0.16 0.16
## [2,] 0.83 1.00 0.56 0.60 0.67 0.19 0.22
## [3,] 0.43 0.56 1.00 0.49 0.45 0.15 0.34
## [4,] 0.62 0.60 0.49 1.00 0.64 0.12 0.53
## [5,] 0.59 0.67 0.45 0.64 1.00 0.38 0.57
## [6,] 0.16 0.19 0.15 0.12 0.38 1.00 0.28
## [7,] 0.16 0.22 0.34 0.53 0.57 0.28 1.00
r <- t(z) %*% z/(nrow(z)-1) #or the correlations matrix
round(r,2)  #same values as R above, but the names are kept
##            rating complaints privileges learning raises critical advance
## rating       1.00       0.83       0.43     0.62   0.59     0.16    0.16
## complaints   0.83       1.00       0.56     0.60   0.67     0.19    0.22
## privileges   0.43       0.56       1.00     0.49   0.45     0.15    0.34
## learning     0.62       0.60       0.49     1.00   0.64     0.12    0.53
## raises       0.59       0.67       0.45     0.64   1.00     0.38    0.57
## critical     0.16       0.19       0.15     0.12   0.38     1.00    0.28
## advance      0.16       0.22       0.34     0.53   0.57     0.28    1.00
#or
r <- lowerCor(X) #psych's lowerCor computes and displays the lower triangle
##            ratng cmpln prvlg lrnng raiss crtcl advnc
## rating     1.00                                     
## complaints 0.83  1.00                               
## privileges 0.43  0.56  1.00                         
## learning   0.62  0.60  0.49  1.00                   
## raises     0.59  0.67  0.45  0.64  1.00             
## critical   0.16  0.19  0.15  0.12  0.38  1.00       
## advance    0.16  0.22  0.34  0.53  0.57  0.28  1.00

Principal Components analysis and Factor Analysis

Conceptually, we are just taking the square root of a correlation matrix:

\[R \approx CC' \] or \[R \approx F F' + U^2 \]

For any given correlation matrix, R, can we find a C or an F matrix which approximates it?

A short digression into Eigen decomposition (optional)

A square matrix R can be decomposed into a matrix of orthogonal eigen vectors and a set of scaling eigen values

\[R = X \lambda X'\] such that \[X X' = I \]

We do this using the eigen function

ev <- eigen(R)   #find the eigen values and eigen vectors
ev  #a list with components $values and $vectors
## eigen() decomposition
## $values
## [1] 3.7163758 1.1409219 0.8471915 0.6128697 0.3236728 0.2185306 0.1404378
## 
## $vectors
##            [,1]        [,2]       [,3]       [,4]         [,5]        [,6]         [,7]
## [1,] -0.4130048  0.39692583 -0.2634492  0.2341088 -0.143063817  0.41241010 -0.597591161
## [2,] -0.4405379  0.33362706 -0.2256118  0.0022033  0.278064283  0.22805897  0.717205075
## [3,] -0.3547748  0.09575954  0.1882432 -0.8906996 -0.005272287 -0.07598537 -0.174304627
## [4,] -0.4285613  0.04510225  0.3252857  0.2393794 -0.697628303 -0.35282836  0.200036529
## [5,] -0.4471312 -0.17917304 -0.0404021  0.2428067  0.556488662 -0.58548329 -0.234335158
## [6,] -0.1853508 -0.60263473 -0.7008081 -0.1493497 -0.292800611  0.01154899  0.056334214
## [7,] -0.3025594 -0.56979573  0.4956644  0.1152367  0.141732580  0.55201569  0.004296286
# Reconstruct R from its decomposition: R = X lambda X'.
evect <- ev$vectors  #the eigen vectors
evalue = ev$values   #the eigen values (note: <- is preferred over = for assignment)
round(evect %*% diag(evalue) %*% t(evect), 2)  #recovers the correlation matrix exactly
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 1.00 0.83 0.43 0.62 0.59 0.16 0.16
## [2,] 0.83 1.00 0.56 0.60 0.67 0.19 0.22
## [3,] 0.43 0.56 1.00 0.49 0.45 0.15 0.34
## [4,] 0.62 0.60 0.49 1.00 0.64 0.12 0.53
## [5,] 0.59 0.67 0.45 0.64 1.00 0.38 0.57
## [6,] 0.16 0.19 0.15 0.12 0.38 1.00 0.28
## [7,] 0.16 0.22 0.34 0.53 0.57 0.28 1.00

Principal components are just a simple way of representing eigen values and eigen vectors

if \[R = X \lambda X' \] then we call the principal component \[C = X \sqrt{\lambda}\]

# Principal components by hand: scale the eigen vectors by the square roots
# of the eigen values, so that PC %*% t(PC) reproduces R exactly.
PC <- evect %*% diag(sqrt(evalue))   #use <- for assignment, not =
round(PC,2)
##       [,1]  [,2]  [,3]  [,4]  [,5]  [,6]  [,7]
## [1,] -0.80  0.42 -0.24  0.18 -0.08  0.19 -0.22
## [2,] -0.85  0.36 -0.21  0.00  0.16  0.11  0.27
## [3,] -0.68  0.10  0.17 -0.70  0.00 -0.04 -0.07
## [4,] -0.83  0.05  0.30  0.19 -0.40 -0.16  0.07
## [5,] -0.86 -0.19 -0.04  0.19  0.32 -0.27 -0.09
## [6,] -0.36 -0.64 -0.65 -0.12 -0.17  0.01  0.02
## [7,] -0.58 -0.61  0.46  0.09  0.08  0.26  0.00
round(PC %*% t(PC),2)  #the full set of components reproduces R perfectly
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 1.00 0.83 0.43 0.62 0.59 0.16 0.16
## [2,] 0.83 1.00 0.56 0.60 0.67 0.19 0.22
## [3,] 0.43 0.56 1.00 0.49 0.45 0.15 0.34
## [4,] 0.62 0.60 0.49 1.00 0.64 0.12 0.53
## [5,] 0.59 0.67 0.45 0.64 1.00 0.38 0.57
## [6,] 0.16 0.19 0.15 0.12 0.38 1.00 0.28
## [7,] 0.16 0.22 0.34 0.53 0.57 0.28 1.00

Use the principal function

pc <- principal(X,7,rotate="none")  #all 7 unrotated components via psych
pc  #printing shows the loadings and fit statistics
## Principal Components Analysis
## Call: principal(r = X, nfactors = 7, rotate = "none")
## Standardized loadings (pattern matrix) based upon correlation matrix
##             PC1   PC2   PC3   PC4   PC5   PC6   PC7 h2      u2 com
## rating     0.80 -0.42  0.24 -0.18  0.08  0.19  0.22  1 2.2e-16 2.3
## complaints 0.85 -0.36  0.21  0.00 -0.16  0.11 -0.27  1 2.1e-15 1.8
## privileges 0.68 -0.10 -0.17  0.70  0.00 -0.04  0.07  1 6.7e-16 2.2
## learning   0.83 -0.05 -0.30 -0.19  0.40 -0.16 -0.07  1 2.0e-15 2.0
## raises     0.86  0.19  0.04 -0.19 -0.32 -0.27  0.09  1 1.0e-15 1.8
## critical   0.36  0.64  0.65  0.12  0.17  0.01 -0.02  1 4.4e-16 2.8
## advance    0.58  0.61 -0.46 -0.09 -0.08  0.26  0.00  1 1.8e-15 3.3
## 
##                        PC1  PC2  PC3  PC4  PC5  PC6  PC7
## SS loadings           3.72 1.14 0.85 0.61 0.32 0.22 0.14
## Proportion Var        0.53 0.16 0.12 0.09 0.05 0.03 0.02
## Cumulative Var        0.53 0.69 0.81 0.90 0.95 0.98 1.00
## Proportion Explained  0.53 0.16 0.12 0.09 0.05 0.03 0.02
## Cumulative Proportion 0.53 0.69 0.81 0.90 0.95 0.98 1.00
## 
## Mean item complexity =  2.3
## Test of the hypothesis that 7 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0 
##  with the empirical chi square  0  with prob <  NA 
## 
## Fit based upon off diagonal values = 1

But, this is not very helpful

However, if we take just the first or first 2 pcs, we get a nice description

pc1 <- principal(X)  #default is one unrotated principal component
pc1
## Principal Components Analysis
## Call: principal(r = X)
## Standardized loadings (pattern matrix) based upon correlation matrix
##             PC1   h2   u2 com
## rating     0.80 0.63 0.37   1
## complaints 0.85 0.72 0.28   1
## privileges 0.68 0.47 0.53   1
## learning   0.83 0.68 0.32   1
## raises     0.86 0.74 0.26   1
## critical   0.36 0.13 0.87   1
## advance    0.58 0.34 0.66   1
## 
##                 PC1
## SS loadings    3.72
## Proportion Var 0.53
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 component is sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.13 
##  with the empirical chi square  21.28  with prob <  0.095 
## 
## Fit based upon off diagonal values = 0.92

Statistics is model fitting

Data = Model + Residual

Our model is one principal component, the data is the correlation matrix, how well does it fit?

# Residual = Data - Model: what remains of R after removing the
# one-component model.  Use <- for assignment (not =) and the full element
# name "loadings" rather than relying on $ partial matching of "loading".
residual <- R - pc1$loadings %*% t(pc1$loadings)
round(residual,2)
##            rating complaints privileges learning raises critical advance
## rating       0.37       0.15      -0.12    -0.03  -0.10    -0.13   -0.31
## complaints   0.15       0.28      -0.02    -0.10  -0.06    -0.12   -0.27
## privileges  -0.12      -0.02       0.53    -0.07  -0.14    -0.10   -0.06
## learning    -0.03      -0.10      -0.07     0.32  -0.07    -0.18    0.05
## raises      -0.10      -0.06      -0.14    -0.07   0.26     0.07    0.07
## critical    -0.13      -0.12      -0.10    -0.18   0.07     0.87    0.07
## advance     -0.31      -0.27      -0.06     0.05   0.07     0.07    0.66
#or
residuals(pc1)   #the residuals method reports the same values, lower triangle only
##            ratng cmpln prvlg lrnng raiss crtcl advnc
## rating      0.37                                    
## complaints  0.15  0.28                              
## privileges -0.12 -0.02  0.53                        
## learning   -0.03 -0.10 -0.07  0.32                  
## raises     -0.10 -0.06 -0.14 -0.07  0.26            
## critical   -0.13 -0.12 -0.10 -0.18  0.07  0.87      
## advance    -0.31 -0.27 -0.06  0.05  0.07  0.07  0.66

Factor Analysis fits the correlations/covariances of the off diagonal

Components analysis fits the entire matrix \[R \approx CC' \] but factor analysis fits the common part of the matrix and guesses on the diagonal \[R \approx F F' + U^2 \]

A very simple example (created using sim.congeneric)

First create a matrix using a simulation function

library(psych)  #already attached; repeated so this chunk stands alone
R <- sim.congeneric(load =c(.9,.8,.7,.6))  #population correlations implied by these loadings
lowerMat(R)  #display the lower triangle
##    V1   V2   V3   V4  
## V1 1.00               
## V2 0.72 1.00          
## V3 0.63 0.56 1.00     
## V4 0.54 0.48 0.42 1.00

Can we approximate this (yes, think about your multiplication tables)

R <- sim.congeneric(load =c(.9,.8,.7,.6))  #recreate the same congeneric matrix
lowerMat(R)
##    V1   V2   V3   V4  
## V1 1.00               
## V2 0.72 1.00          
## V3 0.63 0.56 1.00     
## V4 0.54 0.48 0.42 1.00
f1 <- fa(R)  #use the factor analysis function in `psych`
f1  #the loadings recover the .9/.8/.7/.6 used in the simulation
## Factor Analysis using method =  minres
## Call: fa(r = R)
## Standardized loadings (pattern matrix) based upon correlation matrix
##    MR1   h2   u2 com
## V1 0.9 0.81 0.19   1
## V2 0.8 0.64 0.36   1
## V3 0.7 0.49 0.51   1
## V4 0.6 0.36 0.64   1
## 
##                 MR1
## SS loadings    2.30
## Proportion Var 0.57
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 factor is sufficient.
## 
## The degrees of freedom for the null model are  6  and the objective function was  1.65
## The degrees of freedom for the model are 2  and the objective function was  0 
## 
## The root mean square of the residuals (RMSR) is  0 
## The df corrected root mean square of the residuals is  0 
## 
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                    MR1
## Correlation of (regression) scores with factors   0.94
## Multiple R square of scores with factors          0.88
## Minimum correlation of possible factor scores     0.77
#look at the factor model (F F', the model-implied correlations)
f1$model
##           V1        V2        V3        V4
## V1 0.8099996 0.7199991 0.6300007 0.5399999
## V2 0.7199991 0.6399987 0.5600002 0.4799996
## V3 0.6300007 0.5600002 0.4900014 0.4200006
## V4 0.5399999 0.4799996 0.4200006 0.3600001
#examine residuals  (Data - Model)

residuals(f1)  #off-diagonals are 0: the one-factor model fits perfectly
##    V1   V2   V3   V4  
## V1 0.19               
## V2 0.00 0.36          
## V3 0.00 0.00 0.51     
## V4 0.00 0.00 0.00 0.64

Simulate some more data using ‘sim.item’

First lets look at the function

sim.item  #typing a function's name without parentheses prints its source code

Now run the function with 12 variables

my.data <- sim.item(12) #use the defaults
describe(my.data)  #always look at basic descriptives before modeling
##     vars   n  mean   sd median trimmed  mad   min  max range  skew kurtosis   se
## V1     1 500 -0.02 1.00  -0.04   -0.01 1.02 -2.83 3.32  6.15 -0.04     0.03 0.04
## V2     2 500 -0.03 0.99   0.04    0.00 0.96 -2.78 2.33  5.12 -0.20    -0.24 0.04
## V3     3 500 -0.02 1.03  -0.04   -0.04 1.02 -2.88 3.13  6.01  0.14    -0.22 0.05
## V4     4 500  0.00 0.99  -0.02   -0.01 1.01 -2.92 3.72  6.63  0.08     0.16 0.04
## V5     5 500  0.03 1.04   0.05    0.02 0.99 -3.29 3.45  6.74 -0.01     0.21 0.05
## V6     6 500 -0.01 1.03  -0.03   -0.01 1.06 -2.81 2.96  5.77 -0.01    -0.28 0.05
## V7     7 500 -0.01 1.00  -0.03   -0.01 1.00 -3.03 3.73  6.75  0.04     0.13 0.04
## V8     8 500  0.03 1.01  -0.03    0.02 1.00 -2.87 3.39  6.25  0.14    -0.07 0.05
## V9     9 500 -0.02 1.03   0.00   -0.02 1.06 -2.89 3.59  6.48  0.09     0.03 0.05
## V10   10 500 -0.01 0.98   0.02    0.00 0.97 -2.56 2.70  5.27 -0.03    -0.37 0.04
## V11   11 500  0.03 1.06   0.01    0.01 1.07 -3.10 3.00  6.11  0.14    -0.08 0.05
## V12   12 500  0.00 1.00   0.00    0.00 1.02 -2.82 2.72  5.54  0.02    -0.17 0.04
corPlot(my.data) #show the structure

f2 <- fa(my.data,2) #extract two factors
## Loading required namespace: GPArotation
f2  #show them
## Factor Analysis using method =  minres
## Call: fa(r = my.data, nfactors = 2)
## Standardized loadings (pattern matrix) based upon correlation matrix
##       MR1   MR2   h2   u2 com
## V1   0.02 -0.58 0.34 0.66   1
## V2  -0.02 -0.54 0.29 0.71   1
## V3   0.05 -0.64 0.41 0.59   1
## V4   0.59  0.03 0.35 0.65   1
## V5   0.56 -0.02 0.32 0.68   1
## V6   0.66  0.02 0.43 0.57   1
## V7   0.02  0.62 0.38 0.62   1
## V8   0.05  0.56 0.31 0.69   1
## V9   0.00  0.60 0.36 0.64   1
## V10 -0.60  0.01 0.36 0.64   1
## V11 -0.55  0.01 0.31 0.69   1
## V12 -0.64  0.02 0.41 0.59   1
## 
##                        MR1  MR2
## SS loadings           2.17 2.08
## Proportion Var        0.18 0.17
## Cumulative Var        0.18 0.35
## Proportion Explained  0.51 0.49
## Cumulative Proportion 0.51 1.00
## 
##  With factor correlations of 
##       MR1   MR2
## MR1  1.00 -0.06
## MR2 -0.06  1.00
## 
## Mean item complexity =  1
## Test of the hypothesis that 2 factors are sufficient.
## 
## The degrees of freedom for the null model are  66  and the objective function was  2.45 with Chi Square of  1212.71
## The degrees of freedom for the model are 43  and the objective function was  0.1 
## 
## The root mean square of the residuals (RMSR) is  0.03 
## The df corrected root mean square of the residuals is  0.03 
## 
## The harmonic number of observations is  500 with the empirical chi square  43.96  with prob <  0.43 
## The total number of observations was  500  with Likelihood Chi Square =  51.16  with prob <  0.18 
## 
## Tucker Lewis Index of factoring reliability =  0.989
## RMSEA index =  0.019  and the 90 % confidence intervals are  0 0.038
## BIC =  -216.07
## Fit based upon off diagonal values = 0.99
## Measures of factor score adequacy             
##                                                    MR1  MR2
## Correlation of (regression) scores with factors   0.88 0.87
## Multiple R square of scores with factors          0.78 0.76
## Minimum correlation of possible factor scores     0.55 0.53
fa.plot(f2)   #another way of showing structure

## Lets try some real data

Use the bfi data set

describe(bfi[1:5]) #always describe first
##    vars    n mean   sd median trimmed  mad min max range  skew kurtosis   se
## A1    1 2784 2.41 1.41      2    2.23 1.48   1   6     5  0.83    -0.31 0.03
## A2    2 2773 4.80 1.17      5    4.98 1.48   1   6     5 -1.12     1.05 0.02
## A3    3 2774 4.60 1.30      5    4.79 1.48   1   6     5 -1.00     0.44 0.02
## A4    4 2781 4.70 1.48      5    4.93 1.48   1   6     5 -1.03     0.04 0.03
## A5    5 2784 4.56 1.26      5    4.71 1.48   1   6     5 -0.85     0.16 0.02
R <- lowerCor(bfi[1:5])  #just the first 5 from the bfi
##    A1    A2    A3    A4    A5   
## A1  1.00                        
## A2 -0.34  1.00                  
## A3 -0.27  0.49  1.00            
## A4 -0.15  0.34  0.36  1.00      
## A5 -0.18  0.39  0.50  0.31  1.00
f1 <- fa(R)  #one factor from the 5 agreeableness items
f1 #note the negative loading for the reversed item A1
## Factor Analysis using method =  minres
## Call: fa(r = R)
## Standardized loadings (pattern matrix) based upon correlation matrix
##      MR1   h2   u2 com
## A1 -0.37 0.14 0.86   1
## A2  0.68 0.46 0.54   1
## A3  0.75 0.57 0.43   1
## A4  0.48 0.23 0.77   1
## A5  0.61 0.38 0.62   1
## 
##                 MR1
## SS loadings    1.78
## Proportion Var 0.36
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 factor is sufficient.
## 
## The degrees of freedom for the null model are  10  and the objective function was  0.93
## The degrees of freedom for the model are 5  and the objective function was  0.03 
## 
## The root mean square of the residuals (RMSR) is  0.04 
## The df corrected root mean square of the residuals is  0.05 
## 
## Fit based upon off diagonal values = 0.99
## Measures of factor score adequacy             
##                                                    MR1
## Correlation of (regression) scores with factors   0.87
## Multiple R square of scores with factors          0.76
## Minimum correlation of possible factor scores     0.53
residuals(f1)  #small off-diagonal residuals: one factor fits these 5 items well
##    A1    A2    A3    A4    A5   
## A1  0.86                        
## A2 -0.09  0.54                  
## A3  0.02 -0.03  0.43            
## A4  0.03  0.01  0.00  0.77      
## A5  0.05 -0.03  0.04  0.01  0.62

show the item content as well

fa.lookup(f1,dictionary = bfi.dictionary[1:2])  #match loadings to item text, sorted by loading
##      MR1 com   h2 ItemLabel                                      Item
## A3  0.75   1 0.57    q_1206               Know how to comfort others.
## A2  0.68   1 0.46    q_1162         Inquire about others' well-being.
## A5  0.61   1 0.38    q_1419                 Make people feel at ease.
## A4  0.48   1 0.23    q_1364                            Love children.
## A1 -0.37   1 0.14     q_146 Am indifferent to the feelings of others.

##But what if we included more items?

f1a <- fa(bfi[1:10])  #one factor forced on 10 items from two different scales
f1a  #fit statistics are much worse than for the 5-item solution
## Factor Analysis using method =  minres
## Call: fa(r = bfi[1:10])
## Standardized loadings (pattern matrix) based upon correlation matrix
##      MR1    h2   u2 com
## A1 -0.26 0.065 0.93   1
## A2  0.52 0.269 0.73   1
## A3  0.53 0.285 0.71   1
## A4  0.48 0.229 0.77   1
## A5  0.49 0.239 0.76   1
## C1  0.41 0.168 0.83   1
## C2  0.50 0.248 0.75   1
## C3  0.47 0.221 0.78   1
## C4 -0.52 0.273 0.73   1
## C5 -0.51 0.260 0.74   1
## 
##                 MR1
## SS loadings    2.26
## Proportion Var 0.23
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 factor is sufficient.
## 
## The degrees of freedom for the null model are  45  and the objective function was  2.03 with Chi Square of  5664.89
## The degrees of freedom for the model are 35  and the objective function was  0.82 
## 
## The root mean square of the residuals (RMSR) is  0.12 
## The df corrected root mean square of the residuals is  0.14 
## 
## The harmonic number of observations is  2762 with the empirical chi square  3714.94  with prob <  0 
## The total number of observations was  2800  with Likelihood Chi Square =  2299.01  with prob <  0 
## 
## Tucker Lewis Index of factoring reliability =  0.482
## RMSEA index =  0.152  and the 90 % confidence intervals are  0.147 0.157
## BIC =  2021.21
## Fit based upon off diagonal values = 0.77
## Measures of factor score adequacy             
##                                                    MR1
## Correlation of (regression) scores with factors   0.87
## Multiple R square of scores with factors          0.75
## Minimum correlation of possible factor scores     0.50
residuals(f1a)  #large positive residuals within the A and C clusters: one factor is not enough
##    A1    A2    A3    A4    A5    C1    C2    C3    C4    C5   
## A1  0.93                                                      
## A2 -0.21  0.73                                                
## A3 -0.13  0.21  0.71                                          
## A4 -0.02  0.09  0.10  0.77                                    
## A5 -0.06  0.14  0.24  0.07  0.76                              
## C1  0.13 -0.12 -0.12 -0.11 -0.08  0.83                        
## C2  0.14 -0.12 -0.12 -0.01 -0.13  0.22  0.75                  
## C3  0.10 -0.05 -0.12 -0.09 -0.10  0.12  0.12  0.78            
## C4  0.00  0.13  0.16  0.10  0.13 -0.13 -0.12 -0.09  0.73      
## C5 -0.08  0.14  0.12  0.00  0.08 -0.04 -0.04 -0.10  0.21  0.74

Extract another factor

(The algebra is the same: \(R \approx F F' + U^2\))

f2 <- fa(bfi[1:10],nfactors=2)  #ask for two factors
f2  #a clean two-factor (Agreeableness, Conscientiousness) structure
## Factor Analysis using method =  minres
## Call: fa(r = bfi[1:10], nfactors = 2)
## Standardized loadings (pattern matrix) based upon correlation matrix
##      MR1   MR2   h2   u2 com
## A1  0.08 -0.41 0.15 0.85 1.1
## A2  0.01  0.68 0.46 0.54 1.0
## A3 -0.03  0.76 0.56 0.44 1.0
## A4  0.14  0.44 0.25 0.75 1.2
## A5  0.03  0.60 0.37 0.63 1.0
## C1  0.57 -0.06 0.31 0.69 1.0
## C2  0.64 -0.01 0.40 0.60 1.0
## C3  0.54  0.03 0.31 0.69 1.0
## C4 -0.65  0.00 0.42 0.58 1.0
## C5 -0.56 -0.06 0.34 0.66 1.0
## 
##                        MR1  MR2
## SS loadings           1.80 1.78
## Proportion Var        0.18 0.18
## Cumulative Var        0.18 0.36
## Proportion Explained  0.50 0.50
## Cumulative Proportion 0.50 1.00
## 
##  With factor correlations of 
##      MR1  MR2
## MR1 1.00 0.32
## MR2 0.32 1.00
## 
## Mean item complexity =  1
## Test of the hypothesis that 2 factors are sufficient.
## 
## The degrees of freedom for the null model are  45  and the objective function was  2.03 with Chi Square of  5664.89
## The degrees of freedom for the model are 26  and the objective function was  0.17 
## 
## The root mean square of the residuals (RMSR) is  0.04 
## The df corrected root mean square of the residuals is  0.05 
## 
## The harmonic number of observations is  2762 with the empirical chi square  397.07  with prob <  5e-68 
## The total number of observations was  2800  with Likelihood Chi Square =  468.37  with prob <  1.2e-82 
## 
## Tucker Lewis Index of factoring reliability =  0.864
## RMSEA index =  0.078  and the 90 % confidence intervals are  0.072 0.084
## BIC =  261.99
## Fit based upon off diagonal values = 0.98
## Measures of factor score adequacy             
##                                                    MR1  MR2
## Correlation of (regression) scores with factors   0.86 0.88
## Multiple R square of scores with factors          0.74 0.77
## Minimum correlation of possible factor scores     0.49 0.54
resid(f2)   #these are much smaller
##    A1    A2    A3    A4    A5    C1    C2    C3    C4    C5   
## A1  0.85                                                      
## A2 -0.08  0.54                                                
## A3  0.02 -0.02  0.44                                          
## A4  0.03  0.00  0.00  0.75                                    
## A5  0.05 -0.03  0.05  0.01  0.63                              
## C1  0.03  0.01  0.02 -0.04  0.03  0.69                        
## C2  0.04  0.00  0.01  0.06 -0.02  0.08  0.60                  
## C3  0.02  0.05 -0.01 -0.04  0.00  0.01  0.01  0.69            
## C4  0.10  0.00  0.02  0.04  0.02  0.02  0.03  0.02  0.58      
## C5  0.00  0.04  0.01 -0.05 -0.01  0.07  0.07 -0.02  0.10  0.66
fa.lookup(f2,dictionary=bfi.dictionary[1:2]) #note that we sort the loadings
##      MR1   MR2  com   h2 ItemLabel                                      Item
## C4 -0.65  0.00 1.00 0.42     q_626           Do things in a half-way manner.
## C2  0.64 -0.01 1.00 0.40     q_530     Continue until everything is perfect.
## C1  0.57 -0.06 1.02 0.31     q_124                   Am exacting in my work.
## C5 -0.56 -0.06 1.02 0.34    q_1949                            Waste my time.
## C3  0.54  0.03 1.01 0.31     q_619            Do things according to a plan.
## A3 -0.03  0.76 1.00 0.56    q_1206               Know how to comfort others.
## A2  0.01  0.68 1.00 0.46    q_1162         Inquire about others' well-being.
## A5  0.03  0.60 1.00 0.37    q_1419                 Make people feel at ease.
## A4  0.14  0.44 1.22 0.25    q_1364                            Love children.
## A1  0.08 -0.41 1.08 0.15     q_146 Am indifferent to the feelings of others.

Lets try a different data set

The msq represents 75 mood and emotion items. But we will take just 12 of them

affect <- c("active", "energetic",  "wakeful","sleepy", "tired", "drowsy",
 "jittery", "fearful", "tense",
       "placid", "calm", "at.rest")
f2 <- fa(msq[affect],2)
f2   #but "Alabama need not come first, don't show in alpha order"
## Factor Analysis using method =  minres
## Call: fa(r = msq[affect], nfactors = 2)
## Standardized loadings (pattern matrix) based upon correlation matrix
##             MR1   MR2   h2   u2 com
## active    -0.64  0.07 0.42 0.58 1.0
## energetic -0.71  0.06 0.51 0.49 1.0
## wakeful   -0.74 -0.03 0.55 0.45 1.0
## sleepy     0.83  0.03 0.69 0.31 1.0
## tired      0.82  0.03 0.68 0.32 1.0
## drowsy     0.81  0.01 0.66 0.34 1.0
## jittery   -0.23  0.54 0.34 0.66 1.4
## fearful    0.05  0.35 0.12 0.88 1.1
## tense     -0.01  0.62 0.38 0.62 1.0
## placid     0.18 -0.37 0.17 0.83 1.4
## calm      -0.04 -0.64 0.42 0.58 1.0
## at.rest   -0.21 -0.55 0.35 0.65 1.3
## 
##                        MR1  MR2
## SS loadings           3.62 1.65
## Proportion Var        0.30 0.14
## Cumulative Var        0.30 0.44
## Proportion Explained  0.69 0.31
## Cumulative Proportion 0.69 1.00
## 
##  With factor correlations of 
##       MR1   MR2
## MR1  1.00 -0.01
## MR2 -0.01  1.00
## 
## Mean item complexity =  1.1
## Test of the hypothesis that 2 factors are sufficient.
## 
## The degrees of freedom for the null model are  66  and the objective function was  5.55 with Chi Square of  21580.28
## The degrees of freedom for the model are 43  and the objective function was  1.49 
## 
## The root mean square of the residuals (RMSR) is  0.09 
## The df corrected root mean square of the residuals is  0.11 
## 
## The harmonic number of observations is  3864 with the empirical chi square  4284.87  with prob <  0 
## The total number of observations was  3896  with Likelihood Chi Square =  5780.7  with prob <  0 
## 
## Tucker Lewis Index of factoring reliability =  0.591
## RMSEA index =  0.185  and the 90 % confidence intervals are  0.181 0.189
## BIC =  5425.18
## Fit based upon off diagonal values = 0.92
## Measures of factor score adequacy             
##                                                    MR1  MR2
## Correlation of (regression) scores with factors   0.95 0.85
## Multiple R square of scores with factors          0.90 0.72
## Minimum correlation of possible factor scores     0.80 0.44
fa.sort(f2)  #sorted by size of loading
## Factor Analysis using method =  minres
## Call: fa(r = msq[affect], nfactors = 2)
## Standardized loadings (pattern matrix) based upon correlation matrix
##             MR1   MR2   h2   u2 com
## sleepy     0.83  0.03 0.69 0.31 1.0
## tired      0.82  0.03 0.68 0.32 1.0
## drowsy     0.81  0.01 0.66 0.34 1.0
## wakeful   -0.74 -0.03 0.55 0.45 1.0
## energetic -0.71  0.06 0.51 0.49 1.0
## active    -0.64  0.07 0.42 0.58 1.0
## calm      -0.04 -0.64 0.42 0.58 1.0
## tense     -0.01  0.62 0.38 0.62 1.0
## at.rest   -0.21 -0.55 0.35 0.65 1.3
## jittery   -0.23  0.54 0.34 0.66 1.4
## placid     0.18 -0.37 0.17 0.83 1.4
## fearful    0.05  0.35 0.12 0.88 1.1
## 
##                        MR1  MR2
## SS loadings           3.62 1.65
## Proportion Var        0.30 0.14
## Cumulative Var        0.30 0.44
## Proportion Explained  0.69 0.31
## Cumulative Proportion 0.69 1.00
## 
##  With factor correlations of 
##       MR1   MR2
## MR1  1.00 -0.01
## MR2 -0.01  1.00
## 
## Mean item complexity =  1.1
## Test of the hypothesis that 2 factors are sufficient.
## 
## The degrees of freedom for the null model are  66  and the objective function was  5.55 with Chi Square of  21580.28
## The degrees of freedom for the model are 43  and the objective function was  1.49 
## 
## The root mean square of the residuals (RMSR) is  0.09 
## The df corrected root mean square of the residuals is  0.11 
## 
## The harmonic number of observations is  3864 with the empirical chi square  4284.87  with prob <  0 
## The total number of observations was  3896  with Likelihood Chi Square =  5780.7  with prob <  0 
## 
## Tucker Lewis Index of factoring reliability =  0.591
## RMSEA index =  0.185  and the 90 % confidence intervals are  0.181 0.189
## BIC =  5425.18
## Fit based upon off diagonal values = 0.92
## Measures of factor score adequacy             
##                                                    MR1  MR2
## Correlation of (regression) scores with factors   0.95 0.85
## Multiple R square of scores with factors          0.90 0.72
## Minimum correlation of possible factor scores     0.80 0.44
# Plot the items in the two-factor loading space, labeling each point with
# the variable names held in 'affect'
fa.plot(f2,labels=affect)  # show the results graphically

How many factors are in the data?

Although there is no single right answer, there are many ways of estimating it. Parallel analysis compares the solution to random solutions; nfactors tries multiple solutions.

# Parallel analysis: compare the observed eigenvalues to those from random
# data.  Only the first 100 rows of msq are used here -- presumably to keep
# the demo fast; TODO confirm that was the intent.  The output below
# suggests 3 factors and 3 components for this subset.
fa.parallel(msq[1:100,affect])   #run a parallel analysis

## Parallel analysis suggests that the number of factors =  3  and the number of components =  3

Now try the nfactors approach

# Apply several number-of-factors criteria at once (VSS complexity 1 and 2,
# Velicer MAP, empirical BIC, sample-size-adjusted BIC -- see output below);
# note they do not all agree on a single answer
nfactors(msq[affect])

## 
## Number of factors
## Call: vss(x = x, n = n, rotate = rotate, diagonal = diagonal, fm = fm, 
##     n.obs = n.obs, plot = FALSE, title = title, use = use, cor = cor)
## VSS complexity 1 achieves a maximimum of 0.75  with  2  factors
## VSS complexity 2 achieves a maximimum of 0.89  with  5  factors
## The Velicer MAP achieves a minimum of 0.05  with  4  factors 
## Empirical BIC achieves a minimum of  -177.96  with  4  factors
## Sample Size adjusted BIC achieves a minimum of  -67.28  with  4  factors
## 
## Statistics by number of factors 
##    vss1 vss2   map dof   chisq     prob sqresid  fit  RMSEA  BIC SABIC complex  eChisq    SRMR
## 1  0.61 0.00 0.068  54 8.8e+03  0.0e+00    10.2 0.61 0.2042 8377  8549     1.0 1.3e+04 1.6e-01
## 2  0.75 0.80 0.069  43 5.8e+03  0.0e+00     5.3 0.80 0.1851 5425  5562     1.1 4.3e+03 9.2e-02
## 3  0.62 0.86 0.052  33 1.2e+03 1.3e-232     3.2 0.88 0.0956  935  1040     1.5 7.1e+02 3.7e-02
## 4  0.64 0.88 0.051  24 5.5e+01  3.2e-04     2.4 0.91 0.0182 -144   -67     1.4 2.0e+01 6.3e-03
## 5  0.64 0.89 0.079  16 2.6e+01  5.6e-02     1.8 0.93 0.0126 -106   -56     1.4 5.7e+00 3.3e-03
## 6  0.65 0.88 0.115   9 8.6e+00  4.7e-01     1.9 0.93 0.0000  -66   -37     1.5 1.9e+00 1.9e-03
## 7  0.64 0.87 0.214   3 3.4e+00  3.4e-01     2.1 0.92 0.0057  -21   -12     1.5 5.7e-01 1.1e-03
## 8  0.64 0.87 0.301  -2 1.6e+00       NA     1.5 0.94     NA   NA    NA     1.6 1.9e-01 6.1e-04
## 9  0.64 0.88 0.407  -6 3.3e-07       NA     2.1 0.92     NA   NA    NA     1.5 6.8e-08 3.6e-07
## 10 0.64 0.88 0.501  -9 1.3e-06       NA     2.1 0.92     NA   NA    NA     1.5 4.0e-07 8.9e-07
## 11 0.64 0.88 1.000 -11 2.4e-08       NA     2.1 0.92     NA   NA    NA     1.5 4.2e-09 9.0e-08
## 12 0.64 0.88    NA -12 2.4e-08       NA     2.1 0.92     NA   NA    NA     1.5 4.2e-09 9.0e-08
##     eCRMS  eBIC
## 1  0.1749 12419
## 2  0.1135  3962
## 3  0.0527   442
## 4  0.0105  -178
## 5  0.0068  -127
## 6  0.0053   -72
## 7  0.0049   -24
## 8      NA    NA
## 9      NA    NA
## 10     NA    NA
## 11     NA    NA
## 12     NA    NA

Categorical variables underestimate the correlation

The tetrachoric and polychoric correlation

Correlations are based upon the assumption of continuous x and y. But if the variables are categorical or, even worse, dichotomous, the correlation will underestimate the “true” correlation. Consider what happens when we dichotomize a bivariate normal:

# Draw an illustration of dichotomizing a bivariate normal, motivating the
# tetrachoric correlation discussed in the surrounding text
draw.tetra()

We can estimate the underlying “latent” correlation using the tetrachoric or polychoric functions. We can then do the analysis using these correlations.

# Refit the 2-factor model, this time on polychoric rather than Pearson
# correlations (argument names match the echoed Call: in the output below)
f2p <- fa(r = msq[affect], nfactors = 2, cor = "poly")
# Sort the loadings by size so the solution can be compared with the
# Pearson-based fit shown earlier
fa.sort(f2p)  #compare this to what we got before
## Factor Analysis using method =  minres
## Call: fa(r = msq[affect], nfactors = 2, cor = "poly")
## Standardized loadings (pattern matrix) based upon correlation matrix
##             MR1   MR2   h2   u2 com
## sleepy     0.87  0.05 0.76 0.24 1.0
## tired      0.86  0.05 0.74 0.26 1.0
## drowsy     0.86  0.03 0.73 0.27 1.0
## wakeful   -0.80  0.00 0.64 0.36 1.0
## energetic -0.79  0.10 0.63 0.37 1.0
## active    -0.69  0.11 0.50 0.50 1.1
## tense      0.01  0.79 0.63 0.37 1.0
## jittery   -0.24  0.65 0.50 0.50 1.3
## calm      -0.06 -0.62 0.38 0.62 1.0
## fearful    0.10  0.57 0.33 0.67 1.1
## at.rest   -0.24 -0.53 0.32 0.68 1.4
## placid     0.18 -0.32 0.14 0.86 1.6
## 
##                        MR1  MR2
## SS loadings           4.14 2.18
## Proportion Var        0.35 0.18
## Cumulative Var        0.35 0.53
## Proportion Explained  0.66 0.34
## Cumulative Proportion 0.66 1.00
## 
##  With factor correlations of 
##       MR1   MR2
## MR1  1.00 -0.04
## MR2 -0.04  1.00
## 
## Mean item complexity =  1.1
## Test of the hypothesis that 2 factors are sufficient.
## 
## The degrees of freedom for the null model are  66  and the objective function was  8.04 with Chi Square of  31295.72
## The degrees of freedom for the model are 43  and the objective function was  2.34 
## 
## The root mean square of the residuals (RMSR) is  0.1 
## The df corrected root mean square of the residuals is  0.13 
## 
## The harmonic number of observations is  3864 with the empirical chi square  5394.07  with prob <  0 
## The total number of observations was  3896  with Likelihood Chi Square =  9118.18  with prob <  0 
## 
## Tucker Lewis Index of factoring reliability =  0.554
## RMSEA index =  0.233  and the 90 % confidence intervals are  0.229 0.237
## BIC =  8762.67
## Fit based upon off diagonal values = 0.93
## Measures of factor score adequacy             
##                                                    MR1  MR2
## Correlation of (regression) scores with factors   0.96 0.90
## Multiple R square of scores with factors          0.93 0.81
## Minimum correlation of possible factor scores     0.86 0.62
# Factor congruence coefficients between the polychoric (f2p) and Pearson
# (f2) solutions -- the near-1 diagonal in the output below indicates the
# same two factors were recovered by both analyses
fa.congruence(f2p,f2)   #factor congruence coefficients
##       MR1  MR2
## MR1  1.00 0.00
## MR2 -0.03 0.98