Structural Equation Modeling in R

Structural equation models combine measurement models (e.g., reliability) with structural models (e.g., regression). The sem package, developed by John Fox, allows for some basic structural equation models. To use it, add the sem package by using the package manager.

Structural Equation Modeling may be thought of as regression corrected for attentuation. The sem package developed by John Fox uses the RAM path notation of Jack McCardle and is fairly straightforward. The examples in the package are quite straightforward. A text book, such as John Loehlin's Latent Variable Models (4th Edition) is helpful in understanding the algorithm.

For much more detail on using R to do structural equation modeling, see the course notes for sem (primarily using R) available at the syllabus for my sem course. Also see John Fox's notes that he has prepared as a brief description of SEM techniques as an appendix to his statistics text.

```#Loehlin problem 2.5
obs.var2.5 = c('Ach1',  'Ach2',  'Amb1',  'Amb2',  'Amb3')
R.prob2.5 = matrix(c(
1.00 ,  .60  , .30,  .20,   .20,
.60,  1.00,   .20,   .30,   .10,
.30,   .20,  1.00,   .70,   .60 ,
.20,   .30,   .70,  1.00,   .50,
.20,   .10,   .60,  .50,  1.00), ncol=5,byrow=TRUE)

#correlated factors structure (ambition <-> Achievement)
model2.5=matrix(c(
'Ambit ->  Amb1',      'a', NA,
'Ambit -> Amb2' ,      'b', NA,
'Ambit -> Amb3' ,      'c', NA,
'Achieve -> Ach1',     'd', NA,
'Achieve -> Ach2',     'e', NA,
'Ambit <-> Achieve',   'f', NA,
'Amb1 <-> Amb1' ,      'u', NA,
'Amb2 <-> Amb2' ,      'v', NA,
'Amb3 <-> Amb3' ,      'w', NA,
'Ach1 <-> Ach1' ,      'x', NA,
'Ach2 <-> Ach2' ,      'y', NA,
'Achieve <-> Achieve',  NA, 1,
'Ambit <-> Ambit',      NA, 1),
ncol=3, byrow=TRUE)
sem2.5= sem(model2.5,R.prob2.5,60, obs.var2.5)
summary(sem2.5,digits=3)

Model Chisquare =  9.74   Df =  4 Pr(>Chisq) = 0.0450
Goodness-of-fit index =  0.964
RMSEA index =  0.120   90 % CI: (0.0164, 0.219)
BIC =  -15.1

Normalized Residuals
Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
-5.77e-01 -3.78e-02 -2.04e-06  4.85e-03  3.87e-05  1.13e+00

Parameter Estimates
Estimate Std Error z value Pr(>|z|)
a    0.920    0.0924   9.966 0.00e+00    Amb1 <--- Ambit
b    0.761    0.0955   7.974 1.55e-15    Amb2 <--- Ambit
c    0.652    0.0965   6.753 1.45e-11    Amb3 <--- Ambit
d    0.879    0.1762   4.986 6.16e-07  Ach1 <--- Achieve
e    0.683    0.1509   4.525 6.03e-06  Ach2 <--- Achieve
f    0.356    0.1138   3.127 1.76e-03 Achieve <--> Ambit
u    0.153    0.0982   1.557 1.20e-01     Amb1 <--> Amb1
v    0.420    0.0898   4.679 2.88e-06     Amb2 <--> Amb2
w    0.575    0.0949   6.061 1.35e-09     Amb3 <--> Amb3
x    0.228    0.2791   0.816 4.15e-01     Ach1 <--> Ach1
y    0.534    0.1837   2.905 3.67e-03     Ach2 <--> Ach2

Iterations =  26
>  #causal structure with errors in achievement
>   #ambition -> achievement
>
>  model2.51=matrix(c(
+ 	'Ambit ->  Amb1',      'a', NA,
+ 	'Ambit -> Amb2' ,      'b', NA,
+ 	'Ambit -> Amb3' ,      'c', NA,
+ 	'Achieve -> Ach1',     'd', NA,
+ 	'Achieve -> Ach2',     'e', NA,
+ 	'Ambit -> Achieve',   'f', NA,
+ 	'Amb1 <-> Amb1' ,      'u', NA,
+ 	'Amb2 <-> Amb2' ,      'v', NA,
+ 	'Amb3 <-> Amb3' ,      'w', NA,
+ 	'Ach1 <-> Ach1' ,      'x', NA,
+ 	'Ach2 <-> Ach2' ,      'y', NA,
+ 	'Achieve <-> Achieve',  NA, 1,
+ 	'Ambit <-> Ambit',      NA, 1),
+ 	ncol=3, byrow=TRUE)

>
>  sem2.51= sem(model2.51,R.prob2.5,100, obs.var2.5)
>  summary(sem2.51,digits=3)

Model Chisquare =  9.74   Df =  4 Pr(>Chisq) = 0.0450
Goodness-of-fit index =  0.964
RMSEA index =  0.120   90 % CI: (0.0164, 0.219)
BIC =  -15.1

Normalized Residuals
Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
-5.77e-01 -3.78e-02 -6.33e-06  4.85e-03  3.06e-05  1.13e+00

Parameter Estimates
Estimate Std Error z value Pr(>|z|)
a    0.920    0.0924   9.966 0.00e+00    Amb1 <--- Ambit
b    0.761    0.0955   7.974 1.55e-15    Amb2 <--- Ambit
c    0.652    0.0965   6.753 1.45e-11    Amb3 <--- Ambit
d    0.821    0.1781   4.613 3.98e-06  Ach1 <--- Achieve
e    0.638    0.1343   4.752 2.01e-06  Ach2 <--- Achieve
f    0.381    0.1394   2.731 6.32e-03 Achieve <--- Ambit
u    0.153    0.0982   1.557 1.20e-01     Amb1 <--> Amb1
v    0.420    0.0898   4.679 2.88e-06     Amb2 <--> Amb2
w    0.575    0.0949   6.061 1.35e-09     Amb3 <--> Amb3
x    0.228    0.2791   0.816 4.15e-01     Ach1 <--> Ach1
y    0.534    0.1837   2.906 3.67e-03     Ach2 <--> Ach2

Iterations =  27
> #causal structure with errors in achievement
>   #ambition -> achievement
>
>  model2.52=matrix(c(
+ 	'Ambit ->  Amb1',      'a', NA,
+ 	'Ambit -> Amb2' ,      'b', NA,
+ 	'Ambit -> Amb3' ,      'c', NA,
+ 	'Achieve -> Ach1',     'd', NA,
+ 	'Achieve -> Ach2',     'e', NA,
+ 	'Ambit -> Achieve',   'f', NA,
+ 	'Amb1 <-> Amb1' ,      'u', NA,
+ 	'Amb2 <-> Amb2' ,      'v', NA,
+ 	'Amb3 <-> Amb3' ,      'w', NA,
+ 	'Ach1 <-> Ach1' ,      'x', NA,
+ 	'Ach2 <-> Ach2' ,      'y', NA,
+ 	'Achieve <-> Achieve', 'z', NA,
+ 	'Ambit <-> Ambit',      NA, 1),
+ 	ncol=3, byrow=TRUE)
>
>  sem2.52= sem(model2.52,R.prob2.5,100, obs.var2.5)
Warning message:
Negative parameter variances.
Model is probably underidentified.
in: sem.default(ram = ram, S = S, N = N, param.names = pars, var.names = vars,
>  summary(sem2.52,digits=3)

Model Chisquare =  9.74   Df =  3 Pr(>Chisq) = 0.0209
Goodness-of-fit index =  0.964
RMSEA index =  0.151   90 % CI: (0.0517, 0.261)
BIC =  -8.9

Normalized Residuals
Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
-5.77e-01 -3.78e-02 -1.62e-06  4.85e-03  3.43e-05  1.13e+00

Parameter Estimates
Estimate Std Error z value Pr(>|z|)
a    0.920    0.0924   9.966 0.00e+00      Amb1 <--- Ambit
b    0.761    0.0955   7.974 1.55e-15      Amb2 <--- Ambit
c    0.652    0.0965   6.753 1.45e-11      Amb3 <--- Ambit
d    0.905       NaN     NaN      NaN    Ach1 <--- Achieve
e    0.703       NaN     NaN      NaN    Ach2 <--- Achieve
f    0.346       NaN     NaN      NaN   Achieve <--- Ambit
u    0.153    0.0982   1.557 1.20e-01       Amb1 <--> Amb1
v    0.420    0.0898   4.679 2.88e-06       Amb2 <--> Amb2
w    0.575    0.0949   6.061 1.35e-09       Amb3 <--> Amb3
x    0.228    0.2791   0.816 4.15e-01       Ach1 <--> Ach1
y    0.534    0.1837   2.905 3.67e-03       Ach2 <--> Ach2
z    0.824       NaN     NaN      NaN Achieve <--> Achieve

Iterations =  27

Aliased parameters: d e f z
Warning message:
NaNs produced in: sqrt(diag(object\$cov))
>   #causal structure with errors in achievement
>   #ambition -> achievement
>   #fix achievement to Loehlin answer
>
>  model2.53=matrix(c(
+ 	'Ambit ->  Amb1',      'a', NA,
+ 	'Ambit -> Amb2' ,      'b', NA,
+ 	'Ambit -> Amb3' ,      'c', NA,
+ 	'Achieve -> Ach1',     'd', NA,
+ 	'Achieve -> Ach2',     'e', NA,
+ 	'Ambit -> Achieve',   'f', NA,
+ 	'Amb1 <-> Amb1' ,      'u', NA,
+ 	'Amb2 <-> Amb2' ,      'v', NA,
+ 	'Amb3 <-> Amb3' ,      'w', NA,
+ 	'Ach1 <-> Ach1' ,      'x', NA,
+ 	'Ach2 <-> Ach2' ,      'y', NA,
+ 	'Achieve <-> Achieve', NA, .873,
+ 	'Ambit <-> Ambit',      NA, 1),
+ 	ncol=3, byrow=TRUE)
>
>  sem2.53= sem(model2.53,R.prob2.5,100, obs.var2.5)
>  summary(sem2.53,digits=3)\
Error: syntax error
> summary(sem2.53,digits=3)

Model Chisquare =  9.74   Df =  4 Pr(>Chisq) = 0.0450
Goodness-of-fit index =  0.964
RMSEA index =  0.120   90 % CI: (0.0164, 0.219)
BIC =  -15.1

Normalized Residuals
Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
-5.77e-01 -3.78e-02 -1.14e-06  4.85e-03  3.19e-05  1.13e+00

Parameter Estimates
Estimate Std Error z value Pr(>|z|)
a    0.920    0.0924   9.966 0.00e+00    Amb1 <--- Ambit
b    0.761    0.0955   7.974 1.55e-15    Amb2 <--- Ambit
c    0.652    0.0965   6.753 1.45e-11    Amb3 <--- Ambit
d    0.879    0.1906   4.612 3.98e-06  Ach1 <--- Achieve
e    0.683    0.1437   4.752 2.01e-06  Ach2 <--- Achieve
f    0.356    0.1303   2.731 6.32e-03 Achieve <--- Ambit
u    0.153    0.0982   1.557 1.20e-01     Amb1 <--> Amb1
v    0.420    0.0898   4.679 2.88e-06     Amb2 <--> Amb2
w    0.575    0.0949   6.061 1.35e-09     Amb3 <--> Amb3
x    0.228    0.2791   0.816 4.15e-01     Ach1 <--> Ach1
y    0.534    0.1837   2.906 3.67e-03     Ach2 <--> Ach2

Iterations =  27
>

ÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄ

#Loehlin problem from table 2-12
#Note that version a is a classic example of congeneric measurement.
#Alternatively, this could be thought of as underidentified higher order model

obs.var2.12 = c('a',  'b',  'c',  'd')
R.prob2.12 = matrix(c(
1.00 ,  .30,    .20,   .10,
.30,  1.00,   .20,   .20,
.20,   .20,  1.00,   .30,
.10,   .20,   .30,  1.00),
ncol=4,byrow=TRUE)

model2.12a=matrix(c(
'g ->  a',      'a1', NA,
'g -> b' ,      'b1', NA,
'g -> c' ,      'c1', NA,
'g -> d',     'd1', NA,
'a <-> a',      'e1', NA,
'b <-> b',      'e2', NA,
'c <-> c',      'e3', NA,
'd <-> d',      'e4', NA,
'g <-> g',       NA, 1),
ncol=3, byrow=TRUE)
sem2.12a= sem(model2.12a,R.prob2.12,120, obs.var2.12)
summary(sem2.12a,digits=3)

#a 1 degree of freedom model
model2.12b=matrix(c(
'g ->  a',      'a1', NA,
'g -> b' ,      'b1', NA,
'f -> c' ,      'c1', NA,
'f -> d',     'd1', NA,
'a <-> a',      'e1', NA,
'b <-> b',      'e2', NA,
'c <-> c',      'e3', NA,
'd <-> d',      'e4', NA,
'g <-> g',       NA, 1,
'f <-> f',       NA, 1,
'g <-> f',        'fg', NA),
ncol=3, byrow=TRUE)
sem2.12b= sem(model2.12b,R.prob2.12,120, obs.var2.12)
summary(sem2.12b,digits=3)

#the following higher level model has 0 degrees of freedom
model2.12c=matrix(c(
'g ->  a',      'a1', NA,
'g -> b' ,      'b1', NA,
'f -> c' ,      'c1', NA,
'f -> d',       'd1', NA,
'a <-> a',      'e1', NA,
'b <-> b',      'e2', NA,
'c <-> c',      'e3', NA,
'd <-> d',      'e4', NA,
'g <-> g',       NA, 1,
'f <-> f',       NA, 1,
'h -> g',        'hg', NA,
'h -> f',        NA,1,
'h <-> h',        NA,1),
ncol=3, byrow=TRUE)
sem2.12c= sem(model2.12c,R.prob2.12,120, obs.var2.12)
summary(sem2.12c,digits=3)

ÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄ

#Loehlin problem 2.9

obs.var2.09 = c('w',  'x',  'y',  'z')
R.prob2.09 = matrix(c(
1.00 ,  .40,    .50,   .30,
.40,  1.00,   .55,   .35,
.50,   .55,  1.00,   .40,
.30,   .35,   .40,  1.00),
ncol=4,byrow=TRUE)

model2.09=matrix(c(
'g ->  w',      'a1', NA,
'g -> x' ,      'b1', NA,
'g -> y' ,      'c1', NA,
'g -> z',     'd1', NA,
'w <-> w',      'e1', NA,
'x <-> x',      'e2', NA,
'y <-> y',      'e3', NA,
'z <-> z',      'e4', NA,
'g <-> g',       NA, 1),
ncol=3, byrow=TRUE)
sem2.09= sem(model2.09,R.prob2.09,500, obs.var2.09)
summary(sem2.09,digits=3)

obs.var2.09b = c('w',  'x',  'y',  'z')

R.prob2.09b = matrix(c(
1.00 ,  .40,    .50,   .30,
.40,  1.00,   .55,   .35,
.50,   .55,  1.00,   .40,
.30,   .35,   .40,  1.00),
ncol=4,byrow=TRUE)

model2.09b=matrix(c(
'g ->  w',      NA,1,
'g -> x' ,      'b1', NA,
'g -> y' ,      'c1', NA,
'g -> z',     'd1', NA,
'w <-> w',      'e1', NA,
'x <-> x',      'e2', NA,
'y <-> y',      'e3', NA,
'z <-> z',      'e4', NA,
'g <-> g',       'e',NA),
ncol=3, byrow=TRUE)

sem2.09b= sem(model2.09b,R.prob2.09b,500, obs.var2.09b)
summary(sem2.09b,digits=3)

```
part of a short guide to R
Version of February 15, 2007
William Revelle Department of Psychology
Northwestern University