A collection of useful functions to do psychometrics and data analysis. To load all of these, use the source command:
source("http://personality-project.org/r/useful.r") #the procedures listed below
source("http://personality-project.org/r/vss.r") #the Very Simple Structure package
#define a function to calculate coefficient alpha
alpha.scale=function (x,y) #find coefficient alpha given a scale and a data.frame of the items in the scale
{
n=length(y) #number of variables
Vi=sum(diag(var(y,na.rm=TRUE))) #sum of item variance
Vt=var(x,na.rm=TRUE) #total test variance
((Vt-Vi)/Vt)*(n/(n-1))} #alpha
#general descriptive statistics
describe <- function (x, digits = 2,na.rm=TRUE) #basic stats after dropping non-numeric data
{ #first, define a local function
valid <- function(x) {
return(sum(!is.na(x)))
}
if (is.vector(x) ) #do it for vectors or
{
stats = matrix(rep(NA,6),ncol=6) #create a temporary array
stats[1, 1] <- mean(x, na.rm=na.rm )
stats[1, 2] <- median(x,na.rm=na.rm )
stats[1, 3] <- min(x, na.rm=na.rm )
stats[1, 4] <- max(x, na.rm=na.rm )
stats[1, 5] <- skew(x,na.rm=na.rm )
stats[1, 6] <- valid(x )
len <- 1
}
else {len = dim(x)[2] #do it for matrices or data.frames
stats = matrix(rep(NA,len*6),ncol=6) #create a temporary array
for (i in 1:len) {
if (is.numeric(x[,i])) { #just do this for numeric data
stats[i, 1] <- mean(x[,i], na.rm=na.rm )
stats[i, 2] <- median(x[,i],na.rm=na.rm )
stats[i, 3] <- min(x[,i], na.rm=na.rm )
stats[i, 4] <- max(x[,i], na.rm=na.rm )
stats[i, 5] <- skew(x[,i],na.rm=na.rm )
stats[i, 6] <- valid(x[,i] )
}
}
}
temp <- data.frame(n = stats[,6],mean = stats[,1], sd = sd(x,na.rm=TRUE), median = stats[,
2],min= stats[,3],max=stats[,4], range=stats[,4]-stats[,3],skew = stats[, 5])
answer <- round(data.frame(V=seq(1:len),temp, se = temp$sd/sqrt(temp$n)), digits)
return(answer)
}
# find basic summary statistics by groups #adapted from "Kickstarting R"
summ.stats <- function (x,y) { #data are x, grouping variable is y
means <- tapply(x, y, mean, na.rm=TRUE)
sds <- tapply(x,y, sd, na.rm = TRUE)
valid <- function (x) {return(sum(!is.na(x)) )}
ns <- tapply(x,y, valid )
se=sds/sqrt(ns)
answer <-data.frame(mean=means,sd=sds,se=se,n=ns)
}
#error bars in a two dimensional plot
error.crosses <-function (x,y,z) # x and y are data frame with z points
{for (i in 1:z)
{xcen <- x$mean[i]
ycen <- y$mean[i]
xse <- x$se[i]
yse <- y$se[i]
arrows(xcen-xse,ycen,xcen+xse,ycen,length=.2, angle = 90, code=3,col = par("fg"), lty = NULL, lwd = par("lwd"), xpd = NULL)
arrows(xcen,ycen-yse,xcen,ycen+yse,length=.2, angle = 90, code=3,col = par("fg"), lty = NULL, lwd = par("lwd"), xpd = NULL)
}
}
#examples
# plot(mPA,mNA,pch=symb[condit],cex=4.5,col=colors[condit],bg=colors[condit],main="Postive vs. Negative Affect",xlim=c(-1,1.5),ylim=c(-1,1.5),xlab="Positive Affect",ylab="Negative Affect")
#error.crosses(paf.stats,naf.stats,4) #put in x and y error bars!
skew
skew= function (x, na.rm = FALSE)
{
if (na.rm) x <- x[!is.na(x)] #remove missing values
sum((x - mean(x))^3)/(length(x) * sd(x)^3) #calculate skew
}
Put scatter plots below diagonal, histograms on the diagonal, correlations above the diagonal. The scatter plots can be best fitted with a loess smooth (smooth=TRUE is the default), and the correlations can be scaled (scale=TRUE) by the size of the correlation.
These functions are adapted (stolen?) from the help.cor examples.
#some fancy graphics -- adapted from help.cor
panel.cor.scale <- function(x, y, digits=2, prefix="", cex.cor)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r = (cor(x, y,use="pairwise"))
txt <- format(c(r, 0.123456789), digits=digits)[1]
txt <- paste(prefix, txt, sep="")
if(missing(cex.cor)) cex <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex * abs(r))
}
panel.cor <- function(x, y, digits=2, prefix="", cex.cor)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r = (cor(x, y,use="pairwise"))
txt <- format(c(r, 0.123456789), digits=digits)[1]
txt <- paste(prefix, txt, sep="")
if(missing(cex.cor)) cex <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex )
}
panel.hist <- function(x, ...)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5) )
h <- hist(x, plot = FALSE)
breaks <- h$breaks; nB <- length(breaks)
y <- h$counts; y <- y/max(y)
rect(breaks[-nB], 0, breaks[-1], y, col="cyan", ...)
}
pairs.panels <- function (x,y,smooth=TRUE,scale=FALSE)
{if (smooth ){
if (scale) {
pairs(x,diag.panel=panel.hist,upper.panel=panel.cor.scale,lower.panel=panel.smooth)
}
else {pairs(x,diag.panel=panel.hist,upper.panel=panel.cor,lower.panel=panel.smooth)
} #else {pairs(x,diag.panel=panel.hist,upper.panel=panel.cor,lower.panel=panel.smooth)
}
else #smooth is not true
{ if (scale) {pairs(x,diag.panel=panel.hist,upper.panel=panel.cor.scale)
} else {pairs(x,diag.panel=panel.hist,upper.panel=panel.cor) }
} #end of else (smooth)
} #end of function
multi.hist <- function(x) {nvar <- dim(x)[2] #number of variables
nsize=trunc(sqrt(nvar))+1 #size of graphic
old.par <- par(no.readonly = TRUE) # all par settings which can be changed
par(mfrow=c(nsize,nsize)) #set new graphic parameters
for (i in 1:nvar) {
name=names(x)[i] #get the names for the variables
hist(x[,i],main=name,xlab=name) } #draw the histograms for each variable
on.exit(par(old.par)) #set the graphic parameters back to the original
}
#function to replace upper triangle of matrix with unattenuated correlations, and the diagonal with reliabilities
#input is a correlation matrix and a vector of reliabilities
correct.cor <- function(x,y) { n=dim(x)[1]
{ x[1,1] <- y[1,1]
for (i in 2:n) {
x[i,i] <- y[1,i] #put reliabilities on the diagonal
k=i-1
for (j in 1:k) {
x[j,i] <- x[j,i]/sqrt(y[1,i]*y[1,j]) } #fix the upper triangular part of the matrix
}
return(x) }}
#difference of dependent (paired) correlations (following Steiger,J., 1980, Tests for comparing elements of a correlation matrix. Psychological Bulletin, 87, 245-251)
paired.r <- function(xy,xz,yz,n) {
diff <- xy-xz
determin=1-xy*xy - xz*xz - yz*yz + 2*xy*xz*yz
av=(xy+xz)/2
cube= (1-yz)*(1-yz)*(1-yz)
t2 = diff * sqrt((n-1)*(1+yz)/(((2*(n-1)/(n-3))*determin+av*av*cube)))
return(t2)
}
fisherz <- function(rho) {0.5*log((1+rho)/(1-rho)) } #converts r to z
#quick demonstration of a random walk process
#to show range of variability over trials and replications
#the "confidence lines" represent 2 sd of theoretical sum
random.walk <- function(length=200,n=20,effect=0) {
num <- seq(1:length)
colors <- rainbow(n) #select colors
x <- cumsum(rnorm(length,effect))/num
plot(x,ylim = c(-2.5,2.5),typ="b",col=colors[1],main="Sample means as function of sample size",ylab="sample mean",xlab="sample size")
for (i in 2:n) {
x <- cumsum(rnorm(length,effect))/num #find the next line
points(x,col=colors[i],typ="b") #draw it
}
curve(2/sqrt(x),1,length,101,add=TRUE) #upper confidence region
curve(-2/sqrt(x),1,length,101,add=TRUE) #lower confidence region
text(length/2,2,"effect size is " )
text(length/2, 1.8,eval(effect))
}