This page is out of date.
To get the most recent version of the psych package, use the installer in R. The functions listed below were written before the psych package was released.
For the adventurous, the development version of the psych package may be found at http://personality-project.org/r/src. But it is better to just install the released version from CRAN.
Useful snippits of R code
A collection of useful functions to do psychometrics and data analysis. To load all of these, use the source command:
source("http://personality-project.org/r/useful.r") #the procedures listed below source("http://personality-project.org/r/vss.r") #the Very Simple Structure package
- alpha.scale(scale,item.df) (find coefficient alpha from a scale vector and dataframe of the items)
- describe(x,digits=2) (give means, sd, n, se, and skew for a data frame)
- summ.stats (summary statistics for a data frame broken down by a categorical variable) -- also can use by(x,y,describe)
- error.crosses (error bars in two space)
- skew (x)
- pairs.panels(x) (fancy correlations splom below diagonal, correlations above)
- multi.hist(x) (plot multiple histograms for a data frame)
- correct.cor (rmatrix,reliabilities) (given a correlation matrix and a vector of reliabilities, put the reliabilities on the diagonal and report unattenuated correlations above the diagonal.
- paired.r(r12,r13,r23,n) test for the difference between two correlated correlations (returns t value)
- fisherz(r) convert Pearson r to fisher z
- VSS (and VSS.plot and VSS.simulate) found in vss.r
Alpha
Find Cronbach's coefficient alpha for a scale formed from a data.frame of items.#define a function to calculate coefficient alpha alpha.scale=function (x,y) #find coefficient alpha given a scale and a data.frame of the items in the scale { n=length(y) #number of variables Vi=sum(diag(var(y,na.rm=TRUE))) #sum of item variance Vt=var(x,na.rm=TRUE) #total test variance ((Vt-Vi)/Vt)*(n/(n-1))} #alpha
describe
Report basic descriptive statistics (mean, median, standard deviation, skew, and standard error) for each variable in a data frame. When combined with the by function, this can also describe data by groups. Defaults to 2 decimals places.#general descriptive statistics describe <- function (x, digits = 2,na.rm=TRUE) #basic stats after dropping non-numeric data { #first, define a local function valid <- function(x) { return(sum(!is.na(x))) } if (is.vector(x) ) #do it for vectors or { stats = matrix(rep(NA,6),ncol=6) #create a temporary array stats[1, 1] <- mean(x, na.rm=na.rm ) stats[1, 2] <- median(x,na.rm=na.rm ) stats[1, 3] <- min(x, na.rm=na.rm ) stats[1, 4] <- max(x, na.rm=na.rm ) stats[1, 5] <- skew(x,na.rm=na.rm ) stats[1, 6] <- valid(x ) len <- 1 } else {len = dim(x)[2] #do it for matrices or data.frames stats = matrix(rep(NA,len*6),ncol=6) #create a temporary array for (i in 1:len) { if (is.numeric(x[,i])) { #just do this for numeric data stats[i, 1] <- mean(x[,i], na.rm=na.rm ) stats[i, 2] <- median(x[,i],na.rm=na.rm ) stats[i, 3] <- min(x[,i], na.rm=na.rm ) stats[i, 4] <- max(x[,i], na.rm=na.rm ) stats[i, 5] <- skew(x[,i],na.rm=na.rm ) stats[i, 6] <- valid(x[,i] ) } } } temp <- data.frame(n = stats[,6],mean = stats[,1], sd = sd(x,na.rm=TRUE), median = stats[, 2],min= stats[,3],max=stats[,4], range=stats[,4]-stats[,3],skew = stats[, 5]) answer <- round(data.frame(V=seq(1:len),temp, se = temp$sd/sqrt(temp$n)), digits) return(answer) }
summ.stats
# find basic summary statistics by groups #adapted from "Kickstarting R" summ.stats <- function (x,y) { #data are x, grouping variable is y means <- tapply(x, y, mean, na.rm=TRUE) sds <- tapply(x,y, sd, na.rm = TRUE) valid <- function (x) {return(sum(!is.na(x)) )} ns <- tapply(x,y, valid ) se=sds/sqrt(ns) answer <-data.frame(mean=means,sd=sds,se=se,n=ns) }
error.crosses
#error bars in a two dimensional plot error.crosses <-function (x,y,z) # x and y are data frame with z points {for (i in 1:z) {xcen <- x$mean[i] ycen <- y$mean[i] xse <- x$se[i] yse <- y$se[i] arrows(xcen-xse,ycen,xcen+xse,ycen,length=.2, angle = 90, code=3,col = par("fg"), lty = NULL, lwd = par("lwd"), xpd = NULL) arrows(xcen,ycen-yse,xcen,ycen+yse,length=.2, angle = 90, code=3,col = par("fg"), lty = NULL, lwd = par("lwd"), xpd = NULL) } } #examples # plot(mPA,mNA,pch=symb[condit],cex=4.5,col=colors[condit],bg=colors[condit],main="Postive vs. Negative Affect",xlim=c(-1,1.5),ylim=c(-1,1.5),xlab="Positive Affect",ylab="Negative Affect") #error.crosses(paf.stats,naf.stats,4) #put in x and y error bars!skew
skew= function (x, na.rm = FALSE) { if (na.rm) x <- x[!is.na(x)] #remove missing values sum((x - mean(x))^3)/(length(x) * sd(x)^3) #calculate skew }
fancy correlation plots
Put scatter plots below diagonal, histograms on the diagonal, correlations above the diagonal. The scatter plots can be best fitted with a loess smooth (smooth=TRUE is the default), and the correlations can be scaled (scale=TRUE) by the size of the correlation.
These functions are adapted (stolen?) from the help.cor examples.
#some fancy graphics -- adapted from help.cor panel.cor.scale <- function(x, y, digits=2, prefix="", cex.cor) { usr <- par("usr"); on.exit(par(usr)) par(usr = c(0, 1, 0, 1)) r = (cor(x, y,use="pairwise")) txt <- format(c(r, 0.123456789), digits=digits)[1] txt <- paste(prefix, txt, sep="") if(missing(cex.cor)) cex <- 0.8/strwidth(txt) text(0.5, 0.5, txt, cex = cex * abs(r)) } panel.cor <- function(x, y, digits=2, prefix="", cex.cor) { usr <- par("usr"); on.exit(par(usr)) par(usr = c(0, 1, 0, 1)) r = (cor(x, y,use="pairwise")) txt <- format(c(r, 0.123456789), digits=digits)[1] txt <- paste(prefix, txt, sep="") if(missing(cex.cor)) cex <- 0.8/strwidth(txt) text(0.5, 0.5, txt, cex = cex ) } panel.hist <- function(x, ...) { usr <- par("usr"); on.exit(par(usr)) par(usr = c(usr[1:2], 0, 1.5) ) h <- hist(x, plot = FALSE) breaks <- h$breaks; nB <- length(breaks) y <- h$counts; y <- y/max(y) rect(breaks[-nB], 0, breaks[-1], y, col="cyan", ...) } pairs.panels <- function (x,y,smooth=TRUE,scale=FALSE) {if (smooth ){ if (scale) { pairs(x,diag.panel=panel.hist,upper.panel=panel.cor.scale,lower.panel=panel.smooth) } else {pairs(x,diag.panel=panel.hist,upper.panel=panel.cor,lower.panel=panel.smooth) } #else {pairs(x,diag.panel=panel.hist,upper.panel=panel.cor,lower.panel=panel.smooth) } else #smooth is not true { if (scale) {pairs(x,diag.panel=panel.hist,upper.panel=panel.cor.scale) } else {pairs(x,diag.panel=panel.hist,upper.panel=panel.cor) } } #end of else (smooth) } #end of function
multi.hist
multi.hist <- function(x) {nvar <- dim(x)[2] #number of variables nsize=trunc(sqrt(nvar))+1 #size of graphic old.par <- par(no.readonly = TRUE) # all par settings which can be changed par(mfrow=c(nsize,nsize)) #set new graphic parameters for (i in 1:nvar) { name=names(x)[i] #get the names for the variables hist(x[,i],main=name,xlab=name) } #draw the histograms for each variable on.exit(par(old.par)) #set the graphic parameters back to the original }
correct.cor
#function to replace upper triangle of matrix with unattenuated correlations, and the diagonal with reliabilities #input is a correlation matrix and a vector of reliabilities correct.cor <- function(x,y) { n=dim(x)[1] { x[1,1] <- y[1,1] for (i in 2:n) { x[i,i] <- y[1,i] #put reliabilities on the diagonal k=i-1 for (j in 1:k) { x[j,i] <- x[j,i]/sqrt(y[1,i]*y[1,j]) } #fix the upper triangular part of the matrix } return(x) }}
paired.r
#difference of dependent (paired) correlations (following Steiger,J., 1980, Tests for comparing elements of a correlation matrix. Psychological Bulletin, 87, 245-251) paired.r <- function(xy,xz,yz,n) { diff <- xy-xz determin=1-xy*xy - xz*xz - yz*yz + 2*xy*xz*yz av=(xy+xz)/2 cube= (1-yz)*(1-yz)*(1-yz) t2 = diff * sqrt((n-1)*(1+yz)/(((2*(n-1)/(n-3))*determin+av*av*cube))) return(t2) }
fisherz
fisherz <- function(rho) {0.5*log((1+rho)/(1-rho)) } #converts r to z#quick demonstration of a random walk process #to show range of variability over trials and replications #the "confidence lines" represent 2 sd of theoretical sum random.walk <- function(length=200,n=20,effect=0) { num <- seq(1:length) colors <- rainbow(n) #select colors x <- cumsum(rnorm(length,effect))/num plot(x,ylim = c(-2.5,2.5),typ="b",col=colors[1],main="Sample means as function of sample size",ylab="sample mean",xlab="sample size") for (i in 2:n) { x <- cumsum(rnorm(length,effect))/num #find the next line points(x,col=colors[i],typ="b") #draw it } curve(2/sqrt(x),1,length,101,add=TRUE) #upper confidence region curve(-2/sqrt(x),1,length,101,add=TRUE) #lower confidence region text(length/2,2,"effect size is " ) text(length/2, 1.8,eval(effect)) }
part of a short guide to R
Version of December 31, 2004
William Revelle
Department of Psychology
Northwestern University