This page is out of date.

To get the most recent version of the psych package, use the installer in R. The functions listed below were written before the psych package was released.

For the adventurous, the development version of the psych package may be found at http://personality-project.org/r/src. But it is better to just install the released version from CRAN.

Useful snippits of R code

A collection of useful functions to do psychometrics and data analysis. To load all of these, use the source command:

 

source("http://personality-project.org/r/useful.r")    #the procedures listed below
source("http://personality-project.org/r/vss.r")       #the Very Simple Structure package

alpha.scale(scale,item.df) (find coefficient alpha from a scale vector and dataframe of the items)
describe(x,digits=2) (give means, sd, n, se, and skew for a data frame)
summ.stats (summary statistics for a data frame broken down by a categorical variable) -- also can use by(x,y,describe)
error.crosses (error bars in two space)
skew (x)
pairs.panels(x) (fancy correlations splom below diagonal, correlations above)
multi.hist(x) (plot multiple histograms for a data frame)
correct.cor (rmatrix,reliabilities) (given a correlation matrix and a vector of reliabilities, put the reliabilities on the diagonal and report unattenuated correlations above the diagonal.
paired.r(r12,r13,r23,n) test for the difference between two correlated correlations (returns t value)
fisherz(r) convert Pearson r to fisher z
VSS (and VSS.plot and VSS.simulate) found in vss.r

Alpha

Find Cronbach's coefficient alpha for a scale formed from a data.frame of items.


#define a function to calculate coefficient alpha
alpha.scale=function (x,y)   #find coefficient alpha given a scale and a data.frame of the items in the scale
	{
		n=length(y)          #number of variables
		Vi=sum(diag(var(y,na.rm=TRUE)))     #sum of item variance
		Vt=var(x,na.rm=TRUE)                #total test variance
		((Vt-Vi)/Vt)*(n/(n-1))}              #alpha

describe

Report basic descriptive statistics (mean, median, standard deviation, skew, and standard error) for each variable in a data frame. When combined with the by function, this can also describe data by groups. Defaults to 2 decimals places.



#general descriptive statistics 
describe <- function (x, digits = 2,na.rm=TRUE)   #basic stats after dropping non-numeric data
{                                   #first, define a local function
    valid <- function(x) {      
        return(sum(!is.na(x)))
   		 }
    if (is.vector(x) )          #do it for vectors or 
    	{
    	    stats = matrix(rep(NA,6),ncol=6)    #create a temporary array
    
			stats[1, 1] <-  mean(x, na.rm=na.rm )
			stats[1, 2] <-  median(x,na.rm=na.rm  )
			stats[1, 3] <-  min(x, na.rm=na.rm )
			stats[1, 4] <-  max(x, na.rm=na.rm )
			stats[1, 5] <-  skew(x,na.rm=na.rm  )
			stats[1, 6] <-  valid(x )
        	len <- 1   
    	}
    	
    	
    else  {len = dim(x)[2]     #do it for matrices or data.frames
    
    stats = matrix(rep(NA,len*6),ncol=6)    #create a temporary array
    for (i in 1:len) {
    	if (is.numeric(x[,i])) {   #just do this for numeric data
			stats[i, 1] <-  mean(x[,i], na.rm=na.rm )
			stats[i, 2] <-  median(x[,i],na.rm=na.rm  )
			stats[i, 3] <-  min(x[,i], na.rm=na.rm )
			stats[i, 4] <-  max(x[,i], na.rm=na.rm )
			stats[i, 5] <-  skew(x[,i],na.rm=na.rm  )
			stats[i, 6] <-  valid(x[,i] )
        		}
    	}
    	}
   temp <-  data.frame(n = stats[,6],mean = stats[,1], sd = sd(x,na.rm=TRUE), median = stats[, 
        2],min= stats[,3],max=stats[,4], range=stats[,4]-stats[,3],skew = stats[, 5])
    answer <-  round(data.frame(V=seq(1:len),temp, se = temp$sd/sqrt(temp$n)),  digits)
     return(answer)
}

summ.stats


# find basic summary statistics by groups      #adapted from "Kickstarting R"
 summ.stats <- function (x,y) {               #data are x, grouping variable is y
  means <- tapply(x, y, mean, na.rm=TRUE)
  sds  <- tapply(x,y, sd, na.rm = TRUE)
  valid <- function (x) {return(sum(!is.na(x)) )}
  ns <- tapply(x,y, valid )
  se=sds/sqrt(ns)
  answer <-data.frame(mean=means,sd=sds,se=se,n=ns)
  }

error.crosses


#error bars in a two dimensional plot            
error.crosses <-function (x,y,z)  # x  and y are data frame with z points
    {for (i in 1:z)  
    	{xcen <- x$mean[i]
    	 ycen <- y$mean[i]
    	 xse  <- x$se[i]
    	 yse <-  y$se[i]
    	 arrows(xcen-xse,ycen,xcen+xse,ycen,length=.2, angle = 90, code=3,col = par("fg"), lty = NULL, lwd = par("lwd"), xpd = NULL)
    	 arrows(xcen,ycen-yse,xcen,ycen+yse,length=.2, angle = 90, code=3,col = par("fg"), lty = NULL, lwd = par("lwd"), xpd = NULL)
    	}	
   }

#examples
# plot(mPA,mNA,pch=symb[condit],cex=4.5,col=colors[condit],bg=colors[condit],main="Postive vs. Negative Affect",xlim=c(-1,1.5),ylim=c(-1,1.5),xlab="Positive Affect",ylab="Negative Affect")
    
#error.crosses(paf.stats,naf.stats,4)     #put in x and y error bars! 


skew


 skew= function (x, na.rm = FALSE) 
 {
    if (na.rm)    x <- x[!is.na(x)]             #remove missing values
    sum((x - mean(x))^3)/(length(x) * sd(x)^3)  #calculate skew   
 }

fancy correlation plots

Put scatter plots below diagonal, histograms on the diagonal, correlations above the diagonal. The scatter plots can be best fitted with a loess smooth (smooth=TRUE is the default), and the correlations can be scaled (scale=TRUE) by the size of the correlation.

These functions are adapted (stolen?) from the help.cor examples.

 
     #some fancy graphics   -- adapted from help.cor
 panel.cor.scale <- function(x, y, digits=2, prefix="", cex.cor)
     {
         usr <- par("usr"); on.exit(par(usr))
         par(usr = c(0, 1, 0, 1))
         r = (cor(x, y,use="pairwise"))
         txt <- format(c(r, 0.123456789), digits=digits)[1]
         txt <- paste(prefix, txt, sep="")
         if(missing(cex.cor)) cex <- 0.8/strwidth(txt)
         text(0.5, 0.5, txt, cex = cex * abs(r))
     }
     
  panel.cor <- function(x, y, digits=2, prefix="", cex.cor)
     {
         usr <- par("usr"); on.exit(par(usr))
         par(usr = c(0, 1, 0, 1))
         r = (cor(x, y,use="pairwise"))
         txt <- format(c(r, 0.123456789), digits=digits)[1]
         txt <- paste(prefix, txt, sep="")
         if(missing(cex.cor)) cex <- 0.8/strwidth(txt)
         text(0.5, 0.5, txt, cex = cex )
     }
     
     panel.hist <- function(x, ...)
{
    usr <- par("usr"); on.exit(par(usr))
    par(usr = c(usr[1:2], 0, 1.5) )
    h <- hist(x, plot = FALSE)
    breaks <- h$breaks; nB <- length(breaks)
    y <- h$counts; y <- y/max(y)
    rect(breaks[-nB], 0, breaks[-1], y, col="cyan", ...)
}

pairs.panels <- function (x,y,smooth=TRUE,scale=FALSE) 
      {if (smooth ){
         if (scale) {
             pairs(x,diag.panel=panel.hist,upper.panel=panel.cor.scale,lower.panel=panel.smooth)
                    }
                    else {pairs(x,diag.panel=panel.hist,upper.panel=panel.cor,lower.panel=panel.smooth)
                    } #else  {pairs(x,diag.panel=panel.hist,upper.panel=panel.cor,lower.panel=panel.smooth)
                    }
                   
                    else      #smooth is not true
             { if (scale) {pairs(x,diag.panel=panel.hist,upper.panel=panel.cor.scale)
               } else  {pairs(x,diag.panel=panel.hist,upper.panel=panel.cor) }
            } #end of else (smooth)
         
      }   #end of function

multi.hist

  

  
  multi.hist <- function(x) {nvar <- dim(x)[2]  #number of variables
     nsize=trunc(sqrt(nvar))+1   #size of graphic
     old.par <- par(no.readonly = TRUE) # all par settings which can be changed
     par(mfrow=c(nsize,nsize))       #set new graphic parameters
     for (i in 1:nvar) {
     name=names(x)[i]                #get the names for the variables
     hist(x[,i],main=name,xlab=name) }  #draw the histograms for each variable
     on.exit(par(old.par))   #set the graphic parameters back to the original
     }

correct.cor


#function to replace upper triangle of matrix with unattenuated correlations, and the diagonal with reliabilities
#input is a correlation matrix and a vector of reliabilities
     
 correct.cor <- function(x,y) { n=dim(x)[1]   
        { x[1,1] <- y[1,1]
        for (i in 2:n) {
           x[i,i] <- y[1,i]   #put reliabilities on the diagonal
           k=i-1
           for (j in 1:k) {
              x[j,i] <- x[j,i]/sqrt(y[1,i]*y[1,j])  }   #fix the upper triangular part of the matrix
        
             }
           return(x)  }}

paired.r


 #difference of dependent (paired) correlations (following Steiger,J., 1980, Tests for comparing elements of a correlation matrix. Psychological Bulletin, 87, 245-251)
 paired.r <- function(xy,xz,yz,n) {
       diff <- xy-xz
       determin=1-xy*xy - xz*xz - yz*yz + 2*xy*xz*yz
       av=(xy+xz)/2
       cube= (1-yz)*(1-yz)*(1-yz)
       t2 = diff * sqrt((n-1)*(1+yz)/(((2*(n-1)/(n-3))*determin+av*av*cube)))
       return(t2)
        }

fisherz


  fisherz <- function(rho)  {0.5*log((1+rho)/(1-rho)) }   #converts r to z

#quick demonstration of a random walk process #to show range of variability over trials and replications #the "confidence lines" represent 2 sd of theoretical sum random.walk <- function(length=200,n=20,effect=0) { num <- seq(1:length) colors <- rainbow(n) #select colors x <- cumsum(rnorm(length,effect))/num plot(x,ylim = c(-2.5,2.5),typ="b",col=colors[1],main="Sample means as function of sample size",ylab="sample mean",xlab="sample size") for (i in 2:n) { x <- cumsum(rnorm(length,effect))/num #find the next line points(x,col=colors[i],typ="b") #draw it } curve(2/sqrt(x),1,length,101,add=TRUE) #upper confidence region curve(-2/sqrt(x),1,length,101,add=TRUE) #lower confidence region text(length/2,2,"effect size is " ) text(length/2, 1.8,eval(effect)) }

part of a short guide to R
Version of December 31, 2004
William Revelle
Department of Psychology
Northwestern University