Useful snippits of R code

A collection of useful functions to do psychometrics and data analysis. To load all of these, use the source command:

 

source("http://personality-project.org/r/useful.r")    #the procedures listed below
source("http://personality-project.org/r/vss.r")       #the Very Simple Structure package

  1. alpha.scale(scale,item.df) (find coefficient alpha from a scale vector and dataframe of the items)
  2. describe(x,digits=2) (give means, sd, n, se, and skew for a data frame)
  3. summ.stats (summary statistics for a data frame broken down by a categorical variable) -- also can use by(x,y,describe)
  4. error.crosses (error bars in two space)
  5. skew (x)
  6. pairs.panels(x) (fancy correlations splom below diagonal, correlations above)
  7. multi.hist(x) (plot multiple histograms for a data frame)
  8. correct.cor (rmatrix,reliabilities) (given a correlation matrix and a vector of reliabilities, put the reliabilities on the diagonal and report unattenuated correlations above the diagonal.
  9. paired.r(r12,r13,r23,n) test for the difference between two correlated correlations (returns t value)
  10. fisherz(r) convert Pearson r to fisher z
  11. VSS (and VSS.plot and VSS.simulate) found in vss.r

Alpha

Find Cronbach's coefficient alpha for a scale formed from a data.frame of items.

#define a function to calculate coefficient alpha
alpha.scale=function (x,y)   #find coefficient alpha given a scale and a data.frame of the items in the scale
	{
		n=length(y)          #number of variables
		Vi=sum(diag(var(y,na.rm=TRUE)))     #sum of item variance
		Vt=var(x,na.rm=TRUE)                #total test variance
		((Vt-Vi)/Vt)*(n/(n-1))}              #alpha


describe

Report basic descriptive statistics (mean, median, standard deviation, skew, and standard error) for each variable in a data frame. When combined with the by function, this can also describe data by groups. Defaults to 2 decimals places.


#general descriptive statistics 
describe <- function (x, digits = 2,na.rm=TRUE)   #basic stats after dropping non-numeric data
{                                   #first, define a local function
    valid <- function(x) {      
        return(sum(!is.na(x)))
   		 }
    if (is.vector(x) )          #do it for vectors or 
    	{
    	    stats = matrix(rep(NA,6),ncol=6)    #create a temporary array
    
			stats[1, 1] <-  mean(x, na.rm=na.rm )
			stats[1, 2] <-  median(x,na.rm=na.rm  )
			stats[1, 3] <-  min(x, na.rm=na.rm )
			stats[1, 4] <-  max(x, na.rm=na.rm )
			stats[1, 5] <-  skew(x,na.rm=na.rm  )
			stats[1, 6] <-  valid(x )
        	len <- 1   
    	}
    	
    	
    else  {len = dim(x)[2]     #do it for matrices or data.frames
    
    stats = matrix(rep(NA,len*6),ncol=6)    #create a temporary array
    for (i in 1:len) {
    	if (is.numeric(x[,i])) {   #just do this for numeric data
			stats[i, 1] <-  mean(x[,i], na.rm=na.rm )
			stats[i, 2] <-  median(x[,i],na.rm=na.rm  )
			stats[i, 3] <-  min(x[,i], na.rm=na.rm )
			stats[i, 4] <-  max(x[,i], na.rm=na.rm )
			stats[i, 5] <-  skew(x[,i],na.rm=na.rm  )
			stats[i, 6] <-  valid(x[,i] )
        		}
    	}
    	}
   temp <-  data.frame(n = stats[,6],mean = stats[,1], sd = sd(x,na.rm=TRUE), median = stats[, 
        2],min= stats[,3],max=stats[,4], range=stats[,4]-stats[,3],skew = stats[, 5])
    answer <-  round(data.frame(V=seq(1:len),temp, se = temp$sd/sqrt(temp$n)),  digits)
     return(answer)
}

     
    

summ.stats


# find basic summary statistics by groups      #adapted from "Kickstarting R"
 summ.stats <- function (x,y) {               #data are x, grouping variable is y
  means <- tapply(x, y, mean, na.rm=TRUE)
  sds  <- tapply(x,y, sd, na.rm = TRUE)
  valid <- function (x) {return(sum(!is.na(x)) )}
  ns <- tapply(x,y, valid )
  se=sds/sqrt(ns)
  answer <-data.frame(mean=means,sd=sds,se=se,n=ns)
  }
  
  
 

error.crosses


#error bars in a two dimensional plot            
error.crosses <-function (x,y,z)  # x  and y are data frame with z points
    {for (i in 1:z)  
    	{xcen <- x$mean[i]
    	 ycen <- y$mean[i]
    	 xse  <- x$se[i]
    	 yse <-  y$se[i]
    	 arrows(xcen-xse,ycen,xcen+xse,ycen,length=.2, angle = 90, code=3,col = par("fg"), lty = NULL, lwd = par("lwd"), xpd = NULL)
    	 arrows(xcen,ycen-yse,xcen,ycen+yse,length=.2, angle = 90, code=3,col = par("fg"), lty = NULL, lwd = par("lwd"), xpd = NULL)
    	}	
   }

#examples
# plot(mPA,mNA,pch=symb[condit],cex=4.5,col=colors[condit],bg=colors[condit],main="Postive vs. Negative Affect",xlim=c(-1,1.5),ylim=c(-1,1.5),xlab="Positive Affect",ylab="Negative Affect")
    
#error.crosses(paf.stats,naf.stats,4)     #put in x and y error bars! 


skew

skew= function (x, na.rm = FALSE) { if (na.rm) x <- x[!is.na(x)] #remove missing values sum((x - mean(x))^3)/(length(x) * sd(x)^3) #calculate skew }

fancy correlation plots

Put scatter plots below diagonal, histograms on the diagonal, correlations above the diagonal. The scatter plots can be best fitted with a loess smooth (smooth=TRUE is the default), and the correlations can be scaled (scale=TRUE) by the size of the correlation.

These functions are adapted (stolen?) from the help.cor examples.

 
     #some fancy graphics   -- adapted from help.cor
 panel.cor.scale <- function(x, y, digits=2, prefix="", cex.cor)
     {
         usr <- par("usr"); on.exit(par(usr))
         par(usr = c(0, 1, 0, 1))
         r = (cor(x, y,use="pairwise"))
         txt <- format(c(r, 0.123456789), digits=digits)[1]
         txt <- paste(prefix, txt, sep="")
         if(missing(cex.cor)) cex <- 0.8/strwidth(txt)
         text(0.5, 0.5, txt, cex = cex * abs(r))
     }
     
  panel.cor <- function(x, y, digits=2, prefix="", cex.cor)
     {
         usr <- par("usr"); on.exit(par(usr))
         par(usr = c(0, 1, 0, 1))
         r = (cor(x, y,use="pairwise"))
         txt <- format(c(r, 0.123456789), digits=digits)[1]
         txt <- paste(prefix, txt, sep="")
         if(missing(cex.cor)) cex <- 0.8/strwidth(txt)
         text(0.5, 0.5, txt, cex = cex )
     }
     
     panel.hist <- function(x, ...)
{
    usr <- par("usr"); on.exit(par(usr))
    par(usr = c(usr[1:2], 0, 1.5) )
    h <- hist(x, plot = FALSE)
    breaks <- h$breaks; nB <- length(breaks)
    y <- h$counts; y <- y/max(y)
    rect(breaks[-nB], 0, breaks[-1], y, col="cyan", ...)
}

pairs.panels <- function (x,y,smooth=TRUE,scale=FALSE) 
      {if (smooth ){
         if (scale) {
             pairs(x,diag.panel=panel.hist,upper.panel=panel.cor.scale,lower.panel=panel.smooth)
                    }
                    else {pairs(x,diag.panel=panel.hist,upper.panel=panel.cor,lower.panel=panel.smooth)
                    } #else  {pairs(x,diag.panel=panel.hist,upper.panel=panel.cor,lower.panel=panel.smooth)
                    }
                   
                    else      #smooth is not true
             { if (scale) {pairs(x,diag.panel=panel.hist,upper.panel=panel.cor.scale)
               } else  {pairs(x,diag.panel=panel.hist,upper.panel=panel.cor) }
            } #end of else (smooth)
         
      }   #end of function

     

multi.hist

  

  
  multi.hist <- function(x) {nvar <- dim(x)[2]  #number of variables
     nsize=trunc(sqrt(nvar))+1   #size of graphic
     old.par <- par(no.readonly = TRUE) # all par settings which can be changed
     par(mfrow=c(nsize,nsize))       #set new graphic parameters
     for (i in 1:nvar) {
     name=names(x)[i]                #get the names for the variables
     hist(x[,i],main=name,xlab=name) }  #draw the histograms for each variable
     on.exit(par(old.par))   #set the graphic parameters back to the original
     }

correct.cor


#function to replace upper triangle of matrix with unattenuated correlations, and the diagonal with reliabilities
#input is a correlation matrix and a vector of reliabilities
     
 correct.cor <- function(x,y) { n=dim(x)[1]   
        { x[1,1] <- y[1,1]
        for (i in 2:n) {
           x[i,i] <- y[1,i]   #put reliabilities on the diagonal
           k=i-1
           for (j in 1:k) {
              x[j,i] <- x[j,i]/sqrt(y[1,i]*y[1,j])  }   #fix the upper triangular part of the matrix
        
             }
           return(x)  }}
          
     

paired.r


 #difference of dependent (paired) correlations (following Steiger,J., 1980, Tests for comparing elements of a correlation matrix. Psychological Bulletin, 87, 245-251)
 paired.r <- function(xy,xz,yz,n) {
       diff <- xy-xz
       determin=1-xy*xy - xz*xz - yz*yz + 2*xy*xz*yz
       av=(xy+xz)/2
       cube= (1-yz)*(1-yz)*(1-yz)
       t2 = diff * sqrt((n-1)*(1+yz)/(((2*(n-1)/(n-3))*determin+av*av*cube)))
       return(t2)
        }
     
   

fisherz


  fisherz <- function(rho)  {0.5*log((1+rho)/(1-rho)) }   #converts r to z     

 
#quick demonstration of a random walk process #to show range of variability over trials and replications #the "confidence lines" represent 2 sd of theoretical sum random.walk <- function(length=200,n=20,effect=0) { num <- seq(1:length) colors <- rainbow(n) #select colors x <- cumsum(rnorm(length,effect))/num plot(x,ylim = c(-2.5,2.5),typ="b",col=colors[1],main="Sample means as function of sample size",ylab="sample mean",xlab="sample size") for (i in 2:n) { x <- cumsum(rnorm(length,effect))/num #find the next line points(x,col=colors[i],typ="b") #draw it } curve(2/sqrt(x),1,length,101,add=TRUE) #upper confidence region curve(-2/sqrt(x),1,length,101,add=TRUE) #lower confidence region text(length/2,2,"effect size is " ) text(length/2, 1.8,eval(effect)) }
part of a short guide to R
Version of December 31, 2004
William Revelle
Department of Psychology
Northwestern University