#useful functions for psychometrics and personality research
#a growing compilation of simple functions that I have found useful for doing personality research
#last updated Feb 14, 2005
#by William Revelle
#includes
#alpha.scale         find coefficient alpha for a scale and a dataframe of items
#describe            give means, sd, skew, n, and se
#summ.stats          basic summary statistics by a grouping variable
#error.crosses       error bars in a two dimensional plot
#skew                find skew
#panel.cor           taken from the examples for pairs
#pairs.panels        adapted from panel.cor -- gives a splom, histograms, and a correlation matrix
#multi.hist          plot multiple histograms
#correct.cor         given a correlation matrix and a vector of reliabilities, correct for reliability
#fisherz             convert Pearson r to Fisher z
#paired.r            test for difference of dependent correlations
#count.pairwise      count the number of good cases when doing pairwise analysis
#eigen.loadings      convert eigen vectors to factor loadings by unnormalizing them
#principal           yet another way to do a principal components analysis -- brute force eigenvalue decomposition
#factor.congruence   find the factor congruence coefficients
#factor.model        given a factor model, find the correlation matrix
#factor.residuals    how well does it fit?
#factor.rotate       rotate two columns of a factor matrix by theta (in degrees)
#phi2poly            convert a matrix of phi coefficients to polychoric correlations

#define a function to calculate coefficient alpha
alpha.scale <- function (x,y)   #find coefficient alpha given a scale (x) and a data.frame (y) of the items in the scale
{
   n  <- length(y)                        #number of variables
   Vi <- sum(diag(var(y,na.rm=TRUE)))     #sum of item variances
   Vt <- var(x,na.rm=TRUE)                #total test variance
   ((Vt-Vi)/Vt)*(n/(n-1))                 #alpha
}

#general descriptive statistics
describe <- function (x, digits = 2, na.rm = TRUE)   #basic stats after dropping non-numeric data
{
   #first, define a local function to count the valid (non-missing) cases
   valid <- function(x) {return(sum(!is.na(x)))}
   if (is.vector(x)) {                        #do it for vectors or
      stats <- matrix(rep(NA,7),ncol=7)       #create a temporary array
      stats[1, 1] <- mean(x, na.rm=na.rm)
      stats[1, 2] <- median(x, na.rm=na.rm)
      stats[1, 3] <- min(x, na.rm=na.rm)
      stats[1, 4] <- max(x, na.rm=na.rm)
      stats[1, 5] <- skew(x, na.rm=na.rm)
      stats[1, 6] <- valid(x)
      stats[1, 7] <- sd(x, na.rm=na.rm)
      len <- 1
   } else {                                   #do it for matrices or data.frames
      len <- dim(x)[2]
      stats <- matrix(rep(NA,len*7),ncol=7)   #create a temporary array
      for (i in 1:len) {
         if (is.numeric(x[,i])) {             #just do this for numeric data
            stats[i, 1] <- mean(x[,i], na.rm=na.rm)
            stats[i, 2] <- median(x[,i], na.rm=na.rm)
            stats[i, 3] <- min(x[,i], na.rm=na.rm)
            stats[i, 4] <- max(x[,i], na.rm=na.rm)
            stats[i, 5] <- skew(x[,i], na.rm=na.rm)
            stats[i, 6] <- valid(x[,i])
            stats[i, 7] <- sd(x[,i], na.rm=na.rm)
         }
      }
   }
   temp <- data.frame(n = stats[,6], mean = stats[,1], sd = stats[,7],
                      median = stats[,2], min = stats[,3], max = stats[,4],
                      range = stats[,4]-stats[,3], skew = stats[,5])
   answer <- round(data.frame(V = seq(1:len), temp, se = temp$sd/sqrt(temp$n)), digits)
   return(answer)
}

#find basic summary statistics by groups
#adapted from "Kickstarting R"
summ.stats <- function (x,y)   #data are x, grouping variable is y
{
   means <- tapply(x, y, mean, na.rm = TRUE)
   sds   <- tapply(x, y, sd, na.rm = TRUE)
   valid <- function (x) {return(sum(!is.na(x)))}
   ns    <- tapply(x, y, valid)
   se    <- sds/sqrt(ns)
   answer <- data.frame(mean = means, sd = sds, se = se, n = ns)
   return(answer)
}

#error bars in a two dimensional plot
old.error.crosses <- function (x,y,z)   # x and y are data frames with z points
{for (i in 1:z)
   {xcen <- x$mean[i]
    ycen <- y$mean[i]
    xse  <- x$se[i]
    yse  <- y$se[i]
    arrows(xcen-xse,ycen,xcen+xse,ycen,length=.2, angle = 90, code=3, col = par("fg"), lty = NULL, lwd = par("lwd"), xpd = NULL)
    arrows(xcen,ycen-yse,xcen,ycen+yse,length=.2, angle = 90, code=3, col = par("fg"), lty = NULL, lwd = par("lwd"), xpd = NULL)
   }
}
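#usage sketch for alpha.scale and describe (simulated data; the names sim.items and sim.total are arbitrary):
# set.seed(42)
# sim.items <- data.frame(matrix(rnorm(500),ncol=5))   #100 simulated subjects, 5 "items"
# sim.total <- rowSums(sim.items)                      #total score for the 5 item scale
# alpha.scale(sim.total,sim.items)                     #coefficient alpha for the five items
# describe(sim.items)                                  #n, mean, sd, median, min, max, range, skew, and se per item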
#error bars in a two dimensional plot with labels
error.crosses <- function (x,y,labels=NULL,pos=NULL,arrow.len=.2,...)   # x and y are data frames with $mean and $se for each point
{z <- dim(x)[1]
 if (length(pos)==0) {locate <- rep(1,z)} else {locate <- pos}
 if (length(labels)==0) {lab <- rep("",z)} else {lab <- labels}
 for (i in 1:z)
   {xcen <- x$mean[i]
    ycen <- y$mean[i]
    xse  <- x$se[i]
    yse  <- y$se[i]
    arrows(xcen-xse,ycen,xcen+xse,ycen,length=arrow.len, angle = 90, code=3, col = par("fg"), lty = NULL, lwd = par("lwd"), xpd = NULL)
    arrows(xcen,ycen-yse,xcen,ycen+yse,length=arrow.len, angle = 90, code=3, col = par("fg"), lty = NULL, lwd = par("lwd"), xpd = NULL)
    text(xcen,ycen,labels=lab[i],pos=locate[i],cex=1,offset=arrow.len+1)    #puts in labels for all points
   }
}
#examples
# plot(mPA,mNA,pch=symb[condit],cex=4.5,col=colors[condit],bg=colors[condit],main="Positive vs. Negative Affect",xlim=c(-1,1.5),ylim=c(-1,1.5),xlab="Positive Affect",ylab="Negative Affect")
# error.crosses(paf.stats,naf.stats)   #put in x and y error bars!

#find a vector of skews for a data matrix
skew <- function (x, na.rm = FALSE)
{
   if (na.rm) x <- x[!is.na(x)]                  #remove missing values
   sum((x - mean(x))^3)/(length(x) * sd(x)^3)    #calculate skew
}

#some fancy graphics -- adapted from the panel.cor example in help(pairs)
panel.cor.scale <- function(x, y, digits=2, prefix="", cex.cor)
{
   usr <- par("usr"); on.exit(par(usr))
   par(usr = c(0, 1, 0, 1))
   r <- cor(x, y, use="pairwise")
   txt <- format(c(r, 0.123456789), digits=digits)[1]
   txt <- paste(prefix, txt, sep="")
   if(missing(cex.cor)) cex <- 0.8/strwidth(txt)
   text(0.5, 0.5, txt, cex = cex * abs(r))   #scale the printed correlation by its absolute size
}

panel.cor <- function(x, y, digits=2, prefix="", cex.cor)
{
   usr <- par("usr"); on.exit(par(usr))
   par(usr = c(0, 1, 0, 1))
   r <- cor(x, y, use="pairwise")
   txt <- format(c(round(r,digits), 0.123456789), digits=digits)[1]
   txt <- paste(prefix, txt, sep="")
   if(missing(cex.cor)) cex <- 0.8/strwidth(txt)
   text(0.5, 0.5, txt, cex = cex)
}

panel.hist <- function(x, ...)
{
   usr <- par("usr"); on.exit(par(usr))
   par(usr = c(usr[1:2], 0, 1.5))
   h <- hist(x, plot = FALSE)
   breaks <- h$breaks; nB <- length(breaks)
   y <- h$counts; y <- y/max(y)
   rect(breaks[-nB], 0, breaks[-1], y, col="cyan", ...)
}

pairs.panels <- function (x,y,smooth=TRUE,scale=FALSE,digits=2)   #combines a splom, histograms, and correlations
{if (smooth) {
    if (scale) {
       pairs(x,diag.panel=panel.hist,upper.panel=panel.cor.scale,lower.panel=panel.smooth)
    } else {
       pairs(x,diag.panel=panel.hist,upper.panel=panel.cor,lower.panel=panel.smooth)
    }
 } else {      #smooth is not true
    if (scale) {
       pairs(x,diag.panel=panel.hist,upper.panel=panel.cor.scale)
    } else {
       pairs(x,diag.panel=panel.hist,upper.panel=panel.cor)
    }
 }   #end of else (smooth)
}    #end of function

#plot a histogram for each variable in a data frame or matrix
multi.hist <- function(x,...)
{nvar  <- dim(x)[2]                    #number of variables
 nsize <- trunc(sqrt(nvar))+1          #size of graphic
 old.par <- par(no.readonly = TRUE)    #all par settings which can be changed
 par(mfrow=c(nsize,nsize))             #set new graphic parameters
 for (i in 1:nvar) {
    name <- names(x)[i]                #get the names for the variables
    hist(x[,i],main=name,xlab=name,...)   #draw the histograms for each variable
 }
 on.exit(par(old.par))                 #set the graphic parameters back to the original
}
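#usage sketch for the graphics functions (simulated data; sim.df is an arbitrary name):
# set.seed(17)
# sim.df <- data.frame(x=rnorm(200), y=rnorm(200), z=rnorm(200))
# pairs.panels(sim.df)              #splom with histograms on the diagonal and correlations above
# pairs.panels(sim.df,scale=TRUE)   #scale the printed correlations by their absolute size
# multi.hist(sim.df)                #one histogram per variable on a single page
# skew(sim.df$x)                    #skew of a single variable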
#function to do lowess fitting if we have missing data
lowess.na <- function(x, y = NULL, f = 2/3, ...)
{  #do lowess with missing data
   x1 <- subset(x,(!is.na(x)) & (!is.na(y)))
   y1 <- subset(y,(!is.na(x)) & (!is.na(y)))
   lowess.na <- lowess(x1,y1,f, ...)
}

#function to replace the upper triangle of a matrix with unattenuated correlations, and the diagonal with reliabilities
#input is a correlation matrix and a vector of reliabilities
correct.cor <- function(x,y)
{
   n <- dim(x)[1]
   x[1,1] <- y[1,1]
   for (i in 2:n) {
      x[i,i] <- y[1,i]     #put reliabilities on the diagonal
      k <- i-1
      for (j in 1:k) {
         x[j,i] <- x[j,i]/sqrt(y[1,i]*y[1,j])   #fix the upper triangular part of the matrix
      }
   }
   return(x)
}

#difference of dependent (paired) correlations (following Steiger, J., 1980, Tests for comparing
#elements of a correlation matrix. Psychological Bulletin, 87, 245-251)
paired.r <- function(xy,xz,yz,n)
{
   diff <- xy-xz
   determin <- 1 - xy*xy - xz*xz - yz*yz + 2*xy*xz*yz
   av <- (xy+xz)/2
   cube <- (1-yz)*(1-yz)*(1-yz)
   t2 <- diff * sqrt((n-1)*(1+yz)/(((2*(n-1)/(n-3))*determin+av*av*cube)))
   return(t2)
}

fisherz <- function(rho) {0.5*log((1+rho)/(1-rho))}   #converts r to Fisher z

#a function to call John Fox's polychor multiple times to form a matrix of polychoric correlations
#requires the polychor function from the polycor package
polychor.matrix <- function(x,y=NULL)
{
   sizex <- dim(x)[2]
   if ((is.data.frame(y))|(is.matrix(y))) sizey <- dim(y)[2] else sizey <- dim(x)[2]
   result <- matrix(1,nrow=sizey,ncol=sizex)   #create the output array
   xnames <- names(x)
   colnames(result) <- names(x)
   if ((is.data.frame(y))|(is.matrix(y))) rownames(result) <- names(y) else rownames(result) <- names(x)
   if (!((is.data.frame(y))|(is.matrix(y)))) {   #default case returns a square matrix
      for (i in 2:sizex) {
         for (j in 1:(i-1)) {
            result[j,i] <- polychor(table(x[,j],x[,i]))
            result[i,j] <- result[j,i]
         }
      }
   } else {   #if y is input, then return the rectangular array
      for (i in 1:sizex) {
         for (j in 1:sizey) {
            result[j,i] <- polychor(table(x[,i],y[,j]))
         }
      }
   }
   return(result)
}

#count.pairwise -- count the number of good cases when doing pairwise analysis (e.g., for correlations)
count.pairwise <- function(x,y=NULL)
{
   sizex <- dim(x)[2]
   if (is.data.frame(y)) sizey <- dim(y)[2] else sizey <- dim(x)[2]
   result <- matrix(1,nrow=sizey,ncol=sizex)   #create the output array
   xnames <- names(x)
   colnames(result) <- names(x)
   if ((is.data.frame(y))|(is.matrix(y))) rownames(result) <- names(y) else rownames(result) <- names(x)
   if (!is.data.frame(y)) {   #default case returns a square matrix
      for (i in 2:sizex) {
         for (j in 1:(i-1)) {
            result[j,i] <- sum((!is.na(x[,j])) & (!is.na(x[,i])))
            result[i,j] <- result[j,i]
         }
      }
   } else {   #if y is input, then return the rectangular array
      for (i in 1:sizex) {
         for (j in 1:sizey) {
            result[j,i] <- sum((!is.na(x[,i])) & (!is.na(y[,j])))
         }
      }
   }
   return(result)
}

#minor routines to manipulate matrices for factor analysis
#output from eigen is two lists: $values and $vectors
eigen.loadings <- function (x)
{  #convert eigen vectors to loadings by unnormalizing them
   n <- length(x$values)
   x$values[x$values < 0] <- 0                #treat negative eigenvalues as zero
   fix <- sqrt(x$values)
   result <- x$vectors * rep(fix, each = n)   #scale each column by the square root of its eigenvalue
   return(result)
}

#do a principal components analysis by eigenvalue decomposition and then report the biggest nfactors
principal <- function(r,nfactors=0)
{
   n <- dim(r)[1]
   result <- list(values=c(rep(0,n)),loadings=matrix(rep(0,n*n),ncol=n),residual=matrix(rep(0,n*n),ncol=n),fit=0)
   eigens <- eigen(r)                   #call the eigenvalue decomposition routine
   result$values <- eigens$values
   result$loadings <- eigen.loadings(eigens)
   rownames(result$loadings) <- rownames(r)
   if (nfactors > 0) result$loadings <- result$loadings[,1:nfactors]
   residual <- factor.residuals(r,result$loadings)
   r2 <- sum(r*r)
   rstar2 <- sum(residual*residual)
   result$residual <- residual
   result$fit <- 1 - rstar2/r2
   return(result)
}
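#usage sketch for the correlation utilities (the numbers and the data frame sim.xy are arbitrary):
# paired.r(.5,.3,.4,100)   #does r(xy)=.5 differ from r(xz)=.3, given r(yz)=.4 and n=100? (returns a t statistic)
# fisherz(.5)              #Fisher z transform of r = .5
# sim.xy <- data.frame(a=c(1,2,NA,4), b=c(NA,2,3,4))
# count.pairwise(sim.xy)   #pairwise counts of complete cases (off diagonal)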
#function to find factor congruence coefficients
factor.congruence <- function (x,y)
{
   nx <- dim(x)[2]
   ny <- dim(y)[2]
   cross <- t(y) %*% x                 #inner product will have dimensions ny * nx
   sumsx <- sqrt(1/diag(t(x)%*%x))
   sumsy <- sqrt(1/diag(t(y)%*%y))
   result <- matrix(rep(0,nx*ny),ncol=nx)
   result <- sumsy * (cross * rep(sumsx, each = ny))
   return(result)
}

#estimate a correlation matrix given a particular factor/component model
factor.model <- function(x)
{
   result <- x %*% t(x)
   return(result)
}

#find the residuals of the original minus the model correlation matrix
factor.residuals <- function(r, f)
{
   rstar <- r - factor.model(f)
   return(rstar)
}

#how well does the factor model fit?
factor.fit <- function (r,f)
{
   r2 <- sum(r*r)
   rstar <- factor.residuals(r,f)
   rstar2 <- sum(rstar*rstar)
   fit <- 1 - rstar2/r2
   return(fit)
}

#rotate two columns (col1 and col2) of a factor loading matrix by angle (in degrees)
factor.rotate <- function(f,angle,col1,col2)
{
   nvar <- dim(f)[2]
   rot <- matrix(rep(0,nvar*nvar),ncol=nvar)
   rot[cbind(1:nvar, 1:nvar)] <- 1     #start with the identity matrix
   theta <- 2*pi*angle/360             #convert degrees to radians
   rot[col1,col1] <- cos(theta)
   rot[col2,col2] <- cos(theta)
   rot[col1,col2] <- -sin(theta)
   rot[col2,col1] <- sin(theta)
   result <- f %*% rot
   return(result)
}
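#usage sketch for the factor analysis routines (simulated data; sim.data, R, and pc are arbitrary names):
# set.seed(1)
# sim.data <- matrix(rnorm(600),ncol=6)
# R <- cor(sim.data)
# pc <- principal(R,2)                         #first two principal components
# pc$fit                                       #how well do two components reproduce R?
# factor.congruence(pc$loadings,pc$loadings)   #congruence of the solution with itself (diagonal = 1)
# factor.rotate(pc$loadings,45,1,2)            #rotate the two components by 45 degrees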
#phi coefficient from a 2 x 2 table (function probably already exists but I can't find it)
phi <- function(t)
{  #t is a 2 x 2 matrix
   r.sum <- rowSums(t)
   c.sum <- colSums(t)
   total <- sum(r.sum)
   r.sum <- r.sum/total
   c.sum <- c.sum/total
   v <- prod(r.sum,c.sum)
   phi <- (t[1,1]/total - c.sum[1]*r.sum[1])/sqrt(v)
   return(phi)
}

#an alternative function to do the same, need to combine into one
phi1 <- function(x)
{  #returns the phi coefficient; x is a set of 4 cell frequencies, e.g. c(1,2,3,4)
   xm <- matrix(x,ncol=2)
   total <- sum(xm)
   xp <- xm/total
   HR <- xp[1,1] + xp[1,2]
   SR <- xp[1,1] + xp[2,1]
   VP <- xp[1,1]
   phi <- (VP - HR*SR)/sqrt(HR*(1-HR)*(SR)*(1-SR))
   return(phi)
}

#convert a phi coefficient to the corresponding polychoric correlation
#requires the polychor function from John Fox's polycor package
phi2poly <- function(ph,cp,cc)
{  #ph is the phi coefficient
   #cp is the selection ratio of the predictor
   #cc is the success rate of the criterion
   r.marg <- rep(0,2)
   c.marg <- rep(0,2)
   p <- array(rep(0,4),dim=c(2,2))
   r.marg[1] <- cp
   r.marg[2] <- 1 - cp
   c.marg[1] <- cc
   c.marg[2] <- 1 - cc
   p[1,1] <- r.marg[1]*c.marg[1] + ph*sqrt(prod(r.marg,c.marg))
   p[2,2] <- r.marg[2]*c.marg[2] + ph*sqrt(prod(r.marg,c.marg))
   p[1,2] <- r.marg[1]*c.marg[2] - ph*sqrt(prod(r.marg,c.marg))
   p[2,1] <- r.marg[2]*c.marg[1] - ph*sqrt(prod(r.marg,c.marg))
   result <- polychor(p)
   return(result)
}

#simulate a correlation matrix with variable cut points -- a psychometric demonstration
psycho.demo <- function()
{
   #make up some correlations with different cut points
   cuts <- c(-2,-1,0,1,2)
   nsub <- 1000                             #how many subjects
   r <- .6                                  #population correlation of observed with theta
   theta <- rnorm(nsub)                     #make up some random normal theta scores
   err <- rnorm(nsub)                       #random normal error scores
   obser <- theta*(r) + err*sqrt(1-r*r)     #observed = T + E
   #convert to 0/1 with different values of cuts
   trunc <- matrix(rep(obser,length(cuts)),ncol=length(cuts))
   for (i in 1:length(cuts)) {
      trunc[obser > cuts[i],i] <- 1
      trunc[obser < cuts[i],i] <- 0
   }
   d.mat <- data.frame(theta,obser,trunc)   #combine into a data frame
   trunc.cor <- cor(d.mat)                  #find the correlations
   freq <- colMeans(d.mat)                  #find the frequencies of scores
   #now, convert the upper diagonal to polychorics using John Fox's polychor and my phi2poly
   for (i in 4:length(d.mat)) {
      for (j in 3:i) {
         trunc.cor[j,i] <- phi2poly(trunc.cor[i,j],freq[i],freq[j])
      }
   }
   return(trunc.cor)
}

#quick demonstration of a random walk process
#to show the range of variability over trials and replications
#the "confidence lines" represent 2 sd of the theoretical sum
random.walk <- function(length=200,n=20,effect=0)
{
   num <- seq(1:length)
   colors <- rainbow(n)                        #select colors
   x <- cumsum(rnorm(length,effect))/num
   plot(x,ylim = c(-2.5,2.5),typ="b",col=colors[1],main="Sample means as function of sample size",ylab="sample mean",xlab="sample size")
   for (i in 2:n) {
      x <- cumsum(rnorm(length,effect))/num    #find the next line
      points(x,col=colors[i],typ="b")          #draw it
   }
   curve(2/sqrt(x),1,length,101,add=TRUE)      #upper confidence region
   curve(-2/sqrt(x),1,length,101,add=TRUE)     #lower confidence region
   text(length/2,2,"effect size is ")
   text(length/2,1.8,eval(effect))
}

#read from the clipboard on a Mac (thanks to Ken Knoblauch for this hint)
#for PCs, use read.table(file("clipboard"))
read.clipboard <- function(header=TRUE,...)
{
   MAC <- Sys.info()[1]=="Darwin"   #are we on a Mac using the Darwin system?
   if (!MAC) {
      if (header) read.clipboard <- read.table(file("clipboard"),header=TRUE,...)
      else read.clipboard <- read.table(file("clipboard"),...)
   } else {
      if (header) read.clipboard <- read.table(pipe("pbpaste"),header=TRUE,...)
      else read.clipboard <- read.table(pipe("pbpaste"),...)
   }
}

#same as read.clipboard, but for comma separated (csv) data
read.clipboard.csv <- function(header=TRUE,sep=',',...)
{
   MAC <- Sys.info()[1]=="Darwin"   #are we on a Mac using the Darwin system?
   if (!MAC) {
      if (header) read.clipboard <- read.table(file("clipboard"),header=TRUE,sep=sep,...)
      else read.clipboard <- read.table(file("clipboard"),sep=sep,...)
   } else {
      if (header) read.clipboard <- read.table(pipe("pbpaste"),header=TRUE,sep=sep,...)
      else read.clipboard <- read.table(pipe("pbpaste"),sep=sep,...)
   }
}
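#usage sketch for the phi functions and clipboard readers (arbitrary numbers; assumes the polycor package is installed for phi2poly):
# phi1(c(10,20,30,40))     #phi from the four cell frequencies of a 2 x 2 table
# library(polycor)
# phi2poly(.3,.5,.3)       #polychoric estimate for phi = .3, selection ratio .5, success rate .3
#read.clipboard() is used after copying a table (with a header row) from a spreadsheet or web page:
# my.data <- read.clipboard()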
#functions to help analyze data from correlation matrices
#used as part of the Synthetic Aperture Personality Assessment (SAPA) project
#
#extract cluster keys from a factor/principal components loadings matrix
#NOTE: as written, this repeats the principal components computation; the cluster keys still need to be extracted from the loadings
factor2cluster <- function(r,nfactors=0)
{
   n <- dim(r)[1]
   result <- list(values=c(rep(0,n)),loadings=matrix(rep(0,n*n),ncol=n),residual=matrix(rep(0,n*n),ncol=n),fit=0)
   eigens <- eigen(r)                   #call the eigenvalue decomposition routine
   result$values <- eigens$values
   result$loadings <- eigen.loadings(eigens)
   rownames(result$loadings) <- rownames(r)
   if (nfactors > 0) result$loadings <- result$loadings[,1:nfactors]
   residual <- factor.residuals(r,result$loadings)
   r2 <- sum(r*r)
   rstar2 <- sum(residual*residual)
   result$residual <- residual
   result$fit <- 1 - rstar2/r2
   return(result)
}

#find the correlation matrix of scales made up of items defined in a keys matrix (e.g., extracted by factor2cluster)
#takes as input the keys matrix as well as a correlation matrix of all the items
cluster.cor <- function(keys,r.mat,correct=TRUE)
{  #function to extract clusters according to the keys matrix
   #default is to correct for attenuation and show this above the diagonal
   if(!is.matrix(keys)) keys <- as.matrix(keys)   #keys are sometimes a data frame -- must be a matrix
   covar <- t(keys) %*% r.mat %*% keys            #matrix algebra is our friend
   var <- diag(covar)
   sd.inv <- 1/sqrt(var)
   ident.sd <- diag(sd.inv,ncol = length(sd.inv))
   cluster.correl <- ident.sd %*% covar %*% ident.sd
   key.var <- diag(t(keys) %*% keys)
   key.alpha <- ((var-key.var)/var)*(key.var/(key.var-1))
   key.alpha[is.nan(key.alpha)] <- 1              #if only 1 variable in the cluster, then alpha is undefined
   key.alpha[!is.finite(key.alpha)] <- 1
   colnames(cluster.correl) <- names(key.alpha)
   rownames(cluster.correl) <- names(key.alpha)
   if (correct) {
      cluster.corrected <- correct.cor(cluster.correl,t(key.alpha))   #correct for attenuation
      return(list(cor=cluster.correl,sd=sqrt(var),corrected=cluster.corrected))
   } else {
      return(list(cor=cluster.correl,sd=sqrt(var)))
   }
}

#a function to extract subsets of variables (a and b) from a correlation matrix m
#and find the multiple correlation beta weights + R2 of the a set predicting the b set
mat.regress <- function(m,a,b)
{
   #first reorder the matrix to select the right variables
   nm <- dim(m)[1]
   t.mat <- matrix(0,ncol=nm,nrow=nm)
   ab <- c(a,b)
   numa <- length(a)
   numb <- length(b)
   nab <- numa+numb
   for (i in 1:nab) {
      t.mat[i,ab[i]] <- 1
   }
   reorder <- t.mat %*% m %*% t(t.mat)
   a.matrix <- reorder[1:numa,1:numa]
   b.matrix <- reorder[1:numa,(numa+1):nab]
   model.mat <- solve(a.matrix,b.matrix)    #solve the equation bY~aX
   if (length(b) > 1) {
      rownames(model.mat) <- rownames(m)[a]
      colnames(model.mat) <- colnames(m)[b]
      r2 <- colSums(model.mat * b.matrix)
   } else {
      r2 <- sum(model.mat * b.matrix)
      names(model.mat) <- rownames(m)[a]
      names(r2) <- colnames(m)[b]
   }
   mat.regress <- list(beta=model.mat,r2=r2)
   return(mat.regress)
}
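#usage sketch for cluster.cor and mat.regress (simulated data; the keys matrix and all object names are arbitrary):
# set.seed(7)
# sim.items <- matrix(rnorm(900),ncol=9)
# R <- cor(sim.items)
# keys <- matrix(c(1,1,1,0,0,0,0,0,0,
#                  0,0,0,1,1,1,0,0,0,
#                  0,0,0,0,0,0,1,1,1),ncol=3)   #three 3-item clusters
# colnames(keys) <- c("A","B","C")
# cluster.cor(keys,R)                #scale intercorrelations, corrected for attenuation above the diagonal
# mat.regress(R,c(1,2,3),c(4,5))     #beta weights and R2 predicting variables 4 and 5 from variables 1-3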