#adapted from John Fox's Polychor #this does all the work #the following two functions are called repeatedly by tetrac and are put here to speed up the process # "tetraBinBvn.old" <- # function (rho,rc,cc) #adapted from John Fox's polychor # { row.cuts <- c(-Inf, rc, Inf) # col.cuts <- c(-Inf, cc, Inf) # P <- matrix(0, 2,2) # R <- matrix(c(1, rho, rho, 1), 2, 2) # for (i in 1:2) { # for (j in 1:2) { # P[i, j] <- pmvnorm(lower = c(row.cuts[i], col.cuts[j]), # upper = c(row.cuts[i + 1], col.cuts[j + 1]), # corr = R) # }} # P #the estimated 2 x 2 predicted by rho, rc, cc # } #modified 5/8/14 to be consistent with call from tetraF #no change in functionality, just more esthetic #changed 10/16/14 to use sadmvn instead of mvtnorm "tetraBinBvn" <- function (rho,rc,cc) #adapted from John Fox's polychor { row.cuts <- c(-Inf, rc, Inf) col.cuts <- c(-Inf, cc, Inf) P <- matrix(0, 2,2) R <- matrix(c(1, rho, rho, 1), 2, 2) P[1,1] <- sadmvn(lower = c(row.cuts[1], col.cuts[1]), upper = c(row.cuts[2], col.cuts[2]), mean=c(0,0), varcov = R) P[2,1] <- pnorm(rc) - P[1,1] P[1,2] <- pnorm(cc) - P[1,1] P[2,2] <- 1- pnorm(rc) - P[1,2] P #the estimated 2 x 2 predicted by rho, rc, cc } "tetraF" <- function(rho,cc,rc,tab) { P <- tetraBinBvn(rho, rc, cc) -sum(tab * log(P)) } #the ML criterion to be minimized "tetrac" <- function(x,y=NULL,taux,tauy,i,j,correct=.5,global=TRUE,weight=NULL) { if(is.null(y)) {tab <- x} else { if(is.null(weight)) {tab <- tableVeryFast(x,y) } else {tab <- wtd.table(x,y,weight)} #switched to tableF for speed } #changed 9/8/14 #if((length(tab) < 4) | (is.na(sum(tab)) | ((tab[1,1] + tab[1,2]) < 1) | ((tab[2,1] + tab[2,2]) < 1) | ((tab[1,1] + tab[2,1]) < 1) | ((tab[2,1] + tab[2,2]) < 1))) {warning("For i = ", i," j = ",j, " No variance for either i or j rho set to NA") if((length(tab) < 4) | (is.na(sum(tab)) )) {warning("For i = ", i," j = ",j, " No variance for either i or j rho set to NA") result <- list(rho=NA,tau=c(NA,NA),objective=NA) } else { if((sum(tab) > 1) && (min(tab) == 0) && (correct > 0)) { message("For i = ", i," j = ",j, " A cell entry of 0 was replaced with correct = ", correct, ". Check your data!") tab[tab==0] <- correct #correction for continuity } ###### put in the weights here if(sum(tab)>0) { if(global) {cc <- taux rc <- tauy } else { tot <- sum(tab) tab <- tab/tot rc <- qnorm(colSums(tab))[1] cc <- qnorm(rowSums(tab))[1] } rho <- optimize(tetraF,interval=c(-1,1),rc=rc,cc=cc,tab=tab) result <- list(rho=rho$minimum,tau=c(cc,rc),objective=rho$objective) } else { result <- list(rho=NA,tau=c(NA,NA),objectiv=NA)} } return(result) } "wtd.table" <- function(x,y,weight) { tab <- tapply(weight,list(x,y),sum,na.rm=TRUE,simplify=TRUE) #taken from questionr:wtd.table tab[is.na(tab)] <- 0 return(tab) } #repeatedly do the analysis to form a matrix of output #added the pmin instead of min on Sept 10, 2013 "tetra.mat" <- function(x,y=NULL,correct=.5,smooth=TRUE,global=TRUE,weight=NULL) { #the functions to do parallelism myfun <- function(x,i,j) {if(t(!is.na(x[,i]))%*% (!is.na(x[,j])) > 2 ) { tetra <- tetrac(x[,i],x[,j],tau[i],tau[j],i,j,correct=correct,global=global,weight=weight)} else { tetra <- list(rho=NA,tau=c(NA,NA),objective=NA)}} matpLower <- function(x,nvar) { k <- 1 il <- vector() jl <- vector() for(i in 2:nvar) {for (j in 1:(i-1)) { il[k] <- i jl [k] <- j k<- k+1} } #for debugging, turn off mcmapply tet <- mcmapply(function(i,j) myfun(x,i,j) , il,jl) #tet <- mapply(function(i,j) myfun(x,i,j) , il,jl) #for debugging, we do not do parallel cores #now make it a matrix mat <- diag(nvar) mat[upper.tri(mat)] <- as.numeric(tet[1,]) #first row of poly is correlation, 2nd the fit mat <- t(mat) + mat diag(mat) <- 1 return(mat) } nvar <- dim(x)[2] mx <- apply(x,2,function(x) min(x,na.rm=TRUE)) x <- t(t(x) - mx) #x <- x -min(x,na.rm=TRUE) #in case the numbers are not 0,1 -- using pmin allows different minima for different variables n.obs <- dim(x)[1] if(is.null(y)) { if(max(x,na.rm=TRUE) > 1) {stop("Tetrachoric correlations require dictomous data")} if(is.null(weight)) {tau <- -qnorm(colMeans(x,na.rm=TRUE))} else {tau <- -qnorm(apply(x,2,function(y) weighted.mean(y,weight,na.rm=TRUE)))} #weighted tau mat <- matrix(0,nvar,nvar) colnames(mat) <- colnames(y) rownames(mat) <- colnames(x) names(tau) <- colnames(x) #cat("\nFinding the tetrachoric correlations\n") #for (i in 2:nvar) { #progressBar(i^2/2,nvar^2/2,"Tetrachoric") # for (j in 1:(i-1)) { # if(t(!is.na(x[,i]))%*% (!is.na(x[,j])) > 2 ) { # tetra <- tetrac(x[,i],x[,j],tau[i],tau[j],i,j,correct=correct,global=global,weight=weight) # mat[i,j] <- mat[j,i] <- tetra$rho} else {mat[i,j] <- mat[j,i] <- NA} # } # } # diag(mat) <- 1 mat <- matpLower(x,nvar) if(any(is.na(mat))) {warning("some correlations are missing, smoothing turned off") smooth <- FALSE} if(smooth) {mat <- cor.smooth(mat) } #makes it positive semidefinite colnames(mat) <- rownames(mat) <- colnames(x) result <- list(rho = mat,tau = tau,n.obs=n.obs) } else { # the case of having a y variable my <- apply(y,2,function(x) min(x,na.rm=TRUE)) #apply to y Dec 24, 2019 y <- t(t(y) - my) #y <- y -min(y,na.rm=TRUE) #in case the numbers are not 0,1 if(is.matrix(y)) {ny <- dim(y)[2] tauy <- -qnorm(colMeans(y,na.rm=TRUE)) n.obs.y <- dim(y)[1] } else { ny <- 1 n.obs.y <- length(y) tauy <- -qnorm(mean(y,na.rm=TRUE)) } y <- as.matrix(y) if(dim(x)[1] != n.obs.y) {stop("x and y must have the same number of observations")} taux <- -qnorm(colMeans(x,na.rm=TRUE)) nx <- dim(x)[2] mat <- matrix(0,nx,ny) colnames(mat) <- colnames(y) rownames(mat) <- colnames(x) for (i in 1:nx) { for (j in 1:ny) {tetra <- tetrac(x[,i],y[,j],taux[i],tauy[j],correct=correct) mat[i,j] <- tetra$rho } } colnames(mat) <- colnames(y) rownames(mat) <- colnames(x) mat <- t(mat) result <- list(rho = mat,tau = taux,tauy= tauy,n.obs=n.obs) } flush(stdout()) cat("\n" ) #put in to clear the progress bar return(result) } #convert comorbidity type numbers to a table pqr <- function(q1,q2=NULL,p=NULL) { if(length(q1) > 1) { q2 <- q1[2] p <- q1[3] q1 <- q1[1]} tab <- matrix(0,2,2) tab[1,1] <- p tab[2,1] <- q1-p tab[1,2] <- q2-p tab[2,2] <- 1-q1 - tab[1,2] return(tab)} #repeatedly do the analysis to form a matrix of output #added the pmin instead of min on Sept 10, 2013 "tetra.mat.sc" <- function(x,y=NULL,correct=.5,smooth=TRUE,global=TRUE,weight=NULL) { nvar <- dim(x)[2] mx <- apply(x,2,function(x) min(x,na.rm=TRUE)) x <- t(t(x) - mx) #x <- x -min(x,na.rm=TRUE) #in case the numbers are not 0,1 -- using pmin allows different minima for different variables n.obs <- dim(x)[1] if(is.null(y)) { if(max(x,na.rm=TRUE) > 1) {stop("Tetrachoric correlations require dictomous data")} if(is.null(weight)) {tau <- -qnorm(colMeans(x,na.rm=TRUE))} else {tau <- -qnorm(apply(x,2,function(y) weighted.mean(y,weight,na.rm=TRUE)))} #weighted tau mat <- matrix(0,nvar,nvar) names(tau) <- colnames(x) #cat("\nFinding the tetrachoric correlations\n") for (i in 2:nvar) { progressBar(i^2/2,nvar^2/2,"Tetrachoric") for (j in 1:(i-1)) { if(t(!is.na(x[,i]))%*% (!is.na(x[,j])) > 2 ) { tetra <- tetrac(x[,i],x[,j],tau[i],tau[j],i,j,correct=correct,global=global,weight=weight) mat[i,j] <- mat[j,i] <- tetra$rho} else {mat[i,j] <- mat[j,i] <- NA} } } diag(mat) <- 1 if(any(is.na(mat))) {warning("some correlations are missing, smoothing turned off") smooth <- FALSE} if(smooth) {mat <- cor.smooth(mat) } #makes it positive semidefinite #colnames(mat) <- rownames(mat) <- colnames(x) #probably alreday know these result <- list(rho = mat,tau = tau,n.obs=n.obs) } else { # the case of having a y variable my <- apply(x,2,function(x) min(x,na.rm=TRUE)) y <- t(t(y) - my) #y <- y -min(y,na.rm=TRUE) #in case the numbers are not 0,1 if(is.matrix(y)) {ny <- dim(y)[2] tauy <- -qnorm(colMeans(y,na.rm=TRUE)) n.obs.y <- dim(y)[1] } else { ny <- 1 n.obs.y <- length(y)} tauy <- -qnorm(mean(y,na.rm=TRUE)) y <- as.matrix(y) if(dim(x)[1] != n.obs.y) {stop("x and y must have the same number of observations")} taux <- -qnorm(colMeans(x,na.rm=TRUE)) nx <- dim(x)[2] mat <- matrix(0,nx,ny) colnames(mat) <- colnames(y) rownames(mat) <- colnames(x) for (i in 1:nx) { for (j in 1:ny) {tetra <- tetrac(x[,i],y[,j],taux[i],tauy[j],correct=correct) mat[i,j] <- tetra$rho } } result <- list(rho = mat,tau = taux,tauy= tauy,n.obs=n.obs) } flush(stdout()) cat("\n" ) #put in to clear the progress bar return(result) } #convert comorbidity type numbers to a table pqr <- function(q1,q2=NULL,p=NULL) { if(length(q1) > 1) { q2 <- q1[2] p <- q1[3] q1 <- q1[1]} tab <- matrix(0,2,2) tab[1,1] <- p tab[2,1] <- q1-p tab[1,2] <- q2-p tab[2,2] <- 1 - q1 - q2 + p return(tab)} #the public function "tetrachoric" <- function(x,y=NULL,correct=.5,smooth=TRUE,global=TRUE,weight=NULL,na.rm=TRUE,delete=TRUE) { # if(!require(mnormt)) {stop("I am sorry, you must have mnormt installed to use tetrachoric")} cl <- match.call() if (!is.matrix(x) && !is.data.frame(x)) { if (length(x) ==4) {x <- matrix(x,2,2) } else { if(length(x) ==3 ) {x <- pqr(x) } else { stop("Data must be either a 1 x 4 vector, a 2 x 2 matrix, a comorbidity table, or a data.frame/matrix of data")} }} nvar <- dim(x)[2] n.obs <- dim(x)[1] # if(!is.numeric(x)) {x <- matrix(as.numeric(x),ncol=nvar) # message("Converted non-numeric input to numeric")} if(!is.null(weight)) {if (length(weight)!= n.obs) stop("The number of weights must match the number of observations") } if (n.obs == nvar) {result <- tetrac(x,correct=correct,i=1,j=1,global=FALSE)} else { #first delete any bad cases item.var <- apply(x,2,sd,na.rm=na.rm) bad <- which((item.var <= 0)|is.na(item.var)) if((length(bad) > 0) & delete) { for (baddy in 1:length(bad)) {warning( "Item = ",colnames(x)[bad][baddy], " had no variance and was deleted")} x <- x[,-bad] nvar <- nvar - length(bad) } # parallel is now built into the system, so we don't need this. # if(!require(parallel)) {warning("need parallel installed to take advantage of multiple cores. Using single core version instead") # result <- tetra.mat.sc(x,y=y,correct=correct,smooth=smooth,global=global,weight=weight)} else { result <- tetra.mat(x,y=y,correct=correct,smooth=smooth,global=global,weight=weight)} result$Call <- cl class(result) <- c("psych","tetra") return(result) } #modified 1/14/14 to include the tableF function to double the speed for large problems #modified 12/25/19 to use tableVeryFast to be even be faster () "tetrachor" <- function(x,correct=.5) { #if(!require(mnormt)) {stop("I am sorry, you must have mnormt installed to use tetrachor")} cl <- match.call() if (!is.matrix(x) && !is.data.frame(x)) { if (length(x) ==4) {x <- matrix(x,2,2) } else { if(length(x) ==3 ) {x <- pqr(x) } else { stop("Data must be either a 1 x 4 vector, a 2 x 2 matrix, a comorbidity table, or a data.frame/matrix of data")} }} nvar <- dim(x)[2] if (dim(x)[1] == nvar) {result <- tetrac(x,correct=correct)} else { result <- tetra.mat(x,correct=correct)} result$Call <- cl class(result) <- c("psych","tetra") return(result) } #does the work "biserialc" <- function(x,y,i,j) { cc <- complete.cases(x,y) x <- x[cc] y <- y[cc] yf <- as.factor(y) lev <- levels(yf) if(length(lev)!=2) {#stop("y is not a dichotomous variable") warning("For x = ",i, " y = ", j, " y is not dichotomous") r <- NA} else { ty <- table(y) tot <- sum(ty) tab <- ty/tot if(length(tab) < 2) {r <- NA warning("For x = ",i, " y = ", j, " no variance for y r set to NA")} else { #this treats the case of no variance in the dichotmous variable zp <- dnorm(qnorm(tab[2])) hi <- mean(x[y==lev[2]],na.rm=TRUE) lo <- mean(x[y==lev[1]],na.ram=TRUE) # r <- (hi - lo)*sqrt(prod(tab))/(sd(x,na.rm=TRUE)) #point biserial r <- (hi - lo)*(prod(tab))/(zp * sd(x,na.rm=TRUE)) if(!is.na(r) && abs(r) >1 ) { if (r > 1) {r <- 1 } else {r <- -1} #in case we are correlating a dichotomous variable with itself warning("For x = ",i, " y = ", j, " x seems to be dichotomous, not continuous") }}} return(r) } "biserial" <- function(x,y) { x <- as.matrix(x,drop=FALSE) y <- as.matrix(y,drop=FALSE) nx <- dim(x)[2] ny <- dim(y)[2] if(is.null(nx)) nx <- 1 if(is.null(ny)) ny <- 1 mat <- matrix(NaN,nrow=ny,ncol=nx) colnames(mat) <- colnames(x) rownames(mat) <- colnames(y) #cat("\n Finding the biserial correlations\n") for(i in 1:ny) { progressBar(i*(i-1)/2,ny^2/2,"Biserial") for (j in 1:nx) { mat[i,j] <- biserialc(x[,j],y[,i],j,i) }} flush(stdout()) cat("\n" ) #put in to clear the progress bar return(mat) } "polyserial" <- function(x,y) { # y <- matrix(y) min.item <- min(y, na.rm = TRUE) max.item <- max(y, na.rm = TRUE) if(is.null(ncol(y))) {n.var <- 1 n.cases <- length(y) } else {n.var <- ncol(y) n.cases <- dim(y)[1]} dummy <- matrix(rep(min.item:max.item, n.var), ncol = n.var) colnames(dummy) <- names(y) xdum <- rbind(y, dummy) frequency <- apply(xdum, 2, table) frequency <- t(frequency - 1) responses <- rowSums(frequency) frequency <- frequency/responses frequency <- t(apply(frequency,1,cumsum)) len <- dim(frequency)[2] tau <- dnorm(qnorm(frequency[,-len,drop=FALSE])) stau <- rowSums(tau) rxy <- cor(x,y,use="pairwise") sdy <- apply(y,2,sd,na.rm=TRUE) rps <- t(rxy) * sqrt((n.cases-1)/n.cases) * sdy/stau rps[rps > 1.0] <- 1.0 rps[rps < -1.0] <- -1.0 return(rps) } #modified November 28, 2014 to be slightly more aggressive about smoothing #modified January 18l, 2023 to smooth covariances as well #this is more similar to cov2cor(nearPD$mat) "cor.smooth" <- function(x,eig.tol=10^-12) { if(!isCorrelation(x) & !isCovariance(x)) {x <- cor(x,use="pairwise") message("Pearson correlations of the raw data were found")} eigens <- try(eigen(x),TRUE) if(inherits(eigens, as.character("try-error"))) {warning('I am sorry, there is something seriously wrong with the correlation matrix,\ncor.smooth failed to smooth it because some of the eigen values are NA. \nAre you sure you specified the data correctly?') } else { if(min(eigens$values) < .Machine$double.eps) {warning("Matrix was not positive definite, smoothing was done") #eigens$values[eigens$values < .Machine$double.eps] <- 100 * .Machine$double.eps eigens$values[eigens$values < eig.tol] <- 100 * eig.tol nvar <- dim(x)[1] tot <- sum(eigens$values) eigens$values <- eigens$values * nvar/tot cnames <- colnames(x) rnames <- rownames(x) x <- eigens$vectors %*% diag(eigens$values) %*% t(eigens$vectors) x <- cov2cor(x) colnames(x) <- cnames rownames(x) <- rnames} } return(x)} #modified January 9, 2012 to add the try so we don't fail (just complain) if the data are bad. #identify the most likely candidates for a bad item "cor.smoother" <- function(x,cut=.01) { nvar <- ncol(x) result <- list() if(nrow(x) != nvar) x <- cor(x,use="pairwise") bad <- rep(NA,nvar) good <- rep(TRUE,nvar) names(good) <- names(bad) <- colnames(x) for (i in 1:nvar) { ev <- eigen(x[-i,-i])$values if(any(ev < 0) ) {bad[i] <- TRUE good[i] <- FALSE} bad[i] <- sum((ev < 0),na.rm=TRUE) } if(sum(bad+0) > 0 ) {result$good <- bad[(bad > 0)] result$bad <- good[good] s <- cor.smooth(x) possible <- arrayInd(which.max(abs(s-x)),dim(x),.dimnames=colnames(x)) result$likely <- colnames(x)[possible] result$possible <- arrayInd(which(abs(s-x) > cut),dim(x),.dimnames=colnames(x)) result$possible <- sort(table(colnames(x)[result$possible]),decreasing=TRUE) } else {result$bad <- c("all ok")} class(result) <- c("psych","smoother") return(result) } tableVeryFast <- function(x,y){ #just takes 0,1 data #maxxy <- 4 #(maxx+(minx==0))*(maxy+(minx==0)) bin <- x + y*2+ 1 dims=c(2 ,2) ans <- matrix(tabulate(bin,4),dims) ans }