c#developed July 4, 2012 #modified July 9, 2014 to allow polychorics within groups #modified June 2, 2015 to include covariance, pearson, spearman, poly ,etc. in correlations #Fixed March 3, 2017 to not weight empty cells in finding ICCs #some ideas taken from Bliese multilevel package (specifically, the WABA results) #modifed November 3, 2018 to allow a single DV (in case cohen.d is just comparing two groups on one DV) #corrected January 1, 2019 to allow for grouping variables that are characters #March, 2020 Added the ability to weight groups with external weights #Jan 2022, Added the nWg object to count cell sizes within groups #Dec 2023 Added the within group slope object as output #Dec 2023 Added the ability to specify the min.n (minimum group size) #Jan 2024 Added the check for being a tibble "statsBy" <- function (data, group, cors=FALSE, cor="cor", method="pearson", use="pairwise", poly=FALSE, na.rm=TRUE, alpha=.05, minlength=5, weights=NULL, mean.weights=NULL, min.n= 1) { # min size added 12/29/23 cl <- match.call() valid <- function(x) { #count the number of valid cases sum(!is.na(x)) } valid1 <- function(x) { #count the number of valid cases sum(!is.na(x[,1,drop=FALSE] ) ) } #define a function to count pairwise observations pairwise <- function(x) {n <- t(!is.na(x)) %*% (!is.na(x)) n} #get the grouping information if(inherits(data[1], "data.frame",which=TRUE) >1) data <- fix.dplyr(data) #to get around a problem created by dplyr (being a tibble) if(length(group) < NROW(data) ){ #added 01/01/19 to handle the case of non-numeric grouping data if(is.character(group) ) { gr <- which(colnames(data) %in% group) } else {gr <- group} } else {data <- cbind(data,group) group <- "group" gr <- which(colnames(data) %in% group)} z1 <- data[,group] z <- z1 cnames <- colnames(data) for (i in 1:ncol(data)) {if(is.factor(data[,i]) || is.logical(data[,i]) ) { data[,i] <- as.numeric(data[,i])} if (is.character(data[,i])) data[,i] <- as.numeric(as.factor(data[,i])) # colnames(data)[i] <- paste(cnames[i],"*",sep="") } xvals <- list() #find the statistics by group tempcount<- by(data,z,valid1) #this identifies the number of cases per z if(NCOL(tempcount)==1) {tempz <- tempcount[tempcount >= min.n] #but works only for one criteria tt <- z1 %in% names(tempz) data <- data[tt,] z <- z[tt]} if(is.null(mean.weights)){ temp <- by(data,z,colMeans,na.rm=na.rm)} else { #do something clever temp <- list() mean.weights <- mean.weights[tt] temp.data <- cbind(data,mean.weights) loc.mean.weights <- ncol(temp.data) tab.z <- table(z) for(i in 1:length(tab.z)) { temp.data_i <- temp.data[z==i,] temp[[i]] <- apply(temp.data_i,2,weighted.mean,w=temp.data_i[,"mean.weights"],na.rm=TRUE) temp[[i]] <- temp[[i]][-loc.mean.weights] } } # if(is.null(mean.weights)){ temp <- by(data,z,colMeans,na.rm=na.rm)} else { # #do something clever # temp <- by(data,z, FUN,apply(x,weighted.mean(w=mean.weights)))} rowname <- dimnames(temp)[[1]] if(length(dimnames(temp))> 1){ #if we have multiple criteria, we need to name them for (i in 2:length(dimnames(temp))) { rowname <- paste0(rep(rowname,each=length(dimnames(temp)[[i]])),"-",dimnames(temp)[[i]])} } rownn <- lapply(temp,is.null) #drop rows without values if(sum(as.integer(rownn)) > 0) { rowname <- rowname[-which(rownn==TRUE)] } # look for missing criteria xvals$mean <- t(matrix(unlist(temp),nrow=ncol(data))) xvals$sd <-t(matrix(unlist(by(data,z,function(x) sapply(x,sd,na.rm=na.rm))),nrow=ncol(data))) xvals$n <- t(matrix(unlist(by(data,z,function(x) sapply(x,valid))),nrow=ncol(data))) colnames(xvals$mean) <- colnames(xvals$sd) <- colnames(xvals$n) <- colnames(data) rownames(xvals$mean) <- rownames(xvals$sd) <- rownames(xvals$n) <- rowname # nH <- harmonic.mean(xvals$n) #this will be 0 if any is zero -- but is not used anyway nG <- colSums(!is.na(xvals$mean)) #fixed this so it is just for the cells that are not NA GM <- colSums(xvals$mean*xvals$n,na.rm=na.rm)/colSums(xvals$n,na.rm=na.rm) MSb <- colSums(xvals$n*t((t(xvals$mean) - GM)^2),na.rm=na.rm)/(nG-1) #weight means by n MSw <- colSums(xvals$sd^2*(xvals$n-1*(xvals$n>0)),na.rm=na.rm)/(colSums(xvals$n-1*(xvals$n>0)))#find the pooled sd #fix this for 0 cell size xvals$F <- MSb/MSw N <- colSums(xvals$n) #overall N for each variable npr <- (colSums(xvals$n-1*(xvals$n > 0))+colSums(xvals$n >0))/(colSums(xvals$n >0)) xvals$ICC1 <- (MSb-MSw)/(MSb + MSw*(npr-1)) xvals$ICC2 <- (MSb-MSw)/(MSb) #now, figure out the cis for the ICCs #taken from the ICC function F11 <- MSb/MSw #df11n <- n.obs-1 #df11d <- n.obs*(nj-1) df11n <- nG - 1 df11d <- nG* (npr-1) # p11 <- 1-pf(F11,df11n,df11d) p11 <- -expm1(pf(F11,df11n,df11d,log.p=TRUE)) #F21 <- MSB/MSE df21n <- N - 1 df21d <- N * (npr-1) # p21 <- 1-pf(F21,df21n,df21d) # F31 <- F21 F1L <- F11 / qf(1-alpha/2,df11n,df11d) F1U <- F11 * qf(1-alpha/2,df11d,df11n) L1 <- (F1L-1)/(F1L+(npr-1)) U1 <- (F1U -1)/(F1U+(npr-1)) L2 <- 1-1/F1L U2 <- 1 -1/F1U xvals$ci1 <-as.matrix(data.frame(L1 = L1,U1 = U1) ) xvals$ci2 <- as.matrix(data.frame(L2 = L2,U2 = U2) ) nWg<- NULL #if we don't find within group correlations, make this NULL #if we want within group correlations, then find them # if(cors) {if(!poly) { r <- by(data,z,function(x) cor(x[-gr],use="pairwise",method=method)) } else { r <- by(data,z,function(x) polychoric(x[-gr])$rho)} #added 02/06/15 if(cors) {if (poly) {cor <- "poly"} switch(cor, cor = {r <- by(data,z,function(x) cor(x[-gr],use=use,method=method))}, cov = {r <- by(data,z,function(x) cov(x[-gr],use=use)) covar <- TRUE}, tet = {r <- by(data,z,function(x) tetrachoric(x[-gr])$rho)}, poly = {r <- by(data,z,function(x) polychoric(x[-gr])$rho)}, mixed = {r <- by(data,z,function(x) mixedCor(x[-gr])$rho)} ) nWg <- by(data,z,function(x) pairwise(x[-gr])) #drop the group nvars <- ncol(r[[1]]) xvals$r <- r #store them as square matrices length.r <- length(r) # xvals$r.ci <- r xvals$r.ci <- mapply(cor.Ci, r=r, n=nWg) attributes(xvals$r.ci) <- attributes(r) lower <- lapply(r,function(x) if(!is.null(x)){ x[lower.tri(x)]}) xvals$within <- t(matrix(unlist(lower),nrow=nvars*(nvars-1)/2)) #string them out as well cnR <- abbreviate(cnames[-gr],minlength=minlength) k <- 1 colnames(xvals$within) <- paste("V",1:ncol(xvals$within)) for(i in 1:(nvars-1)) {for (j in (i+1):nvars) { colnames(xvals$within)[k] <- paste(cnR[i],cnR[j],sep="-") k<- k +1 }} # rownames(xvals$within) <- paste0("z",names(xvals$r)) rownames(xvals$within) <- rowname if(is.null(weights)) { wt <- by(data,z,function(x) pairwiseCount(x[-gr])) #the normal way is to weight by sample size lower.wt <- t(matrix(unlist(lapply(wt,function(x) if(!is.null(x)) { x[lower.tri(x)]}) ) ,nrow=nvars*(nvars-1)/2)) } else { # wt <- matrix(rep(weights,nvars),ncol=nvars) lower.wt <- matrix(rep(weights, nvars * (nvars-1)/2),ncol= nvars * (nvars-1)/2) } #weights are specified in the call lower.wt <- t(t(lower.wt)/colSums(lower.wt,na.rm=TRUE)) #converting to fractions #changed 03/03/20 to weight fisher Z transformed correlations pool <- colSums(fisherz2r( lower.wt * fisherz(xvals$within)),na.rm=TRUE) pool.sd <- apply(xvals$within, 2,FUN=sd, na.rm=TRUE) xvals$pooled <- matrix(0,nvars,nvars) xvals$pooled[lower.tri(xvals$pooled)] <- pool xvals$pooled <- xvals$pooled + t(xvals$pooled) #changed, May 12 to properly reflect values diag(xvals$pooled) <- 1 xvals$sd.r <- matrix(NaN,nvars,nvars) xvals$sd.r[lower.tri(xvals$sd.r)] <- pool.sd xvals$sd.r[upper.tri(xvals$sd.r)] <- pool.sd colnames(xvals$pooled) <- rownames (xvals$pooled) <- cnames[-gr] } nvar <- ncol(data)-length(group) #we have dropped the grouping variable # if(!poly) {xvals$raw <- cor(data,use="pairwise",method=method)} else {xvals$raw <- polychoric(data)$rho} ##added 02/06/15 if (poly) cor <- "poly" switch(cor, cor = {xvals$raw <- cor(data,use=use,method=method)}, cov = {xvals$raw <- cov(data,use=use) covar <- TRUE}, poly= {xvals$raw <- polychoric(data)$rho}, tet = {xvals$raw <- tetrachoric(data)$rho}, mixed = {xvals$raw <- mixedCor(data)$rho} ) new.data <- as.matrix( merge(xvals$mean,data,by=group,suffixes =c(".bg",""))) #drop the grouping variable(s) new.data <- new.data[,(length(group)+1):ncol(new.data)] diffs <- new.data[,(nvar+1):ncol(new.data),drop=FALSE] - new.data[,1:nvar] #Difference of raw data within group - within group mean #why don't we just use scale for the data within each group? colnames(diffs) <- paste(colnames(new.data)[(nvar + 1):ncol(new.data)], ".wg", sep = "") if(nvar > 1) {xvals$rbg <- cor(new.data[,1:nvar],use="pairwise",method=method)} else {xvals$rbg <- NA} #the between group (means) nBg <- pairwise(xvals$mean)[-gr,-gr] if(any(nBg > 2)) {xvals$ci.bg <-cor.Ci(xvals$rbg,nBg,alpha=alpha,minlength=minlength)} else {xvals.ci.bg=NA} #added the test for all nBg <= 2 so we don't call cor.Ci and throw an error 11/2/19 #t <- rep(NA,(length(nG)-length(gr))) if(all(nG > 2)) { t <- (xvals$rbg*sqrt(nG[-gr]-2))/sqrt(1-xvals$rbg^2) # if(any(nG < 3) ) {#warning("Number of groups must be at least 2") xvals$pbg <- 2*(1 - pt(abs(t),(nG-2))) } else { t <- xvals$pbg <- NA } # xvals$rwg <- cor(diffs,use="pairwise",method=method) #the within group (differences) if(cor %in% c("tet","poly","mixed","mixedCor","mixed.cor") ) cor <- "cor" switch(cor, cor = {xvals$rwg <- cor(diffs,use=use,method=method)}, cov = {xvals$rwg <- cov(diffs,use=use) covar <- TRUE } ) #end of switch xvals$nw <- pairwise(diffs) rwg <- cov2cor(xvals$rwg) t <- (rwg*sqrt(xvals$nw -2))/sqrt(1-rwg^2) #changed to greater than 1, May 20, 2020 if(any(NCOL(rwg) > 1)) {xvals$ci.wg <- cor.Ci(rwg,xvals$nw,alpha=alpha,minlength=minlength)} else {xvals.ci.wg=NA} #added the test for all nBg <= 2 so we don't call cor.Ci and throw an error 11/2/19 # xvals$ci.wg <- cor.Ci(rwg,xvals$nw,alpha=alpha,minlength=minlength) #this is a local function if(all(nG > 2)) { xvals$pwg <- 2*(1 - pt(abs(t),(N[-gr] - nG[-gr] -2)))} else {xvals$pwg <- NA} # colnames(xvals$rwg) <- rownames(xvals$rwg) <- paste(colnames(xvals$rwg),".wg",sep="") xvals$etabg <- diag(cor(new.data[,1:(nvar)],new.data[,(nvar+1):ncol(new.data)],use="pairwise",method=method) )#the means with the data xvals$etawg <- diag(cor(new.data[,(nvar+1):ncol(new.data)],diffs,use="pairwise",method=method)) #the deviations and the data names(xvals$etabg) <- colnames(xvals$rbg) xvals$nWg <- nWg #this is the pairwise number of subjects xvals$nwg <- N - nG #why do we report this? Am I making a mistake xvals$nG <- nG #number of groups if(!is.null(xvals$r)) {xvals$slope <- lapply(xvals$r, function(x) x/diag(x)) ncors <- NCOL(xvals$r[[1]]) * (NCOL(xvals$r[[1]])-1)/2 xvals$slope <- matrix(unlist(lapply(xvals$slope, function(x) x[lower.tri(x), drop=FALSE])),byrow=TRUE,ncol=ncors) colnames(xvals$slope) <- colnames(xvals$within) } else { xvals$slope <- NULL} xvals$Call <- cl statsBy <- xvals class(statsBy) <- c("psych","statsBy") return(statsBy) } cor.Ci <- function(r,n,alpha=.05,minlength=10) { cl <- match.call() z.r <- fisherz(r) sd <-1/sqrt(n-3) nvar <- NCOL(r) ci.lower <- fisherz2r(z.r +( qnorm(alpha/2) * sd)) ci.upper <- fisherz2r(z.r + (qnorm(1-alpha/2) * sd)) cnR <- abbreviate(colnames(r),minlength=minlength) ci.lower <- ci.lower[lower.tri(ci.lower)] ci.upper <- ci.upper[lower.tri(ci.upper)] r.ci <- data.frame(lower=ci.lower,r=r[lower.tri(r)],upper=ci.upper) k <- 1 for(i in 1:(nvar-1)) {for (j in (i+1):nvar) { rownames(r.ci)[k] <- paste(cnR[i],cnR[j],sep="-") k <- k + 1 }} result <-list(r.ci=r.ci) class(result) <- cs(psych,corCi ) return(result) }