#SAPA Analysis July 2007 #first do the IQ data set iq <- read.clipboard.csv() dim(iq) #total number of records [1] 39996 70 #note this is two more than the sapa.melissa.r.txt analysis because of SATV and SATQ dupli <- duplicated(iq$RID)+0 #finds people with duplicate Random ID numbers iq.nodups <- iq[dupli==0,] #gets rid of the second or more duplicate response iq.clean <- subset(iq.nodups,iq.nodups$age<100) #gets rid of fake data iq.clean <- subset(iq.clean,iq.clean$age>10) #gets rid of fake data weird.iq <- iq.clean==7 #identify those items that are 7 weird.iq <- weird.iq + 0 #convert F/T to 0/1 bad.iq <- rowSums(weird.iq,na.rm=TRUE) #count how many 1's each person has (they should have 0) iq.very.clean <- iq.clean[bad.iq==0,] #just get those with good scores new.iq <- iq.very.clean #make a simpler name dim(new.iq) #[1] 38458 70 iq.items <- new.iq[,15:70] #everybody keys <- c(4,3,4,4,2,3,6,4,4,3,1,2,5,2,1,4,4,5,6,4,5,1,5,2,3,5,2,1,2,3,1,1,3,4,2,3,4,2,3,5,3,5,4,3,5,2,2,4,3,5,3,2,3,1,4,5) iq.items[iq.items==0] <- NA #not responding does not count as getting it wrong scored <- t(t(iq.items)==keys[]) #scores as t/f scored <- scored + 0 #converts t/f to 1/0 #iq <-new.iq iq <- data.frame(new.iq[,2:14],scored) #note that this is dropping the user id and adding the scores iq$SAT[iq$SAT<400] <- NA iq$SAT[iq$SAT>1600] <- NA iq$ACT[iq$ACT>36] <- NA iq$ACT[iq$ACT<1] <- NA iq$SATV[iq$SATV < 200] <- NA iq$SATV[iq$SATV > 800] <- NA iq$SATQ[iq$SATQ < 200] <- NA iq$SATQ[iq$SATQ > 800] <- NA dim(iq) dim(iq) [1] 38446 69 > names(iq) [1] "RID" "no_code" "study_number" "country" "gender" "race" "education" "major" "age" "SAT" "ACT" "SATV" [13] "SATQ" "iq1" "iq2" "iq3" "iq4" "iq5" "iq6" "iq7" "iq8" "iq9" "iq10" "iq11" [25] "iq12" "iq13" "iq14" "iq15" "iq16" "iq17" "iq18" "iq19" "iq20" "iq21" "iq22" "iq23" [37] "iq24" "iq25" "iq26" "iq27" "iq28" "iq29" "iq30" "iq31" "iq32" "iq33" "iq34" "iq35" [49] "iq36" "iq37" "iq38" "iq39" "iq40" "iq41" "iq42" "iq43" "iq44" "iq45" "iq46" "iq47" [61] "iq48" "iq49" "iq50" "iq51" "iq52" "iq53" "iq54" "iq55" "iq56" pairs.panels(iq[,5:13]) round(cor(iq[,5:13],use="pairwise"),2) describe(iq[,5:13]) var n mean sd median mad min max range skew kurtosis se gender 1 38446 1.74 0.44 2 0.00 1 2 1 -1.08 -0.83 0.00 race 2 30322 13.52 3.90 15 0.00 1 16 15 -2.47 4.66 0.02 education 3 38446 2.94 1.55 3 1.48 0 5 5 -0.52 -0.64 0.01 major 4 30322 147.07 130.03 123 182.36 0 344 344 0.22 -1.57 0.75 age 5 38446 27.93 11.46 24 8.90 11 99 88 1.20 0.98 0.06 SAT 6 3378 1193.79 214.84 1200 207.56 400 1600 1200 -0.54 0.71 3.70 ACT 7 8281 25.60 4.92 26 5.93 1 36 35 -0.17 0.11 0.05 SATV 8 2594 610.12 107.26 610 118.61 200 800 600 -0.40 0.00 2.11 SATQ 9 2559 603.54 107.92 600 118.61 200 800 600 -0.42 0.07 2.13 > round(cor(iq[,5:13],use="pairwise"),2) gender race education major age SAT ACT SATV SATQ gender 1.00 0.02 0.07 0.09 0.04 -0.08 -0.02 0.03 -0.12 race 0.02 1.00 0.04 -0.01 0.10 0.08 0.19 0.09 0.09 education 0.07 0.04 1.00 0.42 0.44 0.02 0.16 0.10 0.07 major 0.09 -0.01 0.42 1.00 0.06 -0.07 -0.04 0.03 -0.02 age 0.04 0.10 0.44 0.06 1.00 -0.04 0.12 0.04 0.01 SAT -0.08 0.08 0.02 -0.07 -0.04 1.00 0.65 NA NA ACT -0.02 0.19 0.16 -0.04 0.12 0.65 1.00 0.56 0.58 SATV 0.03 0.09 0.10 0.03 0.04 NA 0.56 1.00 0.59 SATQ -0.12 0.09 0.07 -0.02 0.01 NA 0.58 0.59 1.00 describe.by(iq[,5:13],iq$gender) group: 1 var n mean sd median mad min max range skew kurtosis se gender 1 10082 1.00 0.00 1 0.00 1 1 0 NaN NaN 0.00 race 2 7259 13.36 4.07 15 0.00 1 16 15 -2.29 3.75 0.05 education 3 10082 2.74 1.65 3 1.48 0 5 5 -0.34 -0.95 0.02 major 4 7259 127.02 123.33 86 127.50 0 344 344 0.49 -1.30 1.45 age 5 10082 27.26 11.92 23 7.41 11 99 88 1.36 1.48 0.12 SAT 6 1357 1213.79 222.81 1210 207.56 400 1600 1200 -0.68 0.74 6.05 ACT 7 2091 25.79 5.32 26 5.93 1 36 35 -0.42 0.68 0.12 SATV 8 913 605.81 109.86 600 118.61 200 800 600 -0.47 0.13 3.64 SATQ 9 910 620.50 110.80 620 118.61 200 800 600 -0.56 0.24 3.67 ------------------------------------------------------------------------------------------------------------------------------------------- group: 2 var n mean sd median mad min max range skew kurtosis se gender 1 28364 2.00 0.00 2 0.00 2 2 0 NaN NaN 0.00 race 2 23063 13.57 3.85 15 0.00 1 16 15 -2.53 4.98 0.03 education 3 28364 3.00 1.51 3 1.48 0 5 5 -0.57 -0.50 0.01 major 4 23063 153.38 131.44 133 197.19 0 344 344 0.14 -1.62 0.87 age 5 28364 28.17 11.28 25 8.90 11 99 88 1.14 0.78 0.07 SAT 6 2021 1180.37 208.31 1190 207.56 400 1600 1200 -0.47 0.75 4.63 ACT 7 6190 25.54 4.77 26 4.45 1 36 35 -0.07 -0.20 0.06 SATV 8 1681 612.46 105.78 610 118.61 200 800 600 -0.35 -0.11 2.58 SATQ 9 1649 594.18 105.17 600 118.61 200 800 600 -0.38 0.03 2.59 iq.r <- cor(scored,use="pairwise") #find the correlation matrix #how many factors are in the iq data? fa.parallel(iq.r,n.obs=5000) #suggests 12 factors based on eigen valules iq.vss <- VSS(iq.r) VSS.plot(iq.vss) #this suggests a general factor or 3 factors of complexity 2 #do a hierarchical factor extraction to get g + groups iq.factors <- omega(iq.r) #if we do not drop the missing responses we get a better fit iq.factors $omega [1] 0.6697603 $alpha [1] 0.871291 iq.factors #with items that are skipped not being counted $omega [1] 0.500802 $alpha [1] 0.8125815 iq.labels <- scan("",what=list("","","")) 1 iq_1 math/pattern 2 iq_2 math/pattern 3 iq_3 math/pattern 4 iq_4 math/pattern 5 iq_5 pattern 6 iq_6 pattern 7 iq_7 pattern 8 iq_8 class/general 9 iq_9 class/general 10 iq_10 verbal 11 iq_11 verbal 12 iq_12 verbal/logic 13 iq_13 math 14 iq_14 math 15 iq_15 logic 16 iq_16 logic 17 iq_17 math 18 iq_18 general 19 iq_19 general 20 iq_20 general 21 iq_21 verbal 22 iq_22 logic 23 iq_23 class/general 24 iq_24 logic 25 iq_25 class/general 26 iq_26 verbal 27 iq_27 logic 28 iq_28 class/general 29 iq_29 class/general 30 iq_30 verbal 31 iq_31 logic 32 iq_32 general 33 iq_33 pattern 34 iq_34 pattern 35 iq_35 pattern 36 iq_36 verbal 37 iq_37 pattern 38 iq_38 logic 39 iq_39 general 40 iq_40 class 41 iq_41 general 42 iq_42 general 43 iq_43 matrix 44 iq_44 matrix 45 iq_45 matrix 46 iq_46 matrix 47 iq_47 matrix 48 iq_48 matrix 49 iq_49 matrix 50 iq_50 matrix 51 iq_51 matrix 52 iq_52 matrix 53 iq_53 matrix 54 iq_54 matrix 55 iq_55 matrix 56 iq_56 matrix iq.names <- iq.labels[3] iq.names <- data.frame(iq.labels[3]) colnames(iq.names) <- "content" iq.graph <- file.choose() omega.graph(iq.factors,sl=FALSE,out.file=iq.graph,labels=iq.names) #has a bug in cex? iq.clust <- ICLUST(iq.r) iq.clusters <- factor2cluster(iq.factors$schmid$sl[,2:4],loading=FALSE) ICLUST.graph(iq.clust,out.file=iq.graph) round(factor.congruence(iq.factors$schmid$sl,iq.clust$loadings,loading=FALSE),2) C45 C52 C53 g factor -0.06 1.00 0.82 PA1 -0.03 0.84 0.71 PA2 0.11 0.69 0.50 PA3 -0.04 0.70 0.69 h2 -0.03 0.98 0.77 u2 0.12 0.88 0.83 iq.clusters <- factor2cluster(iq.factors$schmid$oblique,loading=FALSE,cut=.3) t(iq.clusters) %*% iq.clusters PA1 PA2 PA3 PA1 11 0 0 PA2 0 12 0 PA3 0 0 8 round(factor.congruence(iq.factors$schmid$oblique,iq.clust$loadings,loading=FALSE),2) C45 C52 C53 PA1 -0.09 0.78 0.67 PA2 0.14 0.58 0.29 PA3 0.13 -0.64 -0.64 round(factor.congruence(iq.factors$schmid$orthog,iq.clust$loadings,loading=FALSE),2) C45 C52 C53 PA1 -0.08 0.94 0.78 PA2 0.08 0.79 0.51 PA3 0.12 -0.86 -0.79 round(factor.congruence(iq.factors$schmid$orthog,iq.factors$schmid$oblique,loading=FALSE),2) PA1 PA2 PA3 PA1 0.95 0.39 -0.46 PA2 0.38 0.96 -0.32 PA3 -0.55 -0.27 0.94 round(factor.congruence(iq.factors$schmid$orthog,iq.factors$schmid$sl,loading=FALSE),2) g factor PA1 PA2 PA3 h2 u2 PA1 0.95 0.96 0.52 0.53 0.90 0.80 PA2 0.75 0.50 0.95 0.37 0.82 0.70 PA3 -0.87 -0.61 -0.41 -0.95 -0.82 -0.79 > round(factor.congruence(iq.factors$schmid$oblique,iq.factors$schmid$sl,loading=FALSE),2) g factor PA1 PA2 PA3 h2 u2 PA1 0.80 0.96 0.30 0.33 0.73 0.64 PA2 0.53 0.29 0.94 0.14 0.63 0.51 PA3 -0.65 -0.32 -0.22 -0.97 -0.60 -0.61 iq.scored <- data.frame(iq[,5:13],scored) race <- iq.scored[,2] bw <- race bw[bw>1] <- 0 bw[race==15] <- 2 bw[bw==0] <- NA table(bw) 1 2 1685 24499 #note this analysis was incorrect because I did not drop the 0 responses from the iq items #when done correctly, the values go up iq.scored<- data.frame(bw,iq.scored) iqs.r <- cor(iq.scored,use="pairwise") iq.keys <- read.clipboard() iq.scales <- cluster.cor(iq.keys,iqs.r) iq.scales $sd bw gender education age SAT ACT SATV SATQ PA1 PA2 PA3 g C52 sexkey vqkey 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 6.11 6.12 4.39 13.22 18.78 6.75 7.74 $corrected bw gender education age SAT ACT SATV SATQ PA1 PA2 PA3 g C52 sexkey vqkey bw 1.00 0.01 0.04 0.06 0.15 0.24 0.13 0.16 0.19 0.12 0.09 0.16 0.16 -0.08 -0.11 gender 0.01 1.00 0.07 0.04 -0.08 -0.02 0.03 -0.12 -0.08 0.13 -0.05 0.01 -0.01 0.39 0.22 education 0.04 0.07 1.00 0.44 0.02 0.16 0.10 0.07 0.29 0.08 0.17 0.21 0.22 -0.25 -0.19 age 0.06 0.04 0.44 1.00 -0.04 0.12 0.04 0.01 0.17 0.07 0.04 0.12 0.11 -0.10 0.02 SAT 0.15 -0.08 0.02 -0.04 1.00 0.65 NA NA 0.37 0.05 0.22 0.25 0.30 -0.48 -0.44 ACT 0.24 -0.02 0.16 0.12 0.65 1.00 0.56 0.58 0.50 0.14 0.26 0.36 0.38 -0.45 -0.39 SATV 0.13 0.03 0.10 0.04 NA 0.56 1.00 0.59 0.34 0.13 0.10 0.23 0.26 -0.26 -0.14 SATQ 0.16 -0.12 0.07 0.01 NA 0.58 0.59 1.00 0.42 0.09 0.22 0.29 0.33 -0.54 -0.57 PA1 0.17 -0.07 0.26 0.15 0.33 0.44 0.30 0.37 0.78 0.62 0.69 1.04 1.02 -0.77 -0.98 PA2 0.10 0.11 0.07 0.06 0.04 0.12 0.11 0.07 0.47 0.74 0.50 1.00 0.90 0.55 -0.10 PA3 0.07 -0.04 0.14 0.04 0.18 0.21 0.08 0.18 0.49 0.35 0.67 0.96 0.93 -0.69 -1.10 g 0.15 0.01 0.20 0.11 0.23 0.33 0.22 0.27 0.84 0.80 0.72 0.85 1.11 -0.31 -0.80 C52 0.15 -0.01 0.20 0.10 0.28 0.36 0.25 0.31 0.84 0.73 0.71 0.96 0.88 -0.40 -0.88 sexkey -0.05 0.26 -0.17 -0.06 -0.32 -0.30 -0.18 -0.36 -0.45 0.32 -0.38 -0.19 -0.25 0.45 1.49 vqkey -0.08 0.15 -0.13 0.01 -0.31 -0.27 -0.10 -0.39 -0.60 -0.06 -0.63 -0.51 -0.57 0.69 0.48 $size bw gender education age SAT ACT SATV SATQ PA1 PA2 PA3 g C52 sexkey vqkey 1 1 1 1 1 1 1 1 11 12 8 31 49 26 32 #these are now the correct values #keys are taken from omega as well as ICLUST $corrected bw gender education age SAT ACT SATV SATQ sexkey vqkey PA1 PA2 PA3 C1 C2 cg PA1.1 PA2.1 PA3.1 g bw 1.00 0.01 0.04 0.06 0.15 0.24 0.13 0.16 -0.08 -0.11 0.20 0.05 0.10 0.18 0.09 0.16 0.22 0.00 0.09 0.19 gender 0.01 1.00 0.07 0.04 -0.08 -0.02 0.03 -0.12 0.38 0.23 -0.06 0.10 -0.08 -0.06 -0.06 -0.06 -0.06 -0.06 -0.07 -0.08 education 0.04 0.07 1.00 0.44 0.02 0.16 0.10 0.07 -0.24 -0.20 0.32 0.08 0.19 0.29 0.20 0.28 0.33 0.11 0.20 0.32 age 0.06 0.04 0.44 1.00 -0.04 0.12 0.04 0.01 -0.11 0.01 0.18 -0.01 0.03 0.14 0.05 0.12 0.20 0.07 0.05 0.17 SAT 0.15 -0.08 0.02 -0.04 1.00 0.65 NA NA -0.47 -0.46 0.44 0.29 0.25 0.38 0.25 0.36 0.42 0.23 0.26 0.42 ACT 0.24 -0.02 0.16 0.12 0.65 1.00 0.56 0.58 -0.43 -0.39 0.51 0.24 0.28 0.46 0.31 0.44 0.53 0.15 0.29 0.50 SATV 0.13 0.03 0.10 0.04 NA 0.56 1.00 0.59 -0.27 -0.15 0.39 0.27 0.12 0.31 0.12 0.26 0.39 0.17 0.10 0.33 SATQ 0.16 -0.12 0.07 0.01 NA 0.58 0.59 1.00 -0.54 -0.58 0.47 0.31 0.24 0.45 0.26 0.41 0.44 0.18 0.24 0.43 sexkey -0.06 0.26 -0.17 -0.08 -0.32 -0.30 -0.19 -0.37 0.48 1.51 -0.71 0.45 -0.79 -0.74 -0.72 -0.80 -0.73 -0.45 -0.80 -0.86 vqkey -0.08 0.16 -0.14 0.01 -0.32 -0.27 -0.11 -0.41 0.73 0.49 -0.91 -0.23 -1.02 -1.03 -1.08 -1.14 -0.86 -0.18 -1.11 -1.03 PA1 0.18 -0.05 0.28 0.16 0.40 0.46 0.35 0.43 -0.44 -0.58 0.81 0.54 0.68 1.15 0.72 1.08 1.16 0.15 0.68 1.10 PA2 0.02 0.04 0.04 -0.01 0.13 0.11 0.12 0.14 0.14 -0.07 0.22 0.21 0.44 0.45 0.46 0.49 0.51 0.83 0.46 0.62 PA3 0.08 -0.06 0.14 0.03 0.19 0.21 0.09 0.18 -0.41 -0.54 0.47 0.15 0.57 0.68 1.45 1.04 0.64 0.15 1.56 1.03 C1 0.16 -0.05 0.25 0.12 0.33 0.40 0.27 0.39 -0.44 -0.63 0.90 0.18 0.45 0.75 0.69 1.19 1.17 0.12 0.67 1.10 C2 0.07 -0.05 0.16 0.04 0.20 0.25 0.09 0.21 -0.40 -0.61 0.52 0.17 0.88 0.48 0.64 1.09 0.66 0.13 1.50 1.02 cg 0.14 -0.06 0.25 0.10 0.32 0.39 0.23 0.37 -0.49 -0.72 0.87 0.20 0.71 0.92 0.78 0.80 1.07 0.14 1.06 1.16 PA1.1 0.19 -0.05 0.29 0.18 0.37 0.47 0.34 0.39 -0.44 -0.54 0.93 0.21 0.43 0.90 0.47 0.85 0.79 0.16 0.63 1.16 PA2.1 0.00 -0.06 0.11 0.07 0.23 0.15 0.17 0.18 -0.31 -0.13 0.13 0.38 0.12 0.11 0.11 0.12 0.14 1.00 0.15 0.27 PA3.1 0.07 -0.06 0.16 0.04 0.20 0.23 0.08 0.19 -0.44 -0.62 0.49 0.17 0.93 0.46 0.95 0.75 0.44 0.12 0.63 1.03 g 0.17 -0.07 0.29 0.15 0.38 0.45 0.30 0.38 -0.53 -0.65 0.89 0.25 0.70 0.86 0.74 0.93 0.93 0.25 0.74 0.81 $size bw gender education age SAT ACT SATV SATQ sexkey vqkey PA1 PA2 PA3 C1 C2 cg 1 1 1 1 1 1 1 1 26 32 33 12 11 14 8 22 PA1.1 PA2.1 PA3.1 g 16 1 8 25 iq.rid <- data.frame(RID=iq$RID,iq.scored) ################### ipip <- tony dim(ipip) [1] 40233 213 ipip <- ipip.clean[,-c(10,9,5,4,3,1)] ipip[ipip==0] <- NA ipip.r <- cor(ipip,use="pairwise") #this will have some missing values tony.music <- read.clipboard.csv() #but this did not get the music names so music <- tony.music #colnames(tony.music) <- c("SID", "RID",paste("x",1:16,sep=""),paste("m",1:60,sep="") ) d1 <- describe(music) #basic descriptives d12<- d1[19:78,] d12 <- describe(music[,19:78]) #just the items stats music.items <- music[,19:78] #just the music items music.items[music.items ==0] <- NA music.knowledge <- rowSums((music.items>1)+0,na.rm=TRUE) #this counts the number of music items a person knows music.items[music.items==1] <- NA # music == 1 => you don't know the genre d2 <- describe(music.items) #compare this one to the previous to detect how many don't know diff.num <- d12 - d2 # how many people don't know about this genre diff.ratio <- diff.num/d12 #express this as a ratio to number who answered this item round(diff.ratio$n,2) round(diff.ratio$n,2) #this measures who did not respond #[1] 0.02 0.03 0.03 0.03 0.03 0.04 0.03 0.03 0.03 0.04 0.03 0.02 0.04 0.03 0.03 0.02 0.04 0.03 0.03 0.03 0.04 0.03 0.03 0.03 0.03 0.03 0.03 0.03 0.03 0.03 0.02 [32] 0.03 0.03 0.03 0.03 0.03 0.04 0.03 0.03 0.03 0.04 0.03 0.03 0.03 0.03 0.02 0.03 0.03 0.03 0.03 0.04 0.03 0.03 0.03 0.04 0.03 0.02 0.02 0.02 0.02 #this measures who did not know about it, but did respond as a 1 d2 <- describe(music.items) #compare this one to the previous to detect how many don't know > diff.num <- d12 - d2 # how many people don't know about this genre > diff.ratio <- diff.num/d12 #express this as a ratio to number who answered this item > round(diff.ratio$n,2) [1] 0.03 0.07 0.12 0.05 0.07 0.07 0.07 0.11 0.22 0.26 0.46 0.04 0.16 0.14 0.44 0.09 0.14 0.30 0.33 0.14 0.19 0.23 0.14 0.39 0.12 0.34 0.23 0.23 0.19 0.07 0.28 [32] 0.20 0.08 0.07 0.13 0.15 0.07 0.06 0.04 0.09 0.09 0.14 0.07 0.07 0.06 0.08 0.17 0.10 0.23 0.16 0.18 0.14 0.17 0.17 0.11 0.20 0.05 0.06 0.06 0.07 new.music <- data.frame(RID = music[,2],music.items,know=music.knowledge) music <- new.music #just to make the rest easier to understand iq.final <- data.frame(RID=iq$RID,iq.scored) iq.music<- merge(iq.final,music,by="RID") dim(iq.music) [1] 39933 128 ipip.final <- data.frame(RID=ipip.clean$RID,ipip.clean[,c(12:210)]) ipip.final[ipip.final==0] <- NA iq.music.ipip <- merge(iq.music,ipip.final,by="RID") dim(ipip.final) [1] 38586 204 dim(music.iq) [1] 39933 117 > dim(iq.final) [1] 38446 57 > dim(music) [1] 40589 61 iq [1] 39933 320 names(ipip.music.iq) all.r <- cor(iq.music.ipip[,2:327],use="pairwise") music.vss <- VSS(all.r[67:126,67:126],n.obs=5000) VSS.plot(music.vss) #four factors of complexity 2 music.vss <- VSS(all.r[67:126,67:126]) #4 factors of complextity 2 music.cluster <- ICLUST(all.r[67:126,67:126]) # 5 clusters but two big ones $purified$corrected C53 C30 C3 C42 C55 C53 0.77 0.23 0.12 0.37 0.28 C30 0.16 0.66 0.25 0.18 0.24 C3 0.09 0.18 0.77 -0.07 0.36 C42 0.26 0.12 -0.05 0.66 0.34 C55 0.23 0.18 0.30 0.26 0.88 $purified$size C53 C30 C3 C42 C55 11 5 3 4 37 music.clust <- music.cluster$clusters music.clusterall <- ICLUST(iq.music.ipip[,68:127]) molden items are 289 - 298 molden.clus <- ICLUST(all.r[289:298,289:298])