Simulations of distributions

The central limit theorem is perhaps the most important concept in statistics. For any distribution with finite mean and standard deviation, samples taken from that population will tend towards a normal distribution around the mean of the population as sample size increases. Furthermore, as sample size increases, the variation of the sample means will decrease.

The following examples use the R stats program to show this graphically. The first example uses a uniform (rectangular) distribution. An example of this case is of a single die with the values of 1-6. The second example is of two dice with totals ranging from 2-12. Notice that although one die produces a rectangular distribution, two dice show a distribution peaking at 7. The next set of examples show the distribution of sample means for samples of size 1 .. 32 taken from a rectangular distribution.

This figure was produced using the following R code.


#distributions of a single six sided die
#generate a uniform random distribution from min to max

numcases <- 10000                           #how many cases to generate
min <- 1                                  #set parameters
max <- 6
x <- as.integer(runif(numcases,min,max+1) )        #generate random uniform numcases numbers from min to max
                                        #as.integer truncates, round converts to integers, add .5 for equal intervals 
par(mfrow=c(2,1))                        #stack two figures above each other
hist(x,main=paste( numcases," roles of a single die"),breaks=seq(min-.5,max+.5,1))          #show the histogram
boxplot(x, horizontal=TRUE,range=1)              # and the boxplot
title("boxplot of a uniform random distribution")
#end of first demo

Distribution of two dice

Distribution of two dice. The sum of two dice is not rectangular, but is peaked at the middle (hint, how many ways can you get a 2, a 3, ... a 7, .. 12.).
The following R code produced this figure.


#generate a uniform random distribution from min to max for numcases samples of size 2
numcases <- 10000                            #how many cases to generate
min <- 0                                    #set parameters
max <- 6
x <- round(runif(numcases,min,max)+.5)+round(runif(numcases,min,max)+.5)
par(mfrow=c(2,1))                        #stack two figures above each other
hist(x,breaks=seq(1.5,12.5),main=paste( numcases," roles of a pair of dice"))          #show the histogram                                 
boxplot(x, horizontal=TRUE,range=1)      # and the boxplot
title("boxplot of samples of size two taken from a uniform random distribution")

#end of second demo

Samples from a continuous uniform random distribution

We can generalize the case of 1 or two dice to the case of samples of varying size taken from a continuous distribution ranging from 0-1. This next simulation shows the distribution of samples of sizes 1, 2, 4, ... 32 taken from a uniform distribution. Note, for each sample, we are finding the average value of the sample, rather than the sum as we were doing in the case of the dice.

##show distribution of sample means of varying size samples 
  numcases <- 10000               #how many samples to take?
  min <- 0                        #lowest value
  max <- 1
  ntimes <- 6
 op<-  par(mfrow=c(ntimes,1))        #stack ntimes graphs on top of each other
 i2 <- 1                     #initialize counters
 
  for (i in 1:ntimes)               #repeat n times
   {  sample=rep(0,numcases)        #create a vector  
    k=0                #start off with an empty set of counters
    for (j in 1:i2)          #  inner loop 
    {
  		sample <- sample +runif(numcases,min,max) 
     	 k <- k+1  }
  x <- sample/k
  out <- c(k,mean(x),sd(x))
  #print(out,digit=3)
  hist(x, xlim=range(0,1),prob=T ,main=paste( "samples of size",  k ),col="black")
  i2 <- 2*i2
  }    #end of i loop

  
  
 #do the same thing, but this time show a boxplot 
  numcases <- 10000               #how many samples to take?
  min <- 0                        #lowest value
  max <- 1
  ntimes <- 6
  par(mfrow=c(ntimes,1))            #stack ntimes graphs on top of each other
 i2 <- 1                     #initialize counters
 
  for (i in 1:ntimes)               #repeat 5 times
   {  sample <- 0 ; k <- 0                #start off with an empty set of counters
    for (j in 1:i2)          #  inner loop 
    {
  		sample <- sample +runif(numcases,min,max) 
     	 k <- k+1  }
  x <- sample/k
  out <- c(k,mean(x),sd(x))
 boxplot(x, horizontal=TRUE, ylim=c(0,1), range=1,notch=T)              # and the boxplot
  i2 <- 2*i2}
  

#Demonstration of the effect of sample size on distributions
#each sample is then replicated numcases times 
  filename <- "distributions2.pdf"
pdf(filename,width=6, height=9) 

numcases <- 10000               #how many samples to take?
  min <- 0                        #lowest value
  max <- 1
  ntimes <- 3
  par(mfrow=c(2*ntimes,1))            #stack ntimes graphs on top of each other
 i2 <- 1                     #initialize counters
 
  for (i in 1:ntimes)               #repeat 5 times
   {  sample <- 0 ; k <- 0                #start off with an empty set of counters
    for (j in 1:i2)          #  inner loop 
    {
  		sample <- sample +runif(numcases,min,max) 
     	 k <- k+1  }
  x <- sample/k
  out <- c(k,mean(x),sd(x))
  print(out,digit=3)
  hist(x, xlim=range(0,1),prob=T ,main=paste( "samples of size",  k ))
 boxplot(x, horizontal=TRUE, ylim=c(0,1), range=1,notch=T)              # and the boxplotl
 titles("Samples of sise ",
  i2 <- 8*i2}
  
    dev.off()
#ÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄ  
#choose a different type of distribution (binomial)
filename <- "distributions3.pdf"
pdf(filename,width=6, height=9) 

  numcases <- 1000               #how big is the population
  min <- 0                        #lowest value
  max <- 1
  ntimes <- 6
  par(mfrow=c(ntimes,1))            #stack ntimes graphs on top of each other
 i2 <- 1                     #initialize counters
 
  for (i in 1:ntimes)               #repeat 5 times
   {  sample <- 0 ; k <- 0                #start off with an empty set of counters
    for (j in 1:i2)          #  inner loop 
    {
  		sample <- sample +  rbinom(numcases, 1, .5)
     	 k <- k+1  }
  x <- sample/k
  out <- c(k,mean(x),sd(x))
  #print(out,digit=3)
  hist(x, xlim=range(0,1),prob=T ,main=paste( "samples of size",  k ))
# boxplot(x, horizontal=TRUE, ylim=c(0,1), range=1,notch=T)              # and the boxplot
  i2 <- 2*i2}
  
   dev.off()
   
#######
#choose a different type of distribution (binomial)
filename <- "distributions4.pdf"
pdf(filename,width=6, height=9) 

  numcases <- 1000               #how big is the population
  min <- 0                        #lowest value
  max <- 1
  ntimes <- 6
  samplesize <- 2
  prob <- .5
  par(mfrow=c(ntimes,1))            #stack ntimes graphs on top of each other
 i2 <- 1                     #initialize counters
 
  for (i in 1:ntimes)               #repeat 5 times
   {  sample <- 0 ; k <- 0                #start off with an empty set of counters
    for (j in 1:i2)          #  inner loop 
    {
  		sample <- sample +  rbinom(numcases, samplesize, prob)            #samples of size 2
     	 k <- k+1  }
  x <- sample/k
  out <- c(k,mean(x),sd(x))
  #print(out,digit=3)
  hist(x, xlim=range(0,samplesize),prob=T ,main=paste(numcases," samples of size",  k," with prob = ", prob))
# boxplot(x, horizontal=TRUE, ylim=c(0,samplesize), range=1,notch=T)              # and the boxplot
  i2 <- 2*i2}
    dev.off()
  
 ######
#choose a different type of distribution (binomial)
filename <- "distributions5.pdf"
pdf(filename,width=6, height=9) 

  numcases <- 1000               #how big is the population
  min <- 0                        #lowest value
  max <- 1
  ntimes <- 6
  samplesize <- 10
  prob <- .2
  par(mfrow=c(ntimes,1))            #stack ntimes graphs on top of each other
 i2 <- 1                     #initialize counters
 
  for (i in 1:ntimes)               #repeat 5 times
   {  sample <- 0 ; k <- 0                #start off with an empty set of counters
    for (j in 1:i2)          #  inner loop 
    {
  		sample <- sample +  rbinom(numcases, samplesize, prob)            #samples of size 
     	 k <- k+1  }
  x <- sample/k
  out <- c(k,mean(x),sd(x))
  #print(out,digit=3)
  hist(x, xlim=range(0,samplesize),prob=T ,main=paste( numcases," samples of size",  k," with prob = ", prob))
# boxplot(x, horizontal=TRUE, ylim=c(0,samplesize), range=1,notch=T)              # and the boxplot
  i2 <- 2*i2}
 
  dev.off()
  
  
 ########

#choose a different type of distribution (binomial)

# Set some parameters
  numcases <- 1000               #how big is the population
  ntimes <- 6
  samplesize <- 1                 #initial sample size
  prob <- .5                      #probability of outcome
  par(mfrow <- c(ntimes,1))       #stack ntimes graphs on top of each other
  i2 <- 1                     #initialize counters
 
 
  for (i in 1:ntimes)               #repeat ntimes
   
    {                               #begin the loop
    	x <- rbinom(numcases, samplesize, prob)            #samples of size 
    	x <- x/samplesize               #normalize to the 0-1 range
     	out <- c(samplesize,mean(x),sd(x))
  		print(out,digit=3)
 		 hist(x, xlim=range(0,1),prob=T ,main=paste( numcases," samples of size",  samplesize ," with prob = ", prob))
# boxplot(x, horizontal=TRUE, ylim=c(0,samplesize), range=1,notch=T)              # and the boxplot
  samplesize <- samplesize*2         #double the sample size
  }                               #end of loop
 

  
Some simple measures of central tendency
 #######
  x=c(1,2,4,8,16,32,64)    #enter the x data
  summary(x)
  boxplot(x)
  
  
  x <- c(1,2,4,8,16,32,64)    #enter the x data
  y <- c(10,11,12,13,14,15,16) #enter the y data                  
  data <- data.frame(x,y)      #make a dataframe
  data                      #show the data
  summary(data)             #descriptive stats
  boxplot(x,y)              #same as boxplot(data)
  
  
   

part of a short guide to R
Version of April 1, 2005
William Revelle Department of Psychology
Northwestern University