## Anscombe's Quartet example
## Ref: Anscombe, F. J. (1973). American Statistician 27 (1): 17–21.
## In this short practical we will build on the basics you have learnt so far
## and create a salient visualisation of the quartet
## (Salient = something that reveals patterns succinctly and clearly)
## my advice is to familiarise yourself with the commands
## and don't feel you have to remember everything
## people have to look things up all the time!!!
## there are also lots of different ways to do the same things
### Remember! a question will appear like this ???
### the code following a question might not work
### you may need to change something..
#____________________________________________________________________________________________
# 1 # In this first section we prepare the data
# You can move through this quickly and onto section 2
# here are the vectors for the raw data
x1 <- c( 10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5)
y1 <- c( 8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84,4.82, 5.68)
x2 <- c( 10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5)
y2 <- c( 9.14, 8.14, 8.74, 8.77, 9.26, 8.1, 6.13, 3.1, 9.13, 7.26, 4.74)
x3 <- c( 10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5)
y3 <- c( 7.46, 6.77, 12.74,7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73)
x4 <- c( 8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8)
y4 <- c( 6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.5, 5.56, 7.91, 6.89)
# we can double check 'the statistics' to double check anscombe was right (he was)
mean(x1); mean(x2); mean(x3); mean(x4)
var(x1); var(x2); var(x3); var(x4)
mean(y1); mean(y2); mean(y3); mean(y4)
var(y1); var(y2); var(y3); var(y4)
cor(x1, y1); cor(x2, y2); cor(x3, y3); cor(x4, y4)
# to create a data frame we firstly combine the x's in a vector called 'xs'
xs <- c(x1, x2, x3, x4)
# then we combine the y's in a vector called 'ys'
ys <- c(y1, y2, y3, y4)
# and create a label for each cegory
cat <- c(rep(1, length(x1)), rep(2, length(x2)), rep(3, length(x3)), rep(4, length(x4)) )
# finally we bind the columns together and create a data frame
anscTemp <- cbind(xs, ys, cat)
ansc <- as.data.frame( anscTemp )
### can you combine the above code into one line?
# familiarise yourself with the data set
summary(ansc)
#____________________________________________________________________________________________
# 2 # We will now get onto the more intersting task of visualising the data
# the most simple thing we can do is visualise the data frame
# this plots every variable by each other
plot( ansc )
# we can also pick a specific variable
# without any other information, R will plot that variable in order
plot( ansc$xs ) # plots the x's in order that they appear
plot( ansc$ys ) # plots the y's in order that they appear
# or we could plot two variable against each other in a 'scatterplot'
plot( ansc$xs, ansc$ys ) # this plots x's againts y's
# as we have seen, there are a lot of types of plots
# type "l", "b", "h"... and "n"
plot( ansc$xs, ansc$ys , type = "h" ) # type "h" gives histogram style lines
# ??????????????????????
### what is type 'n' ???
#_ we can use 'par' to overlay >1 plot type? E.g. type "p" then "h"
plot( ansc$xs, ansc$ys , col="black", type = "h")
par( new=T )
plot( ansc$xs, ansc$ys , col="red")
#____________________________________________________________________________________________
# 3 # 'par' can control a huge range of features, only some of which we will use here
# Firstly we will ask for more than one plot in the graphics
par( mfrow=c(1,2)) # here we are stating that the graphics panel has 1 row and 2 columns
plot( ansc$xs, ansc$ys, type ='p')
plot( ansc$xs, ansc$ys, type ='l')
# this is really useful as we can see the last thing we drew and compare plots
par( mfcol=c(1,2))
plot( ansc$xs, ansc$ys, type ='p')
plot( ansc$xs, ansc$ys, type ='p', pch = 19, col= "hotpink" , las= 1)
# remember 'pch' changes the point type
# 'col' controls the colour
# and we have use 'las' to change the rotation of the text
# there are lots of types of plot function, such as for histograms
# par can (and should!) be used to set or query graphical parameters.
?par
# ?????????????????????????????????????????????????????????????????
### can you find an argument in 'par' and change the above plot ???
hist (ansc$xs)
?hist # look at the defaults in each function you use,
# e.g. the breaks in the histogram are v important when you create your own
# ????????????????????????????????????????????????????????????
### can you alter the sizes of the 'bins' in the histogram ???
# some plot functions require specific inputs such as a matrix
persp( as.matrix( ansc ) ) # butonly use 3d when its deifintley useful!
#_ back to our anscombe's quartet plot...
#____________________________________________________________________________________________
# 4 # To plot a single category we just need to select that data.
# This choose category '4'
plot( ansc$xs[cat==4], ansc$ys[cat==4] , col="black", type = "p", pch =19)
# to plot all 4 simultaneously we need to alter the layout in par
par(mfrow=c(2,2))
plot( ansc$xs[cat==1], ansc$ys[cat==1] , col="black", type = "p", pch =19)
plot( ansc$xs[cat==2], ansc$ys[cat==2] , col="black", type = "p", pch =19)
plot( ansc$xs[cat==3], ansc$ys[cat==3] , col="black", type = "p", pch =19)
plot( ansc$xs[cat==4], ansc$ys[cat==4] , col="black", type = "p", pch =19)
# we could also be economical and use a for loop
par(mfrow=c(2,2))
for(i in 1:4){
plot( ansc$xs[cat==i], ansc$ys[cat==i] ,
col="black", type = "p", pch =19)
}
# if you haven't seen a for loop before...
# then the number called 'i' loops through all whole values in the brackets
# from 1 to 4 in this case
# the code in side the {curly brackets} is then run for every value of i
# we can use the 'i' variable to set the colour, point type and the title
par(mfrow=c(2,2))
for(i in 1:4){
plot( ansc$xs[cat==i], ansc$ys[cat==i] ,
col=i, type = "p", pch =i, main =paste("cat=", i))
}
# or 'i' could be used to index a list
# for example if we want cutom colours
myCols <- c( colours()[585], colours()[124], colours()[370], colours()[577] )
# colours can be specified in a wide range of ways
# for instance by using the inbuilt list called... 'colours()'
par(mfrow=c(2,2))
for(i in 1:4){
plot( ansc$xs[cat==i], ansc$ys[cat==i] , col=myCols[i], type = "p", pch =i, main =paste("cat=", i))
}
# note that using the for loop makes alot of things easier...
# changing 4 lines of code is time consuming and encourages bugs (mistakes)
# we will now add all the data in the background in a lighter grey
par(mfrow=c(2,2))
for(i in 1:4){
plot( ansc$xs[cat!=i], ansc$ys[cat!=i] , col="grey80", type = "p", pch =19, main =paste("cat=", i))
points( ansc$xs[cat==i], ansc$ys[cat==i] , col=myCols[i], pch=19)
}
# in the first line of code we ask for the data that is not equal to i using '!='
# then we add some points for the data that is equal to i
# this way round the grey points are behind ur focal points
# this looks interesting, but the scales are not all the same
# ???????????????????????????????????????????????????????????????
### can you alter the scale using 'xlim' and 'ylim' in 'plot' ???
par(mfrow=c(2,2))
for(i in 1:4){
plot( ansc$xs[cat!=i], ansc$ys[cat!=i] ,
col="grey80", type = "p", pch =19, main =paste("cat=", i))
points( ansc$xs[cat==i], ansc$ys[cat==i] , col=myCols[i], pch=19)
}
# using 'cex' we could make the main points larger
# and the background points smaller
par(mfrow=c(2,2))
for(i in 1:4){
plot( ansc$xs[cat!=i], ansc$ys[cat!=i] ,
xlim=c( 0, 20 ), ylim=c( 0, 20 ),
col="grey80", type = "p", pch =19, main =paste("cat=", i), cex=0.8) # cex is 1 by default
points( ansc$xs[cat==i], ansc$ys[cat==i] , col=myCols[i], pch=19, cex=1.2)
}
# and we could also add a dashed line using segments (lty=2 is dashed)
# for the line x=y (lwd controls the line width)
# (the first two argumets are x and y for the start and then x and y for the end point)
par(mfrow=c(2,2))
for(i in 1:4){
plot( ansc$xs[cat!=i], ansc$ys[cat!=i] ,
xlim=c( 0, 20 ), ylim=c( 0, 20 ),
col="grey80", type = "p", pch =19, main =paste("cat=", i), cex=0.8)
points( ansc$xs[cat==i], ansc$ys[cat==i] , col=myCols[i], pch=19, cex=1.2)
}
segments (0,0, 100,100, col="grey", lty=2, lwd=0.8) # this is the dashed line
# ??????????????????????????????????????????????????????????????
### can you change this code so every graph has the x=y line ???
### can you make sure it is behind the coloured points ???
#____________________________________________________________________________________________
# 5 # let's finish this plot off more 'professionally'
# to start with let's take control of the height and width
dev.new(width=8, height=2) # can you find out what units these are?
par(mfrow=c(1,4), width=200, height= 100 )
for(i in 1:4){
plot( ansc$xs[cat!=i], ansc$ys[cat!=i] ,
xlim=c( 0, 20 ), ylim=c( 0, 20 ),
col="grey80", type = "p", pch =19, main =paste("cat=", i), cex=0.8)
points( ansc$xs[cat==i], ansc$ys[cat==i] , col=myCols[i], pch=19, cex=1.2)
segments (0,0, 100,100, col="grey", lty=2, lwd=0.8)
}
# the axis labels could be simply 'x' + 'y'
# of course you could do this using 'title'
# and we could take the main title away for now
par(mfrow=c(1,4), width=200, height= 100 )
for(i in 1:4){
plot( ansc$xs[cat!=i], ansc$ys[cat!=i] ,
xlim=c( 0, 20 ), ylim=c( 0, 20 ),
xlab="x", ylab="y",
col="grey80", type = "p", pch =19, main ="", cex=0.8)
points( ansc$xs[cat==i], ansc$ys[cat==i] , col=myCols[i], pch=19, cex=1.2)
segments (0,0, 100,100, col="grey", lty=2, lwd=0.8)
}
# now that we have got rid of a title we can use that spce
# using par and 'mar' we can control the margins of the plot
dev.new(width=8, height=2) # the dimensions of the graphics device
par( mar=c( 4,4,1.5,1.5) ) # the size of the margins around the plot
par(mfrow=c(1,4)) # this sets the panel to 1 row and 4 columns of figures
for(i in 1:4){
plot( ansc$xs[cat!=i], ansc$ys[cat!=i] ,
xlim=c( 0, 20 ), ylim=c( 0, 20 ),
xlab="x", ylab="y",
col="grey80", type = "p", pch =19, main ="", cex=0.8)
points( ansc$xs[cat==i], ansc$ys[cat==i] , col=myCols[i], pch=19, cex=1.2)
segments (0,0, 100,100, col="grey", lty=2, lwd=0.8)
}
# in 'mar' and lots of other R commands the numbers refer to...
# 1= bottom, 2= left, 3= top, 4= right, i.e. clockwise starting at the bottom
# as we removed the title it would make sense to add a label to the plots
# we could use a rectangle as the background to the text
# and then use 'text' for the label
rect(1,16, 4, 19, col=myCols[i], border =F)
# like segments, rect uses two sets of coordinates
# the first two are the bottom left corner, followed by the top right
lbls <- c ("I", "II", "III", "IV" )
# it is easiet to create a list for the labels
text(2.5,17.5, labels =lbls[i], font=2)
# font= 2 is bold
# ?????????????????????????????????????????????????????????????????????
### can you place this in the above code so every panel has a label ???
#____________________________________________________________________________________________
# 6 # We have seen that we can add things to aplot and also take them away
# let us start with a plain plot and see what you can add in
# this code is type 'n' (i.e. no points) and we have asked for no axes
# the labels are just given as blanks
plot(ansc$xs, ansc$ys, type='n', axes=F, xlab="", ylab="", xlim=c(0,20), ylim=c(0,20))
# this plot is blank!
# rememeber that to add an axis you need to state which side it is on
axis(1)
# we can be more specific, for instance if we want to control the gaps
axis(2, at=seq(0,20, by=5))
# we can make these lines a bit thicker using lwd
# and change the text rotation with las
plot(ansc$xs, ansc$ys, type='n', axes=F, xlab="", ylab="", xlim=c(0,20), ylim=c(0,20))
axis(2, at=seq(0,20, by=5), lwd=2, las=1)
# by adding another axis over the top we can create guides between the tickmarks
axis(2, at=seq(0,20, by=1), labels=F )
# if you'd lik a box around the plot then type...
box()
# once you know where to look (?par) there are a host of controls to use
# for instance if you don't like the gaps at the ends of axis
# you can use 'yaxs'
# drawing a paired plot helps show the difference
par(mfrow=c(1,2))
plot(ansc$xs, ansc$ys, type='n', axes=F,
xlab="", ylab="", xlim=c(0,20), ylim=c(0,20),
yaxs='i')
axis(2, at=seq(0,20, by=5), lwd=2, las=1)
axis(2, at=seq(0,20, by=1), labels=F )
box()
plot(ansc$xs, ansc$ys, type='n', axes=F,
xlab="", ylab="", xlim=c(0,20), ylim=c(0,20),
yaxs='r')
axis(2, at=seq(0,20, by=5), lwd=2, las=1)
axis(2, at=seq(0,20, by=1), labels=F )
box()
# there is also a control for x axis
# note that this is set in plot, which defines scales
# everything else (e.g. axes, points, lines...) are defined by the plot
plot(ansc$xs, ansc$ys, type='n', axes=F,
xlab="", ylab="", xlim=c(0,20), ylim=c(0,20),
yaxs='i', xaxs = 'i')
axis(1, at=seq(0,20, by=5), lwd=2, las=1)
axis(1, at=seq(0,20, by=1), labels=F )
axis(2, at=seq(0,20, by=5), lwd=2, las=1)
axis(2, at=seq(0,20, by=1), labels=F )
# and we have seen that 'axis' also takes vectors
# so you can draw ticks where there is data
axis(3, at=ansc$ys, labels=F )
axis(4, at=ansc$xs, labels=F )
#____________________________________________________________________________________________
# 7 # you can add allsorts of things to your plot
# for instance error bars using the previous plot
plot(ansc$xs, ansc$ys, type='n', axes=F, xlab="", ylab="", xlim=c(0,15), ylim=c(0,15),
yaxs='i', xaxs = 'i')
axis(1, at=seq(0,20, by=5), lwd=2, las=1)
axis(1, at=seq(0,20, by=1), labels=F )
axis(2, at=seq(0,20, by=5), lwd=2, las=1)
axis(2, at=seq(0,20, by=1), labels=F )
# lets add some data points first
points( ansc$xs[cat==2], ansc$ys[cat==2], pch=19)
# to create error bars we actually use flat arrows
# which are slight different to points as we need state the start and the end coordinates
arrows( ansc$xs[cat==2], ansc$ys[cat==2]-2, ansc$xs[cat==2], ansc$ys[cat==2]+2 )
# then we need to modify the arrows slightly using the following
arrows( ansc$xs[cat==2], ansc$ys[cat==2]-2, # the start coordinates
ansc$xs[cat==2], ansc$ys[cat==2]+2, # the end coordinates
length = 0.1, # the length of teh arrow heads
angle= 90, # define a flat headed arrow
code=3 # to draw an arrow head at each end
)
# remember I just used +-2 as an example...
# this should be a real confidence estimate!
# ?????????????????????????????????????????????????????????????????
### can you use the cat==3 variable for the confidence estimate ???
### using the x and y for upper and lower confidence estimate ???
# in the end we have a neatly formatted plot
plot(ansc$xs, ansc$ys, type='n', axes=F, xlab="", ylab="", xlim=c(0,15), ylim=c(0,15),
yaxs='i', xaxs = 'i')
points( ansc$xs[cat==2], ansc$ys[cat==2], pch=19)
axis(1, at=seq(0,20, by=5), lwd=2, las=1)
axis(1, at=seq(0,20, by=1), labels=F )
axis(2, at=seq(0,20, by=5), lwd=2, las=1)
axis(2, at=seq(0,20, by=1), labels=F )
arrows( ansc$xs[cat==2], ansc$ys[cat==2]-2, ansc$xs[cat==2], ansc$ys[cat==2]+2,
length = 0.1, angle= 90, code=3)
# ????????????????????????????????????????????????????????????????
### can you add a title and axis labels using the 'title' function
#____________________________________________________________________________________________
# 8 # this really is just the beginning
# from here you know about the main feature of plots
# and you know where to look to find out more
# different packages can be slightly idiosyncratic in what they let you control and how
# each can be a learning experience in itself
# but if the results are worth it then they are worth learning
# to learn about saving your visuals see:
# https://www.stat.berkeley.edu/classes/s133/saving.html
# you can of course save them from the graphics device
# GOOD LUCK WITH R!
# ITS VERY MUCH WORTH THE EFFORT!!