# Lines that are not preceeded by a "#" can # be pasted in to R at the ">" prompt. # "#" is R's symbol for "comment out what follows on # the current line" # # One way to use this script is to interactively # paste each command into R's command line. # # Before you start, create a directory called: # c:/learnR/ (on a PC) # or ~/learnR on linux/unix # put the files test1.dat, test2.dat, test3.dat, test4.dat, cheese.dat, and # dots.R into that directory. # They can be found at http://www.math.umass.edu/~jstauden/dots.R, etc. # # The script is not meant to be an exhaustive overview # of R. Rather, it is intended to give you a start # with some tools. Further exploration is encouraged... # # Please send feedback on this tutorial to jstauden@math.umass.edu # # # F U N D A M E N T A L S O F R # # Based on a # A Tutorial by Matt Wand # Harvard School of Public Health # (Modified by John Staudenmayer) # # OUTLINE # 1. FUNCTIONS # * Basic syntax # * Optional arguments # * Construction of complex # defaults using the missing() # function. # # 2. DOCUMENTATION # * Use of on-line help # # 3. NUTS AND BOLTS # * How to exit R # * How to manage your programs # * How to create a function object. # * The file structure of R # # 4. DATA STRUCTURES # * arrays # * matrices # * multi-dimensional arrays # * lists # * data frames # # 5. INPUT AND OUTPUT # * reading data from a file # * putting data into a file # # 6. GRAPHICS # * Ordinary plots # * Use of the par() function # * Lines, points and polygons # * Surface plots and image plots and coplots. # * Some other plots # * Obtaining color Postcript files # # 7. DATA MANIPULATION # * rep(), aa[2:4], aa[-2] etc # * rev() # * extracting with respect to a condition. # * apply() and lapply() # * matrix operations # * sorting with respect to columns # including secondary sorting. # # # # 1. FUNCTIONS # # Here is a function that finds the average of a # set of numbers (paste the the text until HERE into the command window), # in general the function is: # functionname <- function(args) #{ # start of function # # the work of the function... # # return(funciton output) #} # end of function # average <- function(x) { n <- length(x) # <- or = can be used as the assignment operator in R sumval <- 0 for (i in 1:n) { sumval <- sumval + x[i] } ans <- sumval/n return(ans) } # A simpler version of average() is: average <- function(x) { n <- length(x) return(sum(x)/n) } # An even more concise version of average is: average <- function(x) return(sum(x)/length(x)) # To "use" the function, we creat and x and give it to the function: x <- runif(20) # 20 variates from a U(0,1) distibution x average(x) # We will give an example of an optional argument. convert <- function(dept.name,univ.name="UMass",year="2000",start.date) { # Here is an example of how to do a more complicated default # assignment. if (missing(start.date)) { if (univ.name=="UMass") start.date <- "early September" if (univ.name=="Tufts") start.date <- "late August" if (univ.name=="MIT") start.date <- "mid September" } if(dept.name=="BIO") full.dept.name <- "Biostatistics" # This is an example of a comment. # Because there is a # at the left, # it is not read by R. Note # that we use == to test equality # two objects. if(dept.name=="EPI") full.dept.name <- "Epidemiology" if(dept.name=="EH") full.dept.name <- "Environmental Health" full.name <- paste(full.dept.name,"at",univ.name,"in",year, "starts in",start.date) return(full.name) } # try the function: convert("EPI","MIT") # We don't need to respect the order in the argument # list if we use the argument name: # e.g. convert(year="1984",dept.name="EPI") # 2.DOCUMENTATION # A help system can be started with # help.start() in R. # Help on a specific function can be obtained # by typing ?functionname e.g. ?glm # When searching a function to perform # a specific task, it is sometimes worthwhile # to look at the help file for a function # that does a related task and read the # "SEE ALSO" part. # 3. NUTS AND BOLTS # To exit R, the appropriate function is # Don't paste the following unless you want to quit q() # Everything to the right of the # character is ignored, # and hence is used for comments. # Programs can be written in text files and "sourced" into R. # For instance, go to http://www.math.umass.edu/~jstauden/dots.R # and save the text as c:/learnR/dots.R. source("c:/learnR/dots.R") # You can also open the file dots.R to see what it does. # You'll learn many of the commands later, but note the structure: # the script both creates a function and calls it. #NOTE: when creating the file dots.R # it is important to have a carriage # return at the end of the last line # of text. # To remove an object (e.g. bigmatrix) during an R # session type: rm(bigmatrix) # 4. DATA STRUCTURES # Arrays # ------ # To construct an array "x" # containing the numbers # 8, 1, 11 and -4 # type: x <- c(8,1,11,-4) # The function "c" (for concatenate?) combines # objects into an array. x1 <- c(3,26) x2 <- c(8,12,-79) x3 <- c(x1,x2) x3 # Here is an array of character strings: strings <- c("bill","judy") strings # Entries of an array are referenced through # their index using []'s. # e.g. strings[2] # Here's another example: x3 x3[4] # To reference the entries of # x3 between 2 and 4 use: x3[2:4] # In fact, if "a" and "b" and integers # then a <- 2 b <- 5 a:b # contructs an array of integers # between a and b. # Matrices # -------- # The easiest way to create a matrix is to # use the function cbind(): ("column bind") # (guess what rbind() does...) # e.g. col1 <- c(3,7,1) col2 <- c(2,0,11) A <- cbind(col1,col2) A # Another way is to use the function matrix(): # # e.g. r <- runif(15) r rmat <- matrix(r,5,3) rmat # Note that, by default, it fills rmat by # columns. To fill by rows one would need to # type rmat <- matrix(r,5,3,byrow=T) # This gives: rmat # note also that there is a transpose command t(rmat) # matrix addition: A+A # matrix multiplication b <- runif(3) b rmat %*% b # note how b is treated as a column vector # by default, the outcome is a matrix. # to make it a vector: as.vector(rmat%*%b) # following is "elementwise" b*b # We reference entries of matrices through # statements such as: rmat[4,2] # The 2nd column of "rmat" is referenced # through: rmat[,2] # 1st row: rmat[1,] # Note that when you pick out a column or a row # R converts to an ordinary array, not # a matrix. If you really want a matrix you # need to use the function "as.matrix()" rmat.2 <- as.matrix(rmat[,2]) rmat.2 # note also: is.matrix(rmat[,2]) is.vector(rmat[,2]) is.matrix(rmat.2) is.vector(rmat.2) # Multidimensional arrays # ----------------------- # The idea of indices can be # extended from 2 (for a matrix) # to several. # e.g. u <- rnorm(8) #random standard normal u uarray <- array(u,c(2,2,2)) uarray[1,2,1] #and the full 2 by 2 by 2 array is: uarray # Lists # ----- # To construct a list consisting of names, # Id number and building code for the IEQ # study we could type (as a simple example), IEQ.names <- c("Ann Taylor","David Sedaris", "Ira Glass") idnums <- c(49011,43911,78452) blg.num <- c(5,5,2,6) IEQ.cohort <- list(name=IEQ.names,id=idnums, blg=blg.num) IEQ.cohort IEQ.cohort[[1]] IEQ.cohort[[1]][2:3] # Another example: # We have 3 teams of workers, A, B and C A.group <- c("Bill","John","Betty") B.group <- c("George","Bob") C.group <- c("Harry","Kevin","Jill","Judy") myworkers <- list(A=A.group,B=B.group,C=C.group) D.group <- list(full.timers=c("Ken","Jerry"), part.timers=c("Fred","Kerry")) my.new.workers <- list(A=A.group,B=B.group, C=C.group,D=D.group) my.new.workers my.new.workers$A my.new.workers[[1]] my.new.workers$D$part.timers[2] # To construct a list containing names, exam # scores and SSN's of students in the class: # Sclass <- list(class.names=class.names, # hw.scores=hw.scores,SSN=SSN) # To reference a component of the list we use the # $ sign with the name of the component: # e.g. IEQ.cohort$name IEQ.cohort$name[3] # Often we don't have meaningful names for # the components of our list. # e.g. if we had the matrices u1 <- matrix(runif(4),2,2) u2 <- matrix(runif(4),2,2) # and then formed the list: ulist <- list(u1,u2) # then we reference the first # component of "ulist" using ulist[[1]] # Data frame # ---------- # You can think of a data frame as being # like a matrix, but it has more structure # in that you can, for example, put # labels on the rows and columns. # An example is: # asydreal # region Num92 Ave92 Num93 Ave93 # Abbotsford 3 8 373 10 350 # Alexandria 2 37 169 25 178 # Allawah 4 8 168 11 186 # Annandale 3 86 231 77 237 # Arncliffe 4 18 183 29 194 # Artarmon 1 37 314 32 317 # Ashbury 3 6 207 14 208 # Ashfield 3 42 189 48 231 # Auburn 5 19 128 31 137 # Avalon 1 20 401 32 380 # which has row labels corresponding to suburbs # of Sydney, Australia, and column labels corresponding # to certain real estate variables. # We will now show how to construct such a data # frame. For simplicity we will just do the # first 2 columns. # First create the columns: col1 <- c(3,2,4,2,4,1,3,3,5,1) col2 <- c(8,37,8,86,18,37,6,42,19,20) # Now combine these columns into a matrix: a2mat <- cbind(col1,col2) # This gives: a2mat # Now convert the matrix to a data frame using # the function "as.data.frame()": a2sydreal <- as.data.frame(a2mat) # A faster way to do this is a2sydreal <- data.frame(col1,col2) #This gives: a2sydreal # Finally, we stick labels on the rows and columns. # The row and column names are accessed through # the function "dimnames()". For the current # version of "a2sydreal", try: dimnames(a2sydreal) #You should see: #> dimnames(a2sydreal) #[[1]]: # [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" # #[[2]]: #[1] "col1" "col2" # We see from this that "dimnames(a2sydreal)" is a list # where the first component is an array of character strings # containing each of the row labels, and the second component # is an array of character strings containing each of the column # labels. # To make the labels correspond to the suburbs and real estate #terms, first obtain an array containing the row names: rows <- c("Abbotsford","Alexandria","Allawah", "Annandale","Arncliffe", "Artarmon","Ashbury","Ashfield", "Auburn","Avalon") # Now type: dimnames(a2sydreal)[[1]] <- rows # For the column names we could just type: dimnames(a2sydreal)[[2]] <- c("region","Num92") # now look at a2sydreal # a fully labelled data frame! # We can reference, for example, the "Num92" data # by typing: a2sydreal$Num92 # 5. INPUT AND OUTPUT # The most basic function for reading in data from # a file is "scan()". #For example, the file "test1.dat" looks like: # 8 3 4 5 1 3 4 5 6 3 1 32 2 # 8 9 4 # 3 4 1 # 3 4 # # 4 # # 5 # 6 6 # download this file from http://www.math.umass.edu/~jstauden/test1.dat # and save it to c:/learnR/test1.dat # If we type: data <- scan("c:/learnR/test1.dat") # we get: data # If, instead, test1.dat had looked like: # file containing test data # 8 3 4 5 1 3 4 5 6 3 1 32 2 # 8 9 4 # 3 4 1 # 3 4 # # 4 # # 5 # 6 6 # then R will complain about the non-numbers at # the top. We then need to look at some options of # the scan() function to overcome this problem. # In this case we could say: # data <- scan("c:/learnR/test1.dat",skip=1) # Now consider data such as: # alex 4 6 9 f # joan 3 4 6 f # fred 4 6 9 m # stored in a file called "test2.dat" # (download and save from http://www.math.umass.edu/~jstauden/test2.dat # # To read data like this we need the "what" # argument of scan. data <- scan("~/research/learnR/test2.dat",what=list(character(), numeric(),numeric(),numeric(),character())) # Suppose "test2.dat" had looked like: # alex 4 6 9 f # joan 3 4 6 f # fred 4 6 9 m # charles 7 # The above command doesn't give a data frame since scan() wants the # same number of things in each row. # To overcome this problem we could edit the file to be: # alex 4 6 9 f # joan 3 4 6 f # fred 4 6 9 m # charles 7 NA NA NA # Note that "NA" is R's symbol for a missing data value. # read.table() function # ---------------------- # If the data are in a tabular format then # the function read.table() is usually easier # to use. # (Download and save # http://www.math.umass.edu/~jstauden/test3.dat # http://www.math.umass.edu/~jstauden/test4.dat) # http://www.math.umass.edu/~jstauden/test5.dat) # The file test3.dat looks like: # 4 5 2 # 2 3 4 # 7 8 9 # 2 4 5 # Then data <- read.table("~/research/learnR/test3.dat") # gives a data frame named "data" that # looks like: data # Next, test4.dat looks like: # Boston Washington Concord # 4 5 2 # 2 3 4 # 7 8 9 # 2 4 5 # Then data <- read.table("c:/learnR/test4.dat",header=T) data # Finally, suppose that we had row labels like: # (test5.dat) # Boston Washington Concord # 1994 4 5 2 # 1995 2 3 4 # 1996 7 8 9 # 1997 2 4 5 # Then this can be read in as a correctly labelled # data frame by simply typing: data <- read.table("~/research/learnR/test5.dat") data #also try: row.names(data) names(data) attributes(data) # i.e. R assumes that the first row # corresponds to column names # and that the first column corresponds # to row names. # try ?read.table # to read about separators ("csv", etc) # OUTPUT # The main function for outputting data to # a file is write(). #If we have matrix in R named "data: #that looks like: data <- rbind(c(4,5,2), c(2,3,4), c(7,8,9), c(2,4,5)) # Then to write this to a file called "test.out" # we need to use: write(t(data),"c:/learnR/test.out",ncol=3) # This will lead to a file called "test.out" looking # like the original matrix. # Note here that t(data) corresponds to the # transponse of the matrix "data". We need # to work with the transpose because # write() reads column-wise, but writes # row-wise. # Stuff can be also be output to a file # using the cat() and sink() functions. # cat() prints exactly what is requested. # e.g. cat("ABC") # you got # # ABC> # Note that we haven't even got a new line. # To get a line we need to add "\n", which # signifies a new line. cat("ABC\n") # i.e. a new line is started. # If one says sink("out.dat") # then everything from now onwards # is sent to the file out.dat instead # of the screen. # If I type: # > sink("out.dat") # > cat("ABC\n") # then "ABC" does not appear on the screen, # instead the file out.dat becomes: # ABC # Note that the default output place of sink() # is the screen, so if I now say sink() # then stuff will go back to the screen. # 6. GRAPHICS # Simple plots # ------------ # These are done using the function plot(). # It has two main arguments, x-values and y-values. # The simplest example is a scatterplot. x <- runif(50) y <- 2*x + rnorm(50)*.05 # To obtain a scatterplot I type: plot(x,y) # Plotting has options e.g. plotting character, labels # etc. All these can be tweaked, and the function # that controls this is called par(). Some plotting parameters # are global, so that need to be called using par(), # others are local and can be called in the plot() function. # You need to use the help file for the par() # function rather than the plot() function # to get documentation on plotting. # Suppose we want: # * meaningful labels, # Labels can be done using: plot(x,y,xlab="fertilizer",ylab="log(yield)", main="Crop Experiment One",sub="Corn in Hadley Field 031221a") # Next, suppse we want to add more points to this plot # We do solve by using the "points()" function. # Note that "points()" allows us to add points # to an existing. # Here is an example of points: points(0.1,0.5,pch="M",cex=2) # It sticks a point at position (0.1,0.5), character # being "M", and cex=2 doubles the size. # here's another: plot(c(1,20),c(1,20),type="n") for (i in 1:20) { points(i,i,pch=i,cex=2) #pch = "point type" } # Note that type="n" is an option of plot() that says # do the plot with the frame, axes, etc, but leave # out the actual points etc. # and another plot(c(0,1),c(0,1),type="n") for (i in 1:10) { x.cord <- runif(1) y.cord <- runif(1) points(x.cord,y.cord,col=i,cex=2,pch=16) # col for "color" text(x.cord,y.cord-.05,paste("col = ",i)) } # Note that all of the options e.g. pch and xlab, # and documented in the par, but since they # are local (i.e. specific to just one plot) # then R allows us to use them as arguments # of plot(). # Here is an example of global par() assignment. par(mfrow=c(3,3)) # This says divied our plotting screen # into a 3 by 3 grid for (i in 1:9) { plot(x,y,xlab="fertilizer",ylab="log(yield)", main=paste("Crop Experiment ",i),sub="Corn in Hadley Field 031221a", type="n") points(x,y,pch=ceiling(i/3) ) } # Suppose that we wanted all the frames to be square, # rather than rectangular. Then we could use: par(mfrow=c(3,3),pty="s") for (i in 1:9) { plot(x,y,xlab="fertilizer",ylab="log(yield)", main=paste("Crop Experiment ",i),sub="Corn in Hadley Field 031221a", type="n") points(x,y,pch=ceiling(i/3) ) } # Whenever the plot() function is executed then a new # plot is started. But sometimes we just want to add # stuff to an existing plot. This can be done using # functions such as: # lines() # points() # text() # abline() # legend() # symbols() # polygon() # Let's illustate some of these by adding the least # squares regression line to the plot. This is obtained # using: par(mfrow=c(1,1)) plot(x,y) lsreg <- lm(y~x) lsreg$coef summary(lsreg) # ( feel free to look at lsreg and summary(lsreg) ) # One way to add this to the plot is as follows: # 1. set up a grid of x values: xg <- seq(min(x),max(x),length=25) # 2. Obtain the corresponding y values for the least squares line: yg <- lsreg$coef[1] + lsreg$coef[2]*xg # 3. Now use the lines() funtion to add this to the plot: lines(xg,yg) # To get a thicker line type lines(xg,yg,lwd=4) # A faster way of adding a line to a plot is to use # abline(). plot(x,y) #reset the plot # e.g. abline(lsreg$coef[1],lsreg$coef[2]) will add a line of intercept = lsreg$coef[1] # and slope = lsreg$coef[2] to the plot. # To add a line that is # 1. dashed, and 2. very thick then # use: abline(lsreg$coef[1],lsreg$coef[2],lty=4,lwd=5) # An example of text() is: text(.2,1.5,"least squares line") # This centres the string "least squares line" at position: 0.2,1.5 # To position text manually try something like: text(locator(1),"thick dashed line") # R then waits for you to click the left mouse button # at the position you desire. # Here is an example of the use of polygon(). This # is to draw a pentagon. Note that x.vert and y.vert # contain the x and y values of the vertices. x.vert <- c(60,90,105,75,45) y.vert <- c(2,2,3.5,4.2,3.5) plot(x.vert,y.vert,type="n") polygon(x.vert,y.vert) # Consider a data set with several variables such as # the "cheese" data (cheese.dat): # download: http://www.math.umass.edu/~jstauden/cheese.dat cheese <- read.table("c:/learnR/cheese.dat") # look at the data cheese # A good first plot to try is: pairs(cheese) # which does all pairwise scatterplots. # If there are 3 variables that want to # be compared then there are several # ways of doing this: cheese.reduced <- cheese[,-4] #takes out 4th column # Saving plots # ------------ # When plot window is selected, see menu item: File: Export Graph... # Surface plots # ------------- # R has 3 functions for plotting surfaces: # contour(), persp() and image(). # Setting up the plotting information for a function # like contour() requires some thought. We need to # set up a "mesh" on the plane, and then obtain # function values over that mesh. # e.g. suppose that we wanted to plot the function: # f(x1,x2) = -(x1-0.5)^2 - (x2-0.3)^2 + 0.3*(x1-0.5)*(x2-0.3) # over the unit square. # First we set up a grid in each direction. x1g <- seq(0,1,by=0.1) x2g <- seq(0,1,by=0.1) # Now set up the matrix of function values. mesh <- expand.grid(x1g,x2g) x1m <- mesh[,1] x2m <- mesh[,2] mesh.size <- length(x1g) fun.vals <- -(x1m-0.5)^4 - sin(2*(x2m - 0.3)) fun.vals <- fun.vals + 0.3*(x1m-0.5)*(x2m-0.3)^2 fun.vals <- matrix(fun.vals,mesh.size,mesh.size) # Now we are ready to draw our surface. contour(x1g,x2g,fun.vals) # Another way to look at this surface # is by typing: persp(x1g,x2g,fun.vals) # A final way is through an image plot: image(x1g,x2g,fun.vals) # To get an image plot with finer resolution # use, say, x1g <- seq(0,1,length=64) x2g <- seq(0,1,length=64) mesh <- expand.grid(x1g,x2g) x1m <- mesh[,1] x2m <- mesh[,2] mesh.size <- length(x1g) fun.vals <- -(x1m-0.5)^4 - 5*sin(5*(x2m - 0.3)) fun.vals <- fun.vals + 0.3*(x1m-0.5)*(x2m-0.3)^2 fun.vals <- matrix(fun.vals,mesh.size,mesh.size) image(x1g,x2g,fun.vals) # Another useful way of studying 3 variables is through a coplot, # e.g. coplot(taste~H2S|lactic,data=cheese) # This uses R's "formula language". # This is interpreted as: # show me "taste" versus "H2S", conditional on "lactic". # The latest versions of R use the # function trellis() for doing plots # of this type... # 7. DATA MANIPULATION # Note that aa[-2] refers to all entries of aa except # for the 2nd one. # Also, aa[-c(2,4)] refers to all entries of aa except # for the second and fourths ones. # rep() is used to repeat arrays, rep(c(2,3,6),4) # Another useful one is rev(): rev(c(4,6,1,3,0,9)) # Extracting with respect to a condition # --------------------------------------- # Suppose that we have an array: aa <- c(4,2,7,8,4,5,3,9,1,4,6,9) # We want that part of the array corresponding # to entries that are greater than 5. This # can be obtained by typing: aa[aa>5] # Another example is: aa[(aa>2)&(aa<=8)] # A further example, corresponding to those # entries of aa that are either greater than # 2 or less than or equal to 8 is: aa[(aa>2)|(aa<=8)] # Now supppose we the want the indices of aa # corresponding to entries that are greater than 5. # This can be done by typing: (1:12)[aa>5] # It is also useful to be able to find # those locations where there are missing # data. Suppose that x <- c(3,5,1,2,NA,4,1,NA,9,10) inds.NA <- (1:length(x))[is.na(x)==T] x[inds.NA] x[-inds.NA] # Suppose we have a list and want to apply # a function to each element of the list. # lapply is a function to do this. # For instance: new.list <- list(runif(2),runif(5),runif(10)) new.list lapply(new.list,max) new.list <- list("misha","rick","howard","matt") lapply(new.list,nchar) # A useful companion function to lapply() is unlist() # An illustration of unlist() is: aaa <- lapply(new.list,nchar) aaa unlist(aaa) # The last command gives an array. # more examples log.nchar <- function(string) { return(log(nchar(string))) } lapply(new.list,log.nchar) other.list <- list("dog",4,c(3,4,58),matrix(c(1,4,2,9),2,2)) lapply(other.list,is.numeric) # Matrix operations # ----------------- # A # [,1] [,2] # [1,] 2 9 # [2,] 4 7 # # B # [,1] [,2] [,3] # [1,] 5 1 5 # [2,] 6 2 6 # # The product of A and B is: # # A %*% B # [,1] [,2] [,3] # [1,] 64 20 64 # [2,] 62 18 62 # # # However, # B %*% A # Error in "%*%.default"(B, A): Number of columns of x should be the same as # number of rows of y # Dumped # # # The inverse of a square matrix is found using: # # # > solve(A) # [,1] [,2] # [1,] -0.3181818 0.40909091 # [2,] 0.1818182 -0.09090909 # # # To solve the system of equations: # # 2x + 9y = 3 # 4x + 7y = 7 # # we type: # # solve(A,c(3,7)) # # # [1] 1.90909091 -0.09090909 # # This means that # # x=1.90909091, y= -0.09090909 # # are the solutions. # The eigenvalues and eigenvectors of A can # be found using # # eigen(A) # # $values: # [1] 11 -2 # # $vectors: # [,1] [,2] # [1,] -0.7576044 -0.9138115 # [2,] -0.7576044 0.4061385 # # The determinant of A is the product of # its eigenvalues, so we can find this # by typing: # # prod(eigen(A)$values) # # [1] -22 # # # Sorting with respect to a column of a matrix # -------------------------------------------- # #One way to sort the cheese data #with respect to taste is: order.taste <- order(cheese$taste) new.cheese <- cheese[order.taste,] # One way to get the dimnames correct is to use: dimnames(new.cheese)[[1]] <- as.character(sort(as.numeric(dimnames(new.cheese)[[1]]))) # Suppose we want the sub-matrix of cheese correponding # to values of H2S<5. # # # First get those indices (between 1 and 30) for # which H2S<5. We can do this using: # wanted.row.indices <- (1:nrow(cheese))[cheese$H2S<5] wanted.cheese <- cheese[wanted.row.indices,]