#############################################################################################

R can be used simply as a calculator

2 * 3 + 5 ^ 2 1:3 (1:3 + 5) / 2 1:3 + 4:6

We can use R to assign values to variables and to use the variables for more complex calculations

Calculate the resulting population after 30 years of constant growth at 2% per year

inipop <- 800 g <- 0.02 time <- 30

endpop <- inipop * (1+g)^time endpop

ls() # once you have created some object, you can check what you have in your workspace rm('endpop') # removes certain objects rm(list=ls()) #removes all objects in the current workspace

#############################################################################################

Using functions in R

vector1 <- c(1,2,3,4,5,6,7,8,9) # c() concatenates these numbers to a vector plot(vector1) mean(vector1) # mean() computes the mean of the input vector ?mean example(mean) # shows examples of what can be done with a function help.search('mean')

#############################################################################################

Programming your own function

my.mean <- sum(vector1) / length(vector1) my.fun <- function(x){sum(x)/length(x)} my.fun(vector1)

vector2 <- c(2,4,6,8,10,12) # functions are universally applicable my.fun(vector2)

to return to our initial example

projfun <- function(inipop,g,time){inipop * (1+g)^time} projfun(100,0.02,35) projfun(50,0.01,10)

#############################################################################################

To load further functions not included in the base-distributions, you need to install and load additional packages

install.packages(c('reshape','plyr'))

library(reshape) # this loads the library containing the functions you are interested in require(plyr) # basically the same as library(), however, this one does not report errors

#############################################################################################

Different data types in R

#############################################################################################

-- numeric data --

vector1 mode(vector1) #

vector1 + 4 # basic arithmetic operations can be applied to numeric data ... vector1 * 4 vector1 / 4

vector1sq <- vector1 ^ 2 # square vector1 sqrt(vector1sq) # take square root exp(vector1) # exponential function log(vector1) # log function with Euler base as default #############################################################################################

-- character data --

"asdf123.-4" # everything that's between "" or '' is a called a 'string'; character data is made up of such strings c("asdf123.-4","bdl0320d;./") # we can create vectors of strings educ <- c("Prim", "Ter", "Sec", "Sec", "Ter", "Prim") # we can assign such vectors into objects mode(educ)

It is not possible to calculate with character data, but one can do other things

educ + 4 toupper(educ) # capitalizes all letters tolower('ERICH') substr(educ,start=1,stop=3) # return sub-string ranging from position 1 to 3 substr(educ,1,1) substr(educ,1,1) <- 'X' # can also be used for replacing parts of strings educ

paste(c('a','b','c'),1:3,sep='_') # concatenates vectors after creating character strings paste(c('a','b','c'),1:3,sep='-',collapse=', ') # collapses the vector into one string using the desired separator

agegr <- paste(c(0,16,65),c(15,64,99),sep='-') agegr paste(agegr,collapse=', ') #############################################################################################

-- logical data --

my_logical <- vector1 > 5 # creates a logical vector mode(my_logical)

my_logical + 4 # in arithmetic operations, logical values will be interpreted as 0 or 1 c(1,2,3) * my_logical

logical operators

vector1 < 4 vector1 <= 4 vector1 > 4 vector1 >= 4 vector1 == 4 vector1 != 4 (vector1 > 3) & (vector1 < 8) # which ones are greater than 3 AND smaller than 8 (vector1 < 3) | (vector1 > 8) # which ones are smaller than 3 OR greater than 8

is.element(4,vector1) # Is 4 an element of vector1? 4 %in% vector1 # Same as above c(4,5,15) %in% vector1 # Which one of 4, 5, and 15 can be found in vector1? !c(4,5,15) %in% vector1 # Which of these numbers can NOT be found in vector1? 'b' %in% c('a','a','a','b','a') #############################################################################################

-- factorial data --

sex <- c("male","male","female","male","female") # this is a character object sex <- factor(sex) # this transforms it to a factor levels(sex)

my_factor <- factor(educ) # transforming our character object to a factor my_factor str(my_factor) # this is now a factor with three levels levels(my_factor) # the levels can be accessed separately #############################################################################################

-- missing data --

missings <- c(1,3,NA,2,4,NA,9) is.na(missings) # Which values of this vector are missing values? mean(missings) # many functions have an option to deal with missing values mean(missings,na.rm=T) # should not available numbers be removed?

placeholder <- NULL placeholder[1] <- 'Austria' placeholder[2] <- 'Sweden' placeholder[4] <- 'Russia' placeholder

other ways of getting either missing or infinite values

0/0 1/0 -1/0 #############################################################################################

R objects

#############################################################################################

-- vectors --

vector3 <- c(vector1,vector2) # c() can also be used to 'concatenate' two vectors vector3

vector4 <- c(vector2,vector1) # the order of the vectors inside c() matters vector4

1:100 # creates a sequence from 1 to 9 seq(from=1,to=100,by=1) seq(1,100,1) # if you don't name your function arguments, make sure they are in the right place ?seq seq(1,100,20) seq(1,100,length.out=2)

rep(2,times=5) # creates a vector that reproduces 2 five times rep(seq(1,4),times=2) # reproduces the sequence twice rep(seq(1,4),each=2) # reproduces each element of the sequence twice

#############################################################################################

-- matrices --

x <- c(1:8) dim(x) <- c(2,4) # assigns a certain dimension to a vector and turns it into a matrix x dim(x)

matrix(data=1:15,nrow=3,ncol=5) # creates a matrix with the elements ranging from 1 to 15 ordered in three rows and five columns matrix(data=1:15,nrow=5,ncol=3) # creates a matrix with the elements ranging from 1 to 15 ordered in five rows and three columns matrix(data=1:15,nrow=5,ncol=3,byrow=T) # fills the matrix by row, not by column (default) matrix(1,nrow=5,ncol=3) # creates a matrix with the elements ranging from 1 to 15 ordered in five rows and three columns

diag(1,5,5)

matrix1 <- cbind(vector3,vector4) # column bind ... creates a matrix with two columns and preserves the names matrix1 matrix1 <- cbind(x=vector3,y=vector4) # you can name the columns yourself matrix1 matrix2 <- rbind(vector3,vector4) # row bind ... creates a matrix with two rows matrix2

matrix2 c(1,2) # we can multiply a matrix with a vector matrix1 c(1,2) # first column by first element of the vector, second by second ...

matrix2 * c(1,2,3) # sometimes the result doesn't make much sense: R 'recycles' the vector

for actual matrix multiplication, use %*%

matrix1 %% c(1,2) # 15x2 2x1 = 15x1

m1 <- matrix(1,2,3) m2 <- matrix(1:6,2,3) m1 %% m2 # non-conformable m1 %% t(m2) # 2x3 * 3x2 = 2x2

matrix(1:6,2,3) %% matrix(1:12,3,4) # 2x3 3x4 = 2x4

#############################################################################################

-- data frames --

a data frame can contain both numeric and character variables

country1 <- c("USA", "Japan", "Sweden", "France") e0 <- c(80.69, 86.04, 83.12, 84.36) DataMort <- data.frame(country=country1, lifexp=e0) DataMort str(DataMort)

as.data.frame(matrix1) # a data frame can be created from a matrix

data.frame(vector3,vector4) # creates a data frame with two variables: vector1 and vector2

df1 <- data.frame(vector3, vector4, seq(from=1,to=length(vector4)), seq(1,50,length.out=length(vector4))) df1

df1 <- data.frame(x1=vector3, x2=vector4, x3=seq(from=1,to=length(vector4)), x4=seq(1,50,length.out=length(vector4))) # pass variable names on to the data frame names(df1)

names(df1) <- c('x1','x2','x3','x4') # names of data frames can also be assigned later on

df1$country <- rep(c('Mexico','Sweden','Japan'),each=5) # alternatively, variables can be added using '$' df1[,'country'] #show only the column with the country names unique(df1[,'country'])

?expand.grid # creates a data frame from all combinations of the supplied input vectors expand.grid(x=c(1,3,5),y=c('low','middle','high'))

merging two data frames

capitals <- c('Washington','Tokio','Stockholm','Paris') m0 <- c(0.0062, 0.0025, 0.0025, 0.0034) DataMort2 <- data.frame(country=country1, capital=capitals, IMR=m0) DataMort2

DataMort3 <- merge(DataMort,DataMort2) # combines two data frames based on their joint columns DataMort3

#############################################################################################

-- lists --

vector1 matrix2 df1 data_source <- 'Erich`s fantasy'

mylist <- list(my_vector=vector1, my_matrix=matrix2, my_dataframe=df1, source=data_source) mylist #############################################################################################

Exercises

#############################################################################################

Indexing in R

#############################################################################################

-- indexing a vector with [] --

vector1[2] # indexing a vector to show only its second element

vector1[3:6] # give me elements 3 to 6

vector1[vector1 < 5] # logicals can be used for indexing as well - only values of vector1 that are smaller than 5 my_indx <- vector1 < 5 # create a separate index object vector1[my_indx]

vector1[12] # does my vector even have that many elements? length(vector1)

-- indexing a matrix with [] --

matrix1[2,3] # failed attempt to index a matrix (only has two columns!) dim(matrix1) # dimension of matrix1 nrow(matrix1) # number of rows of matrix1 ncol(matrix1) # number of columns of matrix1

matrix1[3,2] # third row element of the second column of matrix1 matrix1[c(3,5,9),2] # third, fifth, and ninth row element of the second column of matrix1

indx <- 3:9 # create separate index to use inside matrix in next line matrix1[indx,2] # third to the ninth row element of the second column of matrix1 matrix1[,2] # specifying just one of row- or column-index gives the entire column or row

-- indexing with names

e0 # vector with life expectancies at birth names(e0) names(e0) <- country1 e0['Japan']

matrix1 colnames(matrix1) rownames(matrix1) rownames(matrix1) <- 1970:1984

matrix1['1980','y'] # we can use row and column names (if available) for indexing matrix1[,'y']

df1 df1[,'country'] #indexing data frames

-- some useful functions for indexing/extracting data --

attach(df1) # makes the variables within df1 globally available x1 mean(x1) x4 <- x1 + x2 + x3 5 detach(df1) # to avoid confusion, the data frame needs to be detached again x1 # now no longer available in the global environment x4 # still available because it was created globally df1$x4 <- x4 # '$' can also be used to add further variables to data frames df1 x2 attach(df1) with(df1,x1 + x4 - x2 3 + x1) # does what it says 'with' some object with(df1,table(country)) with(df1,table(country,x4)) with(df1,paste(country,x1,sep='')) df1$x5 <- with(df1,paste(country,x1,sep='')) df1

df2 <- transform(df1,x6 = x4 ^ 2 - sqrt(x3)) # allows you to add variables to a data frame transforming its other variables df2

df2 <- within(df2,{x7 <- x1 + 2 - x3 * 4 + x6 # alternative to transform(), but a bit more complicated. Evaluates an expression 'within' an object x8 <- paste(country,x4,sep='_') x9 <- mean(x1)}) df2 df3 <- transform(df2,x10=x9-1) df3

with(df1,x1my.mean) with(df1,x4my.mean) with(df3,mean(x4)) mean(df3[,'x4']) with(df3,table(country)) with(df3,table(country,x4)) table(df3$country, df3$x4)

within(df3, { x11<- x1+1.5 x4*2 x12<- mean(x1) x13<- paste(country,rep(1:5,times=3)x1,sep'-') })

subset(x=df3,country='Sweden')

DataMort3 subset(DataMort3,lifexp > 84) # pick only those countries where life-expectancy is bigger than 84 years subset(DataMort3,IMR < 0.003) # countries where infant mortality is smaller than 0.003 subset(DataMort3,IMR < 0.003 & lifexp > 84) # both selection criteria combined subset(DataMort3,IMR < 0.003,select=c(country,IMR)) # keep only variables 'country' and 'IMR' subset(DataMort3,IMR < 0.003,select=-c(capital)) subset(DataMort3,IMR < 0.003,select=-c(capital,lifexp)) # drop variables 'capital' and 'lifexp' subset(DataMort3,country %in% 'Japan') subset(DataMort3,country %in% c('Sweden','Japan')) # checks the variable country one by one if its value is in this vector

subset(DataMort3,country %in% c('Sweden','Japan') & lifexp<85) mylist str(mylist) mylist$my_matrix mylist$source mylist[[2]][1,3] mylist[[3]]

vector1 min(vector1) min(matrix1) range(matrix1)

-- indexing data frames and lists with '$' --

df1$x2 # '$' gives you one column of a data frame df1$x1 + df1$x2

lm(x5x1+x2+country,data=df1)

mylist str(mylist) # seeing the list structure helps indexing it mylist$my_dataframe # lists can be indexed with a '$' sign mylist[[3]] # or using double square brackets [] mylist$my_dataframe$country mylist[[3]]$country mylist[[3]][,5]

#############################################################################################

Basic Functions in R

#############################################################################################

length(vector1) #returns the length of a vector nrow(matrix1) #returns number of rows of a matrix or data frame ncol(matrix1) #returns number of columns of a matrix or data frame dim(matrix1) #returns number of dimensionality of a matrix or data frame

min(vector1) #returns the minimum of an object max(vector1) #returns the maximum of an object range(vector1) #returns the minimum and the maximum of an object range(matrix1,na.rm=T) vector9 <- c(1,4,12,3,NA,5,NA,8)

vector9 max(vector9) max(vector9,na.rm=T) summary(matrix1) summary(df1)

summary(vector1) # generic function = output depends on input object's type quantile(vector4) # this is just the same as empirical quantiles from quantile() quantile(vector4,probs=c(0,0.5,1)) # you can also set the quantiles yourself summary(matrix1) quantile(vector4,probs=c(0,0.5,1)) quantile(vector4,probs=c(0,0.2,1))

which.min(vector4) # returns the position of the minimum within a vector which.max(vector4) # returns the position of the maximum within a vector vector4[which.max(vector4)] df3 which.max(df3$x4) wm <- with(df3,which.max(x4)) df3[wm,] grep(7,1:9) # gives the position of a certain value within a vector grep('b',c('a','b','c')) # also works with character data grep('b',c('a','b','b','c')) # it returns all the occurences grep('b',c('a','b','ba','c')) # also finds strings that contain a specific pattern as an element - do we want that? match('b',c('a','b','ba','c')) # If you want the exact pattern, use match() grep('b',c('a','b','ba','c'))

vector5 <- seq(-0.5,0.5,0.08) vector5 vector14 <- seq(.5,1,.008) vector14 round(vector14,digits=1) ceiling(vector1410) floor(vector1410) ?floor

abs(vector5) #returns the absolute value(s) round(vector5,digits=3) #rounds to a given number of digits round(vector5,digits=1) #rounds to a given number of digits floor(vector510) #returns the next smaller whole number ceiling(vector510) #returns the next bigger whole number with(df3,cov(x1,x3)) x1 x3 vector1 rev(vector1) vector3 sort(vector3)

sort(vector3, decreasing=T) index2 <-order(df1$x3) df3[index2,]

install.packages('plyr') library(Rcpp) library(plyr) index2 <-order(df1$x2) df1[index2,] arrange(df1,x2) arrange(df1,-x2)

with(df3,cor(x1,x3)) sum(vector1) #sums up all elements of vector1 rowSums(matrix1) # computes the sum of each row within a matrix or data frame colSums(matrix1) # computes the sum of each column within a matrix or data frame

ptab <- prop.table(matrix1, margin=NULL) # computes a table of proportions (margins can be defined) sum(ptab) ptab1 <- prop.table(matrix1, margin=1) # proportion within row rowSums(ptab1) # rows sum up to 1 ptab2 <- prop.table(matrix1, margin=2) # proportion within column colSums(ptab2) # columns sum up to 1

mean(vector1) # computes the mean of vector1 sd(vector1) # computes the standard deviation var(vector1) # computes the variance

cov(df1$x2,df1$x4) # computes the covariance between two variables cor(df1$x2,df1$x4) # computes the correlation between two variables

#############################################################################################

Sorting data in R

#############################################################################################

vector1 # is already sorted, but what if we want to revert the order? rev(vector1) sort(vector1,decreasing=T)

sort(vector4) # by default, sort() orders in ascending order sort(vector4,decreasing=T)

df1 df1[order(df1$x2),]

install.packages('plyr')

library(plyr) arrange(df1,x2) # sorts the data frame by column x2 smallest to largest arrange(df1,-x2) # same thing but largest to smallest

#############################################################################################

Exercises day2

vector15<- seq(20,1000,length=15) vector15 min(vector15) min(vector15) range(vector15) matrix20 <- matrix(data=1:200,nrow=20,ncol=10) matrix20 rowsums(matrix20)

#############################################################################################

Reading data into R

#############################################################################################

install.packages('foreign') # for reading in data formats from other software

library(foreign) # have a look at the package help file to see which formats are available

getwd() # asks for the current working directory setwd('C:\Dropbox\ESKC\Bangalore\Data') # sets working directory

pop_india <- read.table(file='wicpop.csv',sep=',',skip=8,header=T,na.strings=NA) str(pop_india) head(pop_india) tail(pop_india)

pop_india2 <- read.csv(file='wicpop.csv',skip=8) head(pop_india2)

identical(pop_india2,pop_india) # logical function to test if two objects are identical rm(pop_india2)

-- Writing/Saving Data in R --

write.table(x=pop_india,file='pop_india.txt',sep=',') # writes a table into a comma-separated text file pop_india.txt <- read.table(file='pop_india.txt',header=T,sep=',') # reads a text file head(pop_india.txt)

save(pop_india,file='pop_india.rda') rm(pop_india) # removes the data frame pop_india # this should give an error now load(file='pop_india.rda') head(pop_india)

save(mylist,file='mylist.rda') # also saves entire lists and other complex structures rm(mylist) load('mylist.rda') mylist

ls() # shows you the objects currently in the workspace save.image('my first steps in R.Rdata') # saves the entire workspace rm(list=ls());gc() # cleaning up the workspace and garbage collection (gc) load('my first steps in R.Rdata') # retrieve previously saved workspace ls() #############################################################################################

Exercises

#############################################################################################

Simple graphs in R

#############################################################################################

?plot ?par par() # shows you all the high-level parameters that can be changed par('mfrow') par(mfrow=c(2,2)) # changes the number of plots in one graphic device par('mfrow')

z <- seq(-3,3,0.1) d <- dnorm(z) # evaluates the density function for each value in z ?dnorm plot(d) plot(x=z,y=d) plot(z,d,type='l') # high-level plot function producing the structure of the plot title('Normal Distribution') # low-level function adding a title to an existing graphics window abline(v=0,lty=2,col='red') # adds a vertical 'red' line at 0, defines the line-type as 'dotted' abline(v=0,lty=2,col=1) # colors can also be defined by numbers abline(v=0,lty=2,col=2) abline(v=0,lty=2,col=8) abline(v=0,lty=2,col=9) # returns to 'black'

colors() # gives you an overview of all the available colors (or just google "R colors")

dev.off() # shuts down the currently active graphical device graphics.off() # shuts down all graphical devices opened in this session

-- Let's plot some real data --

asfr <- read.csv('wicasfr.csv',skip=8,na.strings=NA,header=T) fertages <- paste(seq(15,45,5),seq(19,49,5),sep='--') asfr_total <- subset(asfr,age %in% fertages,select=area:total)

with(asfr_total,plot(total[year=='2095-2100'],type='b',lwd=2,xlab='Age',ylab='ASFR',xaxt='n')) # plotting the asfr for 2095-2100 with(asfr_total,axis(side=1,labels=fertages,at=1:length(fertages))) # adding a self-defined axis with(asfr_total,plot(total[year=='2010-2015'],type='b',lwd=2,xlab='Age',ylab='ASFR',xaxt='n')) with(asfr_total,axis(side=1,labels=fertages,at=1:length(fertages)))

with(asfr_total,lines(total[year=='2095-2100'],type='b',lty=2,col=2)) # adding lines to an existing plot with(asfr_total,lines(total[year=='2055-2060'],type='b',lty=3,col=3))

legend(x='topright',inset=c(0.04,0.02),legend=paste(c(2010,2055,2095),c(2015,2060,2100),sep='-'),lty=c(1,3,2),col=c(1,3,2),lwd=c(2,1,1),bg='grey') grid(nx=NULL,ny=NULL,col='lightgrey') # plots over your nice legend - order matters! legend(x='topright',inset=c(0.04,0.02),legend=paste(c(2010,2055,2095),c(2015,2060,2100),sep='-'),lty=c(1,3,2),col=c(1,3,2),lwd=c(2,1,1),)

#############################################################################################

Exercise on line graph

-- matrix-plotting with matplot(), matlines() and matpoints() --

head(asfr) asfrbyedu2010 <- subset(asfr,year=='2010-2015' & age %in% fertages,select=no.education:post.secondary) matplot(asfrbyedu2010) matplot(asfrbyedu2010,type='l',col=1:6,lty=1,lwd=2,xlab='age',ylab='asfr',xaxt='n') axis(1,1:7,fertages)

par(mfrow=c(2,2)) matplot(asfrbyedu2010,type='l',col=1:6,lty=1,lwd=2,xlab='age',ylab='asfr',xaxt='n') axis(1,1:7,fertages)

asfrbyedu2100 <- subset(asfr,year=='2095-2100' & age %in% fertages,select=no.education:post.secondary) # y-axis values are not the same matplot(asfrbyedu2100,type='l',col=1:6,lty=1,lwd=2,xlab='age',ylab='asfr',xaxt='n') axis(1,1:7,fertages)

matplot(asfrbyedu2010,type='l',col=1:6,lty=1,lwd=2,xlab='age',ylab='asfr',xaxt='n') matlines(asfrbyedu2100,col=1:6,lty=2) # adds multiple lines to an existing plot legend(legend=names(asfrbyedu2100),lty=1,lwd=2,col=1:6,'topright',inset=0.02,title='Education Group',cex=0.6)

-- scattplots with plot() --

pwt <- read.table('pwt7.1.txt',sep=',',na.strings='na',header=T) # reading in Penn World Table data head(pwt) str(pwt)

pwt2010 <- subset(pwt,year==2010 & !is.na(rgdpch)) # we only want to plot results for those countries that report GDP per capita

par(mfrow=c(2,2)) with(pwt2010,plot(x=rgdpch,y=openc)) # with the data frame pwt2010, plot 'rgdpch' against 'openc' with(pwt2010,plot(x=rgdpch,y=openc,xlab='Real GDP per capita',ylab='Openness',main="I'm so proud of my first graph")) # adding x- and y-axis labels

gdp_range <- c(0,50000) open_range <- c(0,200)

with(pwt2010,plot(rgdpch,openc,xlab='Real GDP per capita',ylab='Openness',main="I'm so proud of my first graph", xlim=gdp_range,ylim=open_range)) # specifying the range of the x and y axis

with(pwt2010,plot(rgdpch,openc,xlab='Real GDP per capita (in 1000s)',ylab='Openness',main="...although it is meaningless", xlim=gdp_range,ylim=open_range,xaxt='n', # remove x-axis pch=19,cex=0.6)) # change the points symbols - for details google "pch symbols R"

xat <- seq(0,50000,10000) xlab <- xat / 1000 # we don't like huge numbers, that's why we plot the data in 1000s axis(side=1,at=xat,labels=xlab) # add your own x-axis grid(nx=NULL,ny=NULL,col='lightgrey') # by not specifying the number of grid-lines, R automatically alignes them with the tick marks

with(pwt2010,text(x=rgdpch,y=openc,labels=country.isocode,cex=0.7,pos=3)) # adding point lables to plot

-- other high-level plotting functions --

hist(pwt2010$rgdpch) # produces a histogram with(pwt2010,hist(rgdpch,breaks=10))

with(pwt2010,boxplot(rgdpch))

with(pwt2010,barplot(rgdpch))

#############################################################################################

Exercises

#############################################################################################

Plotting on an external device

#############################################################################################

pdf(file="C:/Dropbox/ESKC/Bangalore/Output/Indian asfr.pdf",width=7,height=8,paper="USr",bg="transparent",family="serif")

with(pwt2010,plot(rgdpch,openc,xlab='Real GDP per capita (in 1000s)',ylab='Openness',main="... and now it's in PDF!!!", xlim=gdp_range,ylim=open_range,xaxt='n', # remove x-axis pch=19,cex=0.6)) # change the points symbols - for details google "pch symbols R"

with(pwt2010,text(x=rgdpch,y=openc,labels=country.isocode,cex=0.7,pos=3)) # adding point lables to plot

with(pwt2010,lines(lowess(x=rgdpch,y=openc),col=3,lty=2,lwd=2)) # add a locally-smoothed trendline to our previous graph

dev.off()

jpeg(filename = "C:/Dropbox/ESKC/Bangalore/Output/Indian asfr.jpg",width = 800, height = 900, units = "px", pointsize = 12, quality = 100,bg = "white",res = 140, family = "serif", restoreConsole = TRUE,type = c("cairo"))

with(pwt2010,plot(rgdpch,openc,xlab='Real GDP per capita (in 1000s)',ylab='Openness',main="...jpeg", xlim=gdp_range,ylim=open_range,xaxt='n', # remove x-axis pch=19,cex=0.6)) # change the points symbols - for details google "pch symbols R"

with(pwt2010,text(x=rgdpch,y=openc,labels=country.isocode,cex=0.7,pos=3)) # adding point lables to plot

dev.off()

#############################################################################################

Linear regression in R

#############################################################################################

?lm lm1 <- lm(rgdpch~openc,data=pwt2010) # fits a standard linear regression with implicit intercept term

lm1 <- lm(rgdpch~openc-1,data=pwt2010) # subtracting 1 in the formula removes the intercept summary(lm1)

attach(pwt2010) lm(rgdpch~openc) # if the data frame is attached, you don't need the 'data' argument of lm() detach(pwt2010)

lm1 # lm1 is a fitted model object str(lm1) # lm1 includes loads of 'hidden' information names(lm1)

coefficients(lm1) residuals(lm1) # extract the residuals from your regression result fitted.values(lm1) # fitted values. For linear models, predict(lm1) yields the same fitted(lm1) coef(lm1)

deviance(lm1) # returns the residual sum of square formula(lm1) # shows you just the formula that you estimated

summary(lm1) # shows a summary of your regression result

plot(lm1) # by default shows four diagnostic plots for the regression par(mfrow=c(2,2)) plot(lm1) # by default shows four diagnostic plots for the regression

with(pwt2010,abline(lm(openc ~ rgdpch),col=2,lty=3,lwd=2)) # adds a trendline to the previous scatterplot

lm2 <- lm(rgdpch~openc+POP+cc+cg+ci,data=pwt2010) # multivariate linear regression summary(lm2)

lm3 <- update(lm2,.~.-POP-ci) # adds or removes regression components from an existing fitted object summary(lm3)

anova(lm3) # compares the model with a basic model with only an intercept term

lm0 <- lm(rgdpch~1,data=pwt2010) # fits a regression with with only an intercept term summary(lm0)

anova(lm0,lm3) # the same as anova(lm3) above anova(lm1,lm3) # our more sophisticated model outperforms the plotted model

-- model selection --

AIC(lm1,lm3) BIC(lm1,lm3)

drop1(lm3) # goes from general to specific and tells you not to drop any of the variables add1(lm0,scope= ~ openc + I(openc^2) + ci + cg) # adds one at a time from the given scope

?nlm # non-linear models ?glm # general linear models, i.e. binomial, logistic, poisson regression ... ?lowess # local approximation model with(pwt2010,lowess(x=rgdpch,y=openc))

with(pwt2010,lines(lowess(x=rgdpch,y=openc),col=3,lty=2,lwd=2)) # add a locally-smoothed trendline to our previous graph

#############################################################################################

Exercise

#############################################################################################

Programming Tools

#############################################################################################

-- conditionals --

life.expectancy <- 70 if (life.expectancy == 70) print('Life expectancy has reached 70 years') # if ( condition is TRUE ) do_this

life.expectancy <- 68 if (life.expectancy == 70) print('Life expectancy has reached 70 years')

if (life.expectancy >= 70) print(paste('Life expectancy has reached ',life.expectancy,' years',sep='')) if (life.expectancy < 70) print(paste('Life expectancy has reached ',life.expectancy,' years',sep=''))

conditional statements can be long and complex but they then require {}

le.lag <- NULL # initialization if (life.expectancy < 70){ print(paste('Life expectancy has reached ',life.expectancy,' years',sep='')) le.lag <- round(life.expectancy,-1) - life.expectancy # How many years missing to 70? } le.lag

if statement can be complemented with 'else' in case condition is not met

life.expectancy <- 78 if (life.expectancy < 70){ # conditional statements can be long and complex but they then require {} print(paste('Life expectancy has reached ',life.expectancy,' years',sep='')) le.lag <- round(life.expectancy,-1) - life.expectancy # How many years missing to 70? } else {le.lag <- 'life expectancy is greater or equal to 70'} le.lag

-- repetitive execution --

countries <- c('DK','IT','FR') for (i in countries) print(i) # for each i (the counter) in sequence of characters do something for (i in 1:3) print(countries[i]) # for each i in sequence of numeric values do something

the statement to be executed repetitively can be more complex and spread over several lines

for (i in countries){ print(i) idat <- subset(dat,country==i) print(with(idat,table(sex))) }

with(dat,table(country,sex))

for-loops and if-statements can be combined

for (i in seq(2,8,2)){ if (i < 5){ print(i) print('this number is too small') } else { print(i) print('from here on we round up')} }

for (i in seq(2,8,2)){ if (i < 5){ some_statement <- 'apples' } else { some_statement <- 'oranges' } print(some_statement) }

you can set an additional counter apart from the loop-counter

counter <- 0 for (i in seq(0.1,0.5,0.1)){ counter <- counter + 1 print(paste('The value of my counter is ',counter,sep='')) print(paste('... although the loop-counter is only ',i,sep='')) }

loops become particularly useful when modeling time-series

h <- seq(from=1, to=8) s <- NULL

s[1] <- h[1] 10 # you could do this one by one s[2] <- h[2] 10

for(i in 3:10){ # or run a loop over it s[i] <- h[i] * 10 }

this can be done with huge numbers of repetitions in no time

nr.of.experiments <- 500 means.for.randoms <- NULL for (i in 1:nr.of.experiments) { random.numbers <- rnorm(100) # create a 100 normally distributed random numbers means.for.randoms[i] <- mean(random.numbers) # save the mean of those random numbers } hist(means.for.randoms) # create a histogram of those random means #############################################################################################

Exercises

#############################################################################################

Programming your own functions

#############################################################################################

hello <- function(){print('Hello World')} # function without argument always does the same hello()

hello_somebody <- function(name){print(paste('Hello ',name,sep=''))} # function with argument hello_somebody(name='Samir') hello_somebody(name=c('Samir','James'))

functions you programmed yourself work exactly like pre-programmed functions

fun1 <- function(arg1, arg2){ # define the necessary inputs of your function w <- arg1 ^ 2 # define what your function is supposed to do for each input value return(arg2 + w) # define what your function is supposed to return as a result }

fun1(arg1 = 3, arg2 = 5) # this is where the function is actually executed fun1(arg1 = 1:9, arg2 = 5) # functions can also be applied to entire vectors

drp9drp / Learn

Learning R #1

R can be used simply as a calculator

We can use R to assign values to variables and to use the variables for more complex calculations

Calculate the resulting population after 30 years of constant growth at 2% per year

Using functions in R

Programming your own function

to return to our initial example

To load further functions not included in the base-distributions, you need to install and load additional packages

install.packages(c('reshape','plyr'))

Different data types in R

-- numeric data --

-- character data --

It is not possible to calculate with character data, but one can do other things

-- logical data --

logical operators

-- factorial data --

-- missing data --

other ways of getting either missing or infinite values

R objects

-- vectors --

-- matrices --

for actual matrix multiplication, use %*%

-- data frames --

a data frame can contain both numeric and character variables

names(df1) <- c('x1','x2','x3','x4') # names of data frames can also be assigned later on

merging two data frames

-- lists --

Exercises

Indexing in R

-- indexing a vector with [] --

-- indexing a matrix with [] --

-- indexing with names

-- some useful functions for indexing/extracting data --

-- indexing data frames and lists with '$' --

Basic Functions in R

Sorting data in R

install.packages('plyr')

Exercises day2

Reading data into R

install.packages('foreign') # for reading in data formats from other software

library(foreign) # have a look at the package help file to see which formats are available

-- Writing/Saving Data in R --

Exercises

Simple graphs in R

-- Let's plot some real data --

Exercise on line graph

-- matrix-plotting with matplot(), matlines() and matpoints() --

-- scattplots with plot() --

-- other high-level plotting functions --

Exercises

Plotting on an external device

Linear regression in R

-- model selection --

with(pwt2010,lines(lowess(x=rgdpch,y=openc),col=3,lty=2,lwd=2)) # add a locally-smoothed trendline to our previous graph

Exercise

Programming Tools

-- conditionals --

conditional statements can be long and complex but they then require {}

if statement can be complemented with 'else' in case condition is not met

-- repetitive execution --

the statement to be executed repetitively can be more complex and spread over several lines

for-loops and if-statements can be combined

you can set an additional counter apart from the loop-counter

loops become particularly useful when modeling time-series

this can be done with huge numbers of repetitions in no time

Exercises

Programming your own functions

functions you programmed yourself work exactly like pre-programmed functions