We can use R to assign values to variables and to use the variables for more complex calculations
Calculate the resulting population after 30 years of constant growth at 2% per year
inipop <- 800
g <- 0.02
time <- 30
endpop <- inipop * (1+g)^time
endpop
ls() # once you have created some object, you can check what you have in your workspace
rm('endpop') # removes certain objects
rm(list=ls()) #removes all objects in the current workspace
vector1 <- c(1,2,3,4,5,6,7,8,9) # c() concatenates these numbers to a vector
plot(vector1)
mean(vector1) # mean() computes the mean of the input vector
?mean
example(mean) # shows examples of what can be done with a function
help.search('mean')
To load further functions not included in the base-distributions, you need to install and load additional packages
install.packages(c('reshape','plyr'))
library(reshape) # this loads the library containing the functions you are interested in
require(plyr) # basically the same as library(), however, this one does not report errors
vector1 + 4 # basic arithmetic operations can be applied to numeric data ...
vector1 * 4
vector1 / 4
vector1sq <- vector1 ^ 2 # square vector1
sqrt(vector1sq) # take square root
exp(vector1) # exponential function
log(vector1) # log function with Euler base as default
#############################################################################################
-- character data --
"asdf123.-4" # everything that's between "" or '' is a called a 'string'; character data is made up of such strings
c("asdf123.-4","bdl0320d;./") # we can create vectors of strings
educ <- c("Prim", "Ter", "Sec", "Sec", "Ter", "Prim") # we can assign such vectors into objects
mode(educ)
It is not possible to calculate with character data, but one can do other things
educ + 4
toupper(educ) # capitalizes all letters
tolower('ERICH')
substr(educ,start=1,stop=3) # return sub-string ranging from position 1 to 3
substr(educ,1,1)
substr(educ,1,1) <- 'X' # can also be used for replacing parts of strings
educ
paste(c('a','b','c'),1:3,sep='_') # concatenates vectors after creating character strings
paste(c('a','b','c'),1:3,sep='-',collapse=', ') # collapses the vector into one string using the desired separator
my_logical + 4 # in arithmetic operations, logical values will be interpreted as 0 or 1
c(1,2,3) * my_logical
logical operators
vector1 < 4
vector1 <= 4
vector1 > 4
vector1 >= 4
vector1 == 4
vector1 != 4
(vector1 > 3) & (vector1 < 8) # which ones are greater than 3 AND smaller than 8
(vector1 < 3) | (vector1 > 8) # which ones are smaller than 3 OR greater than 8
is.element(4,vector1) # Is 4 an element of vector1?
4 %in% vector1 # Same as above
c(4,5,15) %in% vector1 # Which one of 4, 5, and 15 can be found in vector1?
!c(4,5,15) %in% vector1 # Which of these numbers can NOT be found in vector1?
'b' %in% c('a','a','a','b','a')
#############################################################################################
-- factorial data --
sex <- c("male","male","female","male","female") # this is a character object
sex <- factor(sex) # this transforms it to a factor
levels(sex)
my_factor <- factor(educ) # transforming our character object to a factor
my_factor
str(my_factor) # this is now a factor with three levels
levels(my_factor) # the levels can be accessed separately
#############################################################################################
-- missing data --
missings <- c(1,3,NA,2,4,NA,9)
is.na(missings) # Which values of this vector are missing values?
mean(missings) # many functions have an option to deal with missing values
mean(missings,na.rm=T) # should not available numbers be removed?
vector3 <- c(vector1,vector2) # c() can also be used to 'concatenate' two vectors
vector3
vector4 <- c(vector2,vector1) # the order of the vectors inside c() matters
vector4
1:100 # creates a sequence from 1 to 9
seq(from=1,to=100,by=1)
seq(1,100,1) # if you don't name your function arguments, make sure they are in the right place
?seq
seq(1,100,20)
seq(1,100,length.out=2)
rep(2,times=5) # creates a vector that reproduces 2 five times
rep(seq(1,4),times=2) # reproduces the sequence twice
rep(seq(1,4),each=2) # reproduces each element of the sequence twice
x <- c(1:8)
dim(x) <- c(2,4) # assigns a certain dimension to a vector and turns it into a matrix
x
dim(x)
matrix(data=1:15,nrow=3,ncol=5) # creates a matrix with the elements ranging from 1 to 15 ordered in three rows and five columns
matrix(data=1:15,nrow=5,ncol=3) # creates a matrix with the elements ranging from 1 to 15 ordered in five rows and three columns
matrix(data=1:15,nrow=5,ncol=3,byrow=T) # fills the matrix by row, not by column (default)
matrix(1,nrow=5,ncol=3) # creates a matrix with the elements ranging from 1 to 15 ordered in five rows and three columns
diag(1,5,5)
matrix1 <- cbind(vector3,vector4) # column bind ... creates a matrix with two columns and preserves the names
matrix1
matrix1 <- cbind(x=vector3,y=vector4) # you can name the columns yourself
matrix1
matrix2 <- rbind(vector3,vector4) # row bind ... creates a matrix with two rows
matrix2
matrix2 c(1,2) # we can multiply a matrix with a vector
matrix1 c(1,2) # first column by first element of the vector, second by second ...
matrix2 * c(1,2,3) # sometimes the result doesn't make much sense: R 'recycles' the vector
df1 <- data.frame(x1=vector3,
x2=vector4,
x3=seq(from=1,to=length(vector4)),
x4=seq(1,50,length.out=length(vector4))) # pass variable names on to the data frame
names(df1)
names(df1) <- c('x1','x2','x3','x4') # names of data frames can also be assigned later on
df1$country <- rep(c('Mexico','Sweden','Japan'),each=5) # alternatively, variables can be added using '$'
df1[,'country'] #show only the column with the country names
unique(df1[,'country'])
?expand.grid # creates a data frame from all combinations of the supplied input vectors
expand.grid(x=c(1,3,5),y=c('low','middle','high'))
vector1[2] # indexing a vector to show only its second element
vector1[3:6] # give me elements 3 to 6
vector1[vector1 < 5] # logicals can be used for indexing as well - only values of vector1 that are smaller than 5
my_indx <- vector1 < 5 # create a separate index object
vector1[my_indx]
vector1[12] # does my vector even have that many elements?
length(vector1)
-- indexing a matrix with [] --
matrix1[2,3] # failed attempt to index a matrix (only has two columns!)
dim(matrix1) # dimension of matrix1
nrow(matrix1) # number of rows of matrix1
ncol(matrix1) # number of columns of matrix1
matrix1[3,2] # third row element of the second column of matrix1
matrix1[c(3,5,9),2] # third, fifth, and ninth row element of the second column of matrix1
indx <- 3:9 # create separate index to use inside matrix in next line
matrix1[indx,2] # third to the ninth row element of the second column of matrix1
matrix1[,2] # specifying just one of row- or column-index gives the entire column or row
-- indexing with names
e0 # vector with life expectancies at birth
names(e0)
names(e0) <- country1
e0['Japan']
matrix1['1980','y'] # we can use row and column names (if available) for indexing
matrix1[,'y']
df1
df1[,'country'] #indexing data frames
-- some useful functions for indexing/extracting data --
attach(df1) # makes the variables within df1 globally available
x1
mean(x1)
x4 <- x1 + x2 + x3 5
detach(df1) # to avoid confusion, the data frame needs to be detached again
x1 # now no longer available in the global environment
x4 # still available because it was created globally
df1$x4 <- x4 # '$' can also be used to add further variables to data frames
df1
x2
attach(df1)
with(df1,x1 + x4 - x2 3 + x1) # does what it says 'with' some object
with(df1,table(country))
with(df1,table(country,x4))
with(df1,paste(country,x1,sep=''))
df1$x5 <- with(df1,paste(country,x1,sep=''))
df1
df2 <- transform(df1,x6 = x4 ^ 2 - sqrt(x3)) # allows you to add variables to a data frame transforming its other variables
df2
df2 <- within(df2,{x7 <- x1 + 2 - x3 * 4 + x6 # alternative to transform(), but a bit more complicated. Evaluates an expression 'within' an object
x8 <- paste(country,x4,sep='_')
x9 <- mean(x1)})
df2
df3 <- transform(df2,x10=x9-1)
df3
DataMort3
subset(DataMort3,lifexp > 84) # pick only those countries where life-expectancy is bigger than 84 years
subset(DataMort3,IMR < 0.003) # countries where infant mortality is smaller than 0.003
subset(DataMort3,IMR < 0.003 & lifexp > 84) # both selection criteria combined
subset(DataMort3,IMR < 0.003,select=c(country,IMR)) # keep only variables 'country' and 'IMR'
subset(DataMort3,IMR < 0.003,select=-c(capital))
subset(DataMort3,IMR < 0.003,select=-c(capital,lifexp)) # drop variables 'capital' and 'lifexp'
subset(DataMort3,country %in% 'Japan')
subset(DataMort3,country %in% c('Sweden','Japan')) # checks the variable country one by one if its value is in this vector
df1$x2 # '$' gives you one column of a data frame
df1$x1 + df1$x2
lm(x5x1+x2+country,data=df1)
mylist
str(mylist) # seeing the list structure helps indexing it
mylist$my_dataframe # lists can be indexed with a '$' sign
mylist[[3]] # or using double square brackets []
mylist$my_dataframe$country
mylist[[3]]$country
mylist[[3]][,5]
length(vector1) #returns the length of a vector
nrow(matrix1) #returns number of rows of a matrix or data frame
ncol(matrix1) #returns number of columns of a matrix or data frame
dim(matrix1) #returns number of dimensionality of a matrix or data frame
min(vector1) #returns the minimum of an object
max(vector1) #returns the maximum of an object
range(vector1) #returns the minimum and the maximum of an object
range(matrix1,na.rm=T)
vector9 <- c(1,4,12,3,NA,5,NA,8)
summary(vector1) # generic function = output depends on input object's type
quantile(vector4) # this is just the same as empirical quantiles from quantile()
quantile(vector4,probs=c(0,0.5,1)) # you can also set the quantiles yourself
summary(matrix1)
quantile(vector4,probs=c(0,0.5,1))
quantile(vector4,probs=c(0,0.2,1))
which.min(vector4) # returns the position of the minimum within a vector
which.max(vector4) # returns the position of the maximum within a vector
vector4[which.max(vector4)]
df3
which.max(df3$x4)
wm <- with(df3,which.max(x4))
df3[wm,]
grep(7,1:9) # gives the position of a certain value within a vector
grep('b',c('a','b','c')) # also works with character data
grep('b',c('a','b','b','c')) # it returns all the occurences
grep('b',c('a','b','ba','c')) # also finds strings that contain a specific pattern as an element - do we want that?
match('b',c('a','b','ba','c')) # If you want the exact pattern, use match()
grep('b',c('a','b','ba','c'))
abs(vector5) #returns the absolute value(s)
round(vector5,digits=3) #rounds to a given number of digits
round(vector5,digits=1) #rounds to a given number of digits
floor(vector510) #returns the next smaller whole number
ceiling(vector510) #returns the next bigger whole number
with(df3,cov(x1,x3))
x1
x3
vector1
rev(vector1)
vector3
sort(vector3)
with(df3,cor(x1,x3))
sum(vector1) #sums up all elements of vector1
rowSums(matrix1) # computes the sum of each row within a matrix or data frame
colSums(matrix1) # computes the sum of each column within a matrix or data frame
ptab <- prop.table(matrix1, margin=NULL) # computes a table of proportions (margins can be defined)
sum(ptab)
ptab1 <- prop.table(matrix1, margin=1) # proportion within row
rowSums(ptab1) # rows sum up to 1
ptab2 <- prop.table(matrix1, margin=2) # proportion within column
colSums(ptab2) # columns sum up to 1
mean(vector1) # computes the mean of vector1
sd(vector1) # computes the standard deviation
var(vector1) # computes the variance
cov(df1$x2,df1$x4) # computes the covariance between two variables
cor(df1$x2,df1$x4) # computes the correlation between two variables
identical(pop_india2,pop_india) # logical function to test if two objects are identical
rm(pop_india2)
-- Writing/Saving Data in R --
write.table(x=pop_india,file='pop_india.txt',sep=',') # writes a table into a comma-separated text file
pop_india.txt <- read.table(file='pop_india.txt',header=T,sep=',') # reads a text file
head(pop_india.txt)
save(pop_india,file='pop_india.rda')
rm(pop_india) # removes the data frame
pop_india # this should give an error now
load(file='pop_india.rda')
head(pop_india)
save(mylist,file='mylist.rda') # also saves entire lists and other complex structures
rm(mylist)
load('mylist.rda')
mylist
ls() # shows you the objects currently in the workspace
save.image('my first steps in R.Rdata') # saves the entire workspace
rm(list=ls());gc() # cleaning up the workspace and garbage collection (gc)
load('my first steps in R.Rdata') # retrieve previously saved workspace
ls()
#############################################################################################
?plot
?par
par() # shows you all the high-level parameters that can be changed
par('mfrow')
par(mfrow=c(2,2)) # changes the number of plots in one graphic device
par('mfrow')
z <- seq(-3,3,0.1)
d <- dnorm(z) # evaluates the density function for each value in z
?dnorm
plot(d)
plot(x=z,y=d)
plot(z,d,type='l') # high-level plot function producing the structure of the plot
title('Normal Distribution') # low-level function adding a title to an existing graphics window
abline(v=0,lty=2,col='red') # adds a vertical 'red' line at 0, defines the line-type as 'dotted'
abline(v=0,lty=2,col=1) # colors can also be defined by numbers
abline(v=0,lty=2,col=2)
abline(v=0,lty=2,col=8)
abline(v=0,lty=2,col=9) # returns to 'black'
colors() # gives you an overview of all the available colors (or just google "R colors")
dev.off() # shuts down the currently active graphical device
graphics.off() # shuts down all graphical devices opened in this session
with(asfr_total,plot(total[year=='2095-2100'],type='b',lwd=2,xlab='Age',ylab='ASFR',xaxt='n')) # plotting the asfr for 2095-2100
with(asfr_total,axis(side=1,labels=fertages,at=1:length(fertages))) # adding a self-defined axis
with(asfr_total,plot(total[year=='2010-2015'],type='b',lwd=2,xlab='Age',ylab='ASFR',xaxt='n'))
with(asfr_total,axis(side=1,labels=fertages,at=1:length(fertages)))
with(asfr_total,lines(total[year=='2095-2100'],type='b',lty=2,col=2)) # adding lines to an existing plot
with(asfr_total,lines(total[year=='2055-2060'],type='b',lty=3,col=3))
legend(x='topright',inset=c(0.04,0.02),legend=paste(c(2010,2055,2095),c(2015,2060,2100),sep='-'),lty=c(1,3,2),col=c(1,3,2),lwd=c(2,1,1),bg='grey')
grid(nx=NULL,ny=NULL,col='lightgrey') # plots over your nice legend - order matters!
legend(x='topright',inset=c(0.04,0.02),legend=paste(c(2010,2055,2095),c(2015,2060,2100),sep='-'),lty=c(1,3,2),col=c(1,3,2),lwd=c(2,1,1),)
asfrbyedu2100 <- subset(asfr,year=='2095-2100' & age %in% fertages,select=no.education:post.secondary) # y-axis values are not the same
matplot(asfrbyedu2100,type='l',col=1:6,lty=1,lwd=2,xlab='age',ylab='asfr',xaxt='n')
axis(1,1:7,fertages)
matplot(asfrbyedu2010,type='l',col=1:6,lty=1,lwd=2,xlab='age',ylab='asfr',xaxt='n')
matlines(asfrbyedu2100,col=1:6,lty=2) # adds multiple lines to an existing plot
legend(legend=names(asfrbyedu2100),lty=1,lwd=2,col=1:6,'topright',inset=0.02,title='Education Group',cex=0.6)
-- scattplots with plot() --
pwt <- read.table('pwt7.1.txt',sep=',',na.strings='na',header=T) # reading in Penn World Table data
head(pwt)
str(pwt)
pwt2010 <- subset(pwt,year==2010 & !is.na(rgdpch)) # we only want to plot results for those countries that report GDP per capita
par(mfrow=c(2,2))
with(pwt2010,plot(x=rgdpch,y=openc)) # with the data frame pwt2010, plot 'rgdpch' against 'openc'
with(pwt2010,plot(x=rgdpch,y=openc,xlab='Real GDP per capita',ylab='Openness',main="I'm so proud of my first graph")) # adding x- and y-axis labels
gdp_range <- c(0,50000)
open_range <- c(0,200)
with(pwt2010,plot(rgdpch,openc,xlab='Real GDP per capita',ylab='Openness',main="I'm so proud of my first graph",
xlim=gdp_range,ylim=open_range)) # specifying the range of the x and y axis
with(pwt2010,plot(rgdpch,openc,xlab='Real GDP per capita (in 1000s)',ylab='Openness',main="...although it is meaningless",
xlim=gdp_range,ylim=open_range,xaxt='n', # remove x-axis
pch=19,cex=0.6)) # change the points symbols - for details google "pch symbols R"
xat <- seq(0,50000,10000)
xlab <- xat / 1000 # we don't like huge numbers, that's why we plot the data in 1000s
axis(side=1,at=xat,labels=xlab) # add your own x-axis
grid(nx=NULL,ny=NULL,col='lightgrey') # by not specifying the number of grid-lines, R automatically alignes them with the tick marks
with(pwt2010,text(x=rgdpch,y=openc,labels=country.isocode,cex=0.7,pos=3)) # adding point lables to plot
-- other high-level plotting functions --
hist(pwt2010$rgdpch) # produces a histogram
with(pwt2010,hist(rgdpch,breaks=10))
with(pwt2010,plot(rgdpch,openc,xlab='Real GDP per capita (in 1000s)',ylab='Openness',main="... and now it's in PDF!!!",
xlim=gdp_range,ylim=open_range,xaxt='n', # remove x-axis
pch=19,cex=0.6)) # change the points symbols - for details google "pch symbols R"
xat <- seq(0,50000,10000)
xlab <- xat / 1000 # we don't like huge numbers, that's why we plot the data in 1000s
axis(side=1,at=xat,labels=xlab) # add your own x-axis
grid(nx=NULL,ny=NULL,col='lightgrey') # by not specifying the number of grid-lines, R automatically alignes them with the tick marks
with(pwt2010,text(x=rgdpch,y=openc,labels=country.isocode,cex=0.7,pos=3)) # adding point lables to plot
with(pwt2010,lines(lowess(x=rgdpch,y=openc),col=3,lty=2,lwd=2)) # add a locally-smoothed trendline to our previous graph
with(pwt2010,plot(rgdpch,openc,xlab='Real GDP per capita (in 1000s)',ylab='Openness',main="...jpeg",
xlim=gdp_range,ylim=open_range,xaxt='n', # remove x-axis
pch=19,cex=0.6)) # change the points symbols - for details google "pch symbols R"
xat <- seq(0,50000,10000)
xlab <- xat / 1000 # we don't like huge numbers, that's why we plot the data in 1000s
axis(side=1,at=xat,labels=xlab) # add your own x-axis
grid(nx=NULL,ny=NULL,col='lightgrey') # by not specifying the number of grid-lines, R automatically alignes them with the tick marks
with(pwt2010,text(x=rgdpch,y=openc,labels=country.isocode,cex=0.7,pos=3)) # adding point lables to plot
?lm
lm1 <- lm(rgdpch~openc,data=pwt2010) # fits a standard linear regression with implicit intercept term
lm1 <- lm(rgdpch~openc-1,data=pwt2010) # subtracting 1 in the formula removes the intercept
summary(lm1)
attach(pwt2010)
lm(rgdpch~openc) # if the data frame is attached, you don't need the 'data' argument of lm()
detach(pwt2010)
lm1 # lm1 is a fitted model object
str(lm1) # lm1 includes loads of 'hidden' information
names(lm1)
coefficients(lm1)
residuals(lm1) # extract the residuals from your regression result
fitted.values(lm1) # fitted values. For linear models, predict(lm1) yields the same
fitted(lm1)
coef(lm1)
deviance(lm1) # returns the residual sum of square
formula(lm1) # shows you just the formula that you estimated
summary(lm1) # shows a summary of your regression result
plot(lm1) # by default shows four diagnostic plots for the regression
par(mfrow=c(2,2))
plot(lm1) # by default shows four diagnostic plots for the regression
with(pwt2010,abline(lm(openc ~ rgdpch),col=2,lty=3,lwd=2)) # adds a trendline to the previous scatterplot
lm2 <- lm(rgdpch~openc+POP+cc+cg+ci,data=pwt2010) # multivariate linear regression
summary(lm2)
lm3 <- update(lm2,.~.-POP-ci) # adds or removes regression components from an existing fitted object
summary(lm3)
anova(lm3) # compares the model with a basic model with only an intercept term
lm0 <- lm(rgdpch~1,data=pwt2010) # fits a regression with with only an intercept term
summary(lm0)
anova(lm0,lm3) # the same as anova(lm3) above
anova(lm1,lm3) # our more sophisticated model outperforms the plotted model
-- model selection --
AIC(lm1,lm3)
BIC(lm1,lm3)
drop1(lm3) # goes from general to specific and tells you not to drop any of the variables
add1(lm0,scope= ~ openc + I(openc^2) + ci + cg) # adds one at a time from the given scope
?nlm # non-linear models
?glm # general linear models, i.e. binomial, logistic, poisson regression ...
?lowess # local approximation model
with(pwt2010,lowess(x=rgdpch,y=openc))
with(pwt2010,lines(lowess(x=rgdpch,y=openc),col=3,lty=2,lwd=2)) # add a locally-smoothed trendline to our previous graph
life.expectancy <- 70
if (life.expectancy == 70) print('Life expectancy has reached 70 years') # if ( condition is TRUE ) do_this
life.expectancy <- 68
if (life.expectancy == 70) print('Life expectancy has reached 70 years')
if (life.expectancy >= 70) print(paste('Life expectancy has reached ',life.expectancy,' years',sep=''))
if (life.expectancy < 70) print(paste('Life expectancy has reached ',life.expectancy,' years',sep=''))
conditional statements can be long and complex but they then require {}
le.lag <- NULL # initialization
if (life.expectancy < 70){
print(paste('Life expectancy has reached ',life.expectancy,' years',sep=''))
le.lag <- round(life.expectancy,-1) - life.expectancy # How many years missing to 70?
}
le.lag
if statement can be complemented with 'else' in case condition is not met
life.expectancy <- 78
if (life.expectancy < 70){ # conditional statements can be long and complex but they then require {}
print(paste('Life expectancy has reached ',life.expectancy,' years',sep=''))
le.lag <- round(life.expectancy,-1) - life.expectancy # How many years missing to 70?
} else {le.lag <- 'life expectancy is greater or equal to 70'}
le.lag
-- repetitive execution --
countries <- c('DK','IT','FR')
for (i in countries) print(i) # for each i (the counter) in sequence of characters do something
for (i in 1:3) print(countries[i]) # for each i in sequence of numeric values do something
the statement to be executed repetitively can be more complex and spread over several lines
for (i in countries){
print(i)
idat <- subset(dat,country==i)
print(with(idat,table(sex)))
}
with(dat,table(country,sex))
for-loops and if-statements can be combined
for (i in seq(2,8,2)){
if (i < 5){
print(i)
print('this number is too small')
} else {
print(i)
print('from here on we round up')}
}
for (i in seq(2,8,2)){
if (i < 5){
some_statement <- 'apples'
} else {
some_statement <- 'oranges'
}
print(some_statement)
}
you can set an additional counter apart from the loop-counter
counter <- 0
for (i in seq(0.1,0.5,0.1)){
counter <- counter + 1
print(paste('The value of my counter is ',counter,sep=''))
print(paste('... although the loop-counter is only ',i,sep=''))
}
loops become particularly useful when modeling time-series
h <- seq(from=1, to=8)
s <- NULL
s[1] <- h[1] 10 # you could do this one by one
s[2] <- h[2] 10
for(i in 3:10){ # or run a loop over it
s[i] <- h[i] * 10
}
s
this can be done with huge numbers of repetitions in no time
nr.of.experiments <- 500
means.for.randoms <- NULL
for (i in 1:nr.of.experiments) {
random.numbers <- rnorm(100) # create a 100 normally distributed random numbers
means.for.randoms[i] <- mean(random.numbers) # save the mean of those random numbers
}
hist(means.for.randoms) # create a histogram of those random means
#############################################################################################
hello <- function(){print('Hello World')} # function without argument always does the same
hello()
hello_somebody <- function(name){print(paste('Hello ',name,sep=''))} # function with argument
hello_somebody(name='Samir')
hello_somebody(name=c('Samir','James'))
functions you programmed yourself work exactly like pre-programmed functions
fun1 <- function(arg1, arg2){ # define the necessary inputs of your function
w <- arg1 ^ 2 # define what your function is supposed to do for each input value
return(arg2 + w) # define what your function is supposed to return as a result
}
fun1(arg1 = 3, arg2 = 5) # this is where the function is actually executed
fun1(arg1 = 1:9, arg2 = 5) # functions can also be applied to entire vectors
#############################################################################################
R can be used simply as a calculator
2 * 3 + 5 ^ 2 1:3 (1:3 + 5) / 2 1:3 + 4:6
We can use R to assign values to variables and to use the variables for more complex calculations
Calculate the resulting population after 30 years of constant growth at 2% per year
inipop <- 800 g <- 0.02 time <- 30
endpop <- inipop * (1+g)^time endpop
ls() # once you have created some object, you can check what you have in your workspace rm('endpop') # removes certain objects rm(list=ls()) #removes all objects in the current workspace
#############################################################################################
Using functions in R
vector1 <- c(1,2,3,4,5,6,7,8,9) # c() concatenates these numbers to a vector plot(vector1) mean(vector1) # mean() computes the mean of the input vector ?mean example(mean) # shows examples of what can be done with a function help.search('mean')
#############################################################################################
Programming your own function
my.mean <- sum(vector1) / length(vector1) my.fun <- function(x){sum(x)/length(x)} my.fun(vector1)
vector2 <- c(2,4,6,8,10,12) # functions are universally applicable my.fun(vector2)
to return to our initial example
projfun <- function(inipop,g,time){inipop * (1+g)^time} projfun(100,0.02,35) projfun(50,0.01,10)
#############################################################################################
To load further functions not included in the base-distributions, you need to install and load additional packages
install.packages(c('reshape','plyr'))
library(reshape) # this loads the library containing the functions you are interested in require(plyr) # basically the same as library(), however, this one does not report errors
#############################################################################################
Different data types in R
#############################################################################################
-- numeric data --
vector1 mode(vector1) #
vector1 + 4 # basic arithmetic operations can be applied to numeric data ... vector1 * 4 vector1 / 4
vector1sq <- vector1 ^ 2 # square vector1 sqrt(vector1sq) # take square root exp(vector1) # exponential function log(vector1) # log function with Euler base as default #############################################################################################
-- character data --
"asdf123.-4" # everything that's between "" or '' is a called a 'string'; character data is made up of such strings c("asdf123.-4","bdl0320d;./") # we can create vectors of strings educ <- c("Prim", "Ter", "Sec", "Sec", "Ter", "Prim") # we can assign such vectors into objects mode(educ)
It is not possible to calculate with character data, but one can do other things
educ + 4 toupper(educ) # capitalizes all letters tolower('ERICH') substr(educ,start=1,stop=3) # return sub-string ranging from position 1 to 3 substr(educ,1,1) substr(educ,1,1) <- 'X' # can also be used for replacing parts of strings educ
paste(c('a','b','c'),1:3,sep='_') # concatenates vectors after creating character strings paste(c('a','b','c'),1:3,sep='-',collapse=', ') # collapses the vector into one string using the desired separator
agegr <- paste(c(0,16,65),c(15,64,99),sep='-') agegr paste(agegr,collapse=', ') #############################################################################################
-- logical data --
my_logical <- vector1 > 5 # creates a logical vector mode(my_logical)
my_logical + 4 # in arithmetic operations, logical values will be interpreted as 0 or 1 c(1,2,3) * my_logical
logical operators
vector1 < 4 vector1 <= 4 vector1 > 4 vector1 >= 4 vector1 == 4 vector1 != 4 (vector1 > 3) & (vector1 < 8) # which ones are greater than 3 AND smaller than 8 (vector1 < 3) | (vector1 > 8) # which ones are smaller than 3 OR greater than 8
is.element(4,vector1) # Is 4 an element of vector1? 4 %in% vector1 # Same as above c(4,5,15) %in% vector1 # Which one of 4, 5, and 15 can be found in vector1? !c(4,5,15) %in% vector1 # Which of these numbers can NOT be found in vector1? 'b' %in% c('a','a','a','b','a') #############################################################################################
-- factorial data --
sex <- c("male","male","female","male","female") # this is a character object sex <- factor(sex) # this transforms it to a factor levels(sex)
my_factor <- factor(educ) # transforming our character object to a factor my_factor str(my_factor) # this is now a factor with three levels levels(my_factor) # the levels can be accessed separately #############################################################################################
-- missing data --
missings <- c(1,3,NA,2,4,NA,9) is.na(missings) # Which values of this vector are missing values? mean(missings) # many functions have an option to deal with missing values mean(missings,na.rm=T) # should not available numbers be removed?
placeholder <- NULL placeholder[1] <- 'Austria' placeholder[2] <- 'Sweden' placeholder[4] <- 'Russia' placeholder
other ways of getting either missing or infinite values
0/0 1/0 -1/0 #############################################################################################
R objects
#############################################################################################
-- vectors --
vector3 <- c(vector1,vector2) # c() can also be used to 'concatenate' two vectors vector3
vector4 <- c(vector2,vector1) # the order of the vectors inside c() matters vector4
1:100 # creates a sequence from 1 to 9 seq(from=1,to=100,by=1) seq(1,100,1) # if you don't name your function arguments, make sure they are in the right place ?seq seq(1,100,20) seq(1,100,length.out=2)
rep(2,times=5) # creates a vector that reproduces 2 five times rep(seq(1,4),times=2) # reproduces the sequence twice rep(seq(1,4),each=2) # reproduces each element of the sequence twice
#############################################################################################
-- matrices --
x <- c(1:8) dim(x) <- c(2,4) # assigns a certain dimension to a vector and turns it into a matrix x dim(x)
matrix(data=1:15,nrow=3,ncol=5) # creates a matrix with the elements ranging from 1 to 15 ordered in three rows and five columns matrix(data=1:15,nrow=5,ncol=3) # creates a matrix with the elements ranging from 1 to 15 ordered in five rows and three columns matrix(data=1:15,nrow=5,ncol=3,byrow=T) # fills the matrix by row, not by column (default) matrix(1,nrow=5,ncol=3) # creates a matrix with the elements ranging from 1 to 15 ordered in five rows and three columns
diag(1,5,5)
matrix1 <- cbind(vector3,vector4) # column bind ... creates a matrix with two columns and preserves the names matrix1 matrix1 <- cbind(x=vector3,y=vector4) # you can name the columns yourself matrix1 matrix2 <- rbind(vector3,vector4) # row bind ... creates a matrix with two rows matrix2
matrix2 c(1,2) # we can multiply a matrix with a vector matrix1 c(1,2) # first column by first element of the vector, second by second ...
matrix2 * c(1,2,3) # sometimes the result doesn't make much sense: R 'recycles' the vector
for actual matrix multiplication, use %*%
matrix1 %% c(1,2) # 15x2 2x1 = 15x1
m1 <- matrix(1,2,3) m2 <- matrix(1:6,2,3) m1 %% m2 # non-conformable m1 %% t(m2) # 2x3 * 3x2 = 2x2
matrix(1:6,2,3) %% matrix(1:12,3,4) # 2x3 3x4 = 2x4
#############################################################################################
-- data frames --
a data frame can contain both numeric and character variables
country1 <- c("USA", "Japan", "Sweden", "France") e0 <- c(80.69, 86.04, 83.12, 84.36) DataMort <- data.frame(country=country1, lifexp=e0) DataMort str(DataMort)
as.data.frame(matrix1) # a data frame can be created from a matrix
data.frame(vector3,vector4) # creates a data frame with two variables: vector1 and vector2
df1 <- data.frame(vector3, vector4, seq(from=1,to=length(vector4)), seq(1,50,length.out=length(vector4))) df1
df1 <- data.frame(x1=vector3, x2=vector4, x3=seq(from=1,to=length(vector4)), x4=seq(1,50,length.out=length(vector4))) # pass variable names on to the data frame names(df1)
names(df1) <- c('x1','x2','x3','x4') # names of data frames can also be assigned later on
df1$country <- rep(c('Mexico','Sweden','Japan'),each=5) # alternatively, variables can be added using '$' df1[,'country'] #show only the column with the country names unique(df1[,'country'])
?expand.grid # creates a data frame from all combinations of the supplied input vectors expand.grid(x=c(1,3,5),y=c('low','middle','high'))
merging two data frames
capitals <- c('Washington','Tokio','Stockholm','Paris') m0 <- c(0.0062, 0.0025, 0.0025, 0.0034) DataMort2 <- data.frame(country=country1, capital=capitals, IMR=m0) DataMort2
DataMort3 <- merge(DataMort,DataMort2) # combines two data frames based on their joint columns DataMort3
#############################################################################################
-- lists --
vector1 matrix2 df1 data_source <- 'Erich`s fantasy'
mylist <- list(my_vector=vector1, my_matrix=matrix2, my_dataframe=df1, source=data_source) mylist #############################################################################################
Exercises
#############################################################################################
Indexing in R
#############################################################################################
-- indexing a vector with [] --
vector1[2] # indexing a vector to show only its second element
vector1[3:6] # give me elements 3 to 6
vector1[vector1 < 5] # logicals can be used for indexing as well - only values of vector1 that are smaller than 5 my_indx <- vector1 < 5 # create a separate index object vector1[my_indx]
vector1[12] # does my vector even have that many elements? length(vector1)
-- indexing a matrix with [] --
matrix1[2,3] # failed attempt to index a matrix (only has two columns!) dim(matrix1) # dimension of matrix1 nrow(matrix1) # number of rows of matrix1 ncol(matrix1) # number of columns of matrix1
matrix1[3,2] # third row element of the second column of matrix1 matrix1[c(3,5,9),2] # third, fifth, and ninth row element of the second column of matrix1
indx <- 3:9 # create separate index to use inside matrix in next line matrix1[indx,2] # third to the ninth row element of the second column of matrix1 matrix1[,2] # specifying just one of row- or column-index gives the entire column or row
-- indexing with names
e0 # vector with life expectancies at birth names(e0) names(e0) <- country1 e0['Japan']
matrix1 colnames(matrix1) rownames(matrix1) rownames(matrix1) <- 1970:1984
matrix1['1980','y'] # we can use row and column names (if available) for indexing matrix1[,'y']
df1 df1[,'country'] #indexing data frames
-- some useful functions for indexing/extracting data --
attach(df1) # makes the variables within df1 globally available x1 mean(x1) x4 <- x1 + x2 + x3 5 detach(df1) # to avoid confusion, the data frame needs to be detached again x1 # now no longer available in the global environment x4 # still available because it was created globally df1$x4 <- x4 # '$' can also be used to add further variables to data frames df1 x2 attach(df1) with(df1,x1 + x4 - x2 3 + x1) # does what it says 'with' some object with(df1,table(country)) with(df1,table(country,x4)) with(df1,paste(country,x1,sep='')) df1$x5 <- with(df1,paste(country,x1,sep='')) df1
df2 <- transform(df1,x6 = x4 ^ 2 - sqrt(x3)) # allows you to add variables to a data frame transforming its other variables df2
df2 <- within(df2,{x7 <- x1 + 2 - x3 * 4 + x6 # alternative to transform(), but a bit more complicated. Evaluates an expression 'within' an object x8 <- paste(country,x4,sep='_') x9 <- mean(x1)}) df2 df3 <- transform(df2,x10=x9-1) df3
with(df1,x1my.mean) with(df1,x4my.mean) with(df3,mean(x4)) mean(df3[,'x4']) with(df3,table(country)) with(df3,table(country,x4)) table(df3$country, df3$x4)
within(df3, { x11<- x1+1.5 x4*2 x12<- mean(x1) x13<- paste(country,rep(1:5,times=3)x1,sep'-') })
subset(x=df3,country='Sweden')
DataMort3 subset(DataMort3,lifexp > 84) # pick only those countries where life-expectancy is bigger than 84 years subset(DataMort3,IMR < 0.003) # countries where infant mortality is smaller than 0.003 subset(DataMort3,IMR < 0.003 & lifexp > 84) # both selection criteria combined subset(DataMort3,IMR < 0.003,select=c(country,IMR)) # keep only variables 'country' and 'IMR' subset(DataMort3,IMR < 0.003,select=-c(capital)) subset(DataMort3,IMR < 0.003,select=-c(capital,lifexp)) # drop variables 'capital' and 'lifexp' subset(DataMort3,country %in% 'Japan') subset(DataMort3,country %in% c('Sweden','Japan')) # checks the variable country one by one if its value is in this vector
subset(DataMort3,country %in% c('Sweden','Japan') & lifexp<85) mylist str(mylist) mylist$my_matrix mylist$source mylist[[2]][1,3] mylist[[3]]
vector1 min(vector1) min(matrix1) range(matrix1)
-- indexing data frames and lists with '$' --
df1$x2 # '$' gives you one column of a data frame df1$x1 + df1$x2
lm(x5x1+x2+country,data=df1)
mylist str(mylist) # seeing the list structure helps indexing it mylist$my_dataframe # lists can be indexed with a '$' sign mylist[[3]] # or using double square brackets [] mylist$my_dataframe$country mylist[[3]]$country mylist[[3]][,5]
#############################################################################################
Basic Functions in R
#############################################################################################
length(vector1) #returns the length of a vector nrow(matrix1) #returns number of rows of a matrix or data frame ncol(matrix1) #returns number of columns of a matrix or data frame dim(matrix1) #returns number of dimensionality of a matrix or data frame
min(vector1) #returns the minimum of an object max(vector1) #returns the maximum of an object range(vector1) #returns the minimum and the maximum of an object range(matrix1,na.rm=T) vector9 <- c(1,4,12,3,NA,5,NA,8)
vector9 max(vector9) max(vector9,na.rm=T) summary(matrix1) summary(df1)
summary(vector1) # generic function = output depends on input object's type quantile(vector4) # this is just the same as empirical quantiles from quantile() quantile(vector4,probs=c(0,0.5,1)) # you can also set the quantiles yourself summary(matrix1) quantile(vector4,probs=c(0,0.5,1)) quantile(vector4,probs=c(0,0.2,1))
which.min(vector4) # returns the position of the minimum within a vector which.max(vector4) # returns the position of the maximum within a vector vector4[which.max(vector4)] df3 which.max(df3$x4) wm <- with(df3,which.max(x4)) df3[wm,] grep(7,1:9) # gives the position of a certain value within a vector grep('b',c('a','b','c')) # also works with character data grep('b',c('a','b','b','c')) # it returns all the occurences grep('b',c('a','b','ba','c')) # also finds strings that contain a specific pattern as an element - do we want that? match('b',c('a','b','ba','c')) # If you want the exact pattern, use match() grep('b',c('a','b','ba','c'))
vector5 <- seq(-0.5,0.5,0.08) vector5 vector14 <- seq(.5,1,.008) vector14 round(vector14,digits=1) ceiling(vector1410) floor(vector1410) ?floor
abs(vector5) #returns the absolute value(s) round(vector5,digits=3) #rounds to a given number of digits round(vector5,digits=1) #rounds to a given number of digits floor(vector510) #returns the next smaller whole number ceiling(vector510) #returns the next bigger whole number with(df3,cov(x1,x3)) x1 x3 vector1 rev(vector1) vector3 sort(vector3)
sort(vector3, decreasing=T) index2 <-order(df1$x3) df3[index2,]
install.packages('plyr') library(Rcpp) library(plyr) index2 <-order(df1$x2) df1[index2,] arrange(df1,x2) arrange(df1,-x2)
with(df3,cor(x1,x3)) sum(vector1) #sums up all elements of vector1 rowSums(matrix1) # computes the sum of each row within a matrix or data frame colSums(matrix1) # computes the sum of each column within a matrix or data frame
ptab <- prop.table(matrix1, margin=NULL) # computes a table of proportions (margins can be defined) sum(ptab) ptab1 <- prop.table(matrix1, margin=1) # proportion within row rowSums(ptab1) # rows sum up to 1 ptab2 <- prop.table(matrix1, margin=2) # proportion within column colSums(ptab2) # columns sum up to 1
mean(vector1) # computes the mean of vector1 sd(vector1) # computes the standard deviation var(vector1) # computes the variance
cov(df1$x2,df1$x4) # computes the covariance between two variables cor(df1$x2,df1$x4) # computes the correlation between two variables
#############################################################################################
Sorting data in R
#############################################################################################
vector1 # is already sorted, but what if we want to revert the order? rev(vector1) sort(vector1,decreasing=T)
sort(vector4) # by default, sort() orders in ascending order sort(vector4,decreasing=T)
df1 df1[order(df1$x2),]
install.packages('plyr')
library(plyr) arrange(df1,x2) # sorts the data frame by column x2 smallest to largest arrange(df1,-x2) # same thing but largest to smallest
#############################################################################################
Exercises day2
vector15<- seq(20,1000,length=15) vector15 min(vector15) min(vector15) range(vector15) matrix20 <- matrix(data=1:200,nrow=20,ncol=10) matrix20 rowsums(matrix20)
#############################################################################################
Reading data into R
#############################################################################################
install.packages('foreign') # for reading in data formats from other software
library(foreign) # have a look at the package help file to see which formats are available
getwd() # asks for the current working directory setwd('C:\Dropbox\ESKC\Bangalore\Data') # sets working directory
pop_india <- read.table(file='wicpop.csv',sep=',',skip=8,header=T,na.strings=NA) str(pop_india) head(pop_india) tail(pop_india)
pop_india2 <- read.csv(file='wicpop.csv',skip=8) head(pop_india2)
identical(pop_india2,pop_india) # logical function to test if two objects are identical rm(pop_india2)
-- Writing/Saving Data in R --
write.table(x=pop_india,file='pop_india.txt',sep=',') # writes a table into a comma-separated text file pop_india.txt <- read.table(file='pop_india.txt',header=T,sep=',') # reads a text file head(pop_india.txt)
save(pop_india,file='pop_india.rda') rm(pop_india) # removes the data frame pop_india # this should give an error now load(file='pop_india.rda') head(pop_india)
save(mylist,file='mylist.rda') # also saves entire lists and other complex structures rm(mylist) load('mylist.rda') mylist
ls() # shows you the objects currently in the workspace save.image('my first steps in R.Rdata') # saves the entire workspace rm(list=ls());gc() # cleaning up the workspace and garbage collection (gc) load('my first steps in R.Rdata') # retrieve previously saved workspace ls() #############################################################################################
Exercises
#############################################################################################
Simple graphs in R
#############################################################################################
?plot ?par par() # shows you all the high-level parameters that can be changed par('mfrow') par(mfrow=c(2,2)) # changes the number of plots in one graphic device par('mfrow')
z <- seq(-3,3,0.1) d <- dnorm(z) # evaluates the density function for each value in z ?dnorm plot(d) plot(x=z,y=d) plot(z,d,type='l') # high-level plot function producing the structure of the plot title('Normal Distribution') # low-level function adding a title to an existing graphics window abline(v=0,lty=2,col='red') # adds a vertical 'red' line at 0, defines the line-type as 'dotted' abline(v=0,lty=2,col=1) # colors can also be defined by numbers abline(v=0,lty=2,col=2) abline(v=0,lty=2,col=8) abline(v=0,lty=2,col=9) # returns to 'black'
colors() # gives you an overview of all the available colors (or just google "R colors")
dev.off() # shuts down the currently active graphical device graphics.off() # shuts down all graphical devices opened in this session
-- Let's plot some real data --
asfr <- read.csv('wicasfr.csv',skip=8,na.strings=NA,header=T) fertages <- paste(seq(15,45,5),seq(19,49,5),sep='--') asfr_total <- subset(asfr,age %in% fertages,select=area:total)
with(asfr_total,plot(total[year=='2095-2100'],type='b',lwd=2,xlab='Age',ylab='ASFR',xaxt='n')) # plotting the asfr for 2095-2100 with(asfr_total,axis(side=1,labels=fertages,at=1:length(fertages))) # adding a self-defined axis with(asfr_total,plot(total[year=='2010-2015'],type='b',lwd=2,xlab='Age',ylab='ASFR',xaxt='n')) with(asfr_total,axis(side=1,labels=fertages,at=1:length(fertages)))
with(asfr_total,lines(total[year=='2095-2100'],type='b',lty=2,col=2)) # adding lines to an existing plot with(asfr_total,lines(total[year=='2055-2060'],type='b',lty=3,col=3))
legend(x='topright',inset=c(0.04,0.02),legend=paste(c(2010,2055,2095),c(2015,2060,2100),sep='-'),lty=c(1,3,2),col=c(1,3,2),lwd=c(2,1,1),bg='grey') grid(nx=NULL,ny=NULL,col='lightgrey') # plots over your nice legend - order matters! legend(x='topright',inset=c(0.04,0.02),legend=paste(c(2010,2055,2095),c(2015,2060,2100),sep='-'),lty=c(1,3,2),col=c(1,3,2),lwd=c(2,1,1),)
#############################################################################################
Exercise on line graph
-- matrix-plotting with matplot(), matlines() and matpoints() --
head(asfr) asfrbyedu2010 <- subset(asfr,year=='2010-2015' & age %in% fertages,select=no.education:post.secondary) matplot(asfrbyedu2010) matplot(asfrbyedu2010,type='l',col=1:6,lty=1,lwd=2,xlab='age',ylab='asfr',xaxt='n') axis(1,1:7,fertages)
par(mfrow=c(2,2)) matplot(asfrbyedu2010,type='l',col=1:6,lty=1,lwd=2,xlab='age',ylab='asfr',xaxt='n') axis(1,1:7,fertages)
asfrbyedu2100 <- subset(asfr,year=='2095-2100' & age %in% fertages,select=no.education:post.secondary) # y-axis values are not the same matplot(asfrbyedu2100,type='l',col=1:6,lty=1,lwd=2,xlab='age',ylab='asfr',xaxt='n') axis(1,1:7,fertages)
matplot(asfrbyedu2010,type='l',col=1:6,lty=1,lwd=2,xlab='age',ylab='asfr',xaxt='n') matlines(asfrbyedu2100,col=1:6,lty=2) # adds multiple lines to an existing plot legend(legend=names(asfrbyedu2100),lty=1,lwd=2,col=1:6,'topright',inset=0.02,title='Education Group',cex=0.6)
-- scattplots with plot() --
pwt <- read.table('pwt7.1.txt',sep=',',na.strings='na',header=T) # reading in Penn World Table data head(pwt) str(pwt)
pwt2010 <- subset(pwt,year==2010 & !is.na(rgdpch)) # we only want to plot results for those countries that report GDP per capita
par(mfrow=c(2,2)) with(pwt2010,plot(x=rgdpch,y=openc)) # with the data frame pwt2010, plot 'rgdpch' against 'openc' with(pwt2010,plot(x=rgdpch,y=openc,xlab='Real GDP per capita',ylab='Openness',main="I'm so proud of my first graph")) # adding x- and y-axis labels
gdp_range <- c(0,50000) open_range <- c(0,200)
with(pwt2010,plot(rgdpch,openc,xlab='Real GDP per capita',ylab='Openness',main="I'm so proud of my first graph", xlim=gdp_range,ylim=open_range)) # specifying the range of the x and y axis
with(pwt2010,plot(rgdpch,openc,xlab='Real GDP per capita (in 1000s)',ylab='Openness',main="...although it is meaningless", xlim=gdp_range,ylim=open_range,xaxt='n', # remove x-axis pch=19,cex=0.6)) # change the points symbols - for details google "pch symbols R"
xat <- seq(0,50000,10000) xlab <- xat / 1000 # we don't like huge numbers, that's why we plot the data in 1000s axis(side=1,at=xat,labels=xlab) # add your own x-axis grid(nx=NULL,ny=NULL,col='lightgrey') # by not specifying the number of grid-lines, R automatically alignes them with the tick marks
with(pwt2010,text(x=rgdpch,y=openc,labels=country.isocode,cex=0.7,pos=3)) # adding point lables to plot
-- other high-level plotting functions --
hist(pwt2010$rgdpch) # produces a histogram with(pwt2010,hist(rgdpch,breaks=10))
with(pwt2010,boxplot(rgdpch))
with(pwt2010,barplot(rgdpch))
#############################################################################################
Exercises
#############################################################################################
Plotting on an external device
#############################################################################################
pdf(file="C:/Dropbox/ESKC/Bangalore/Output/Indian asfr.pdf",width=7,height=8,paper="USr",bg="transparent",family="serif")
with(pwt2010,plot(rgdpch,openc,xlab='Real GDP per capita (in 1000s)',ylab='Openness',main="... and now it's in PDF!!!", xlim=gdp_range,ylim=open_range,xaxt='n', # remove x-axis pch=19,cex=0.6)) # change the points symbols - for details google "pch symbols R"
xat <- seq(0,50000,10000) xlab <- xat / 1000 # we don't like huge numbers, that's why we plot the data in 1000s axis(side=1,at=xat,labels=xlab) # add your own x-axis grid(nx=NULL,ny=NULL,col='lightgrey') # by not specifying the number of grid-lines, R automatically alignes them with the tick marks
with(pwt2010,text(x=rgdpch,y=openc,labels=country.isocode,cex=0.7,pos=3)) # adding point lables to plot
with(pwt2010,lines(lowess(x=rgdpch,y=openc),col=3,lty=2,lwd=2)) # add a locally-smoothed trendline to our previous graph
dev.off()
jpeg(filename = "C:/Dropbox/ESKC/Bangalore/Output/Indian asfr.jpg",width = 800, height = 900, units = "px", pointsize = 12, quality = 100,bg = "white",res = 140, family = "serif", restoreConsole = TRUE,type = c("cairo"))
with(pwt2010,plot(rgdpch,openc,xlab='Real GDP per capita (in 1000s)',ylab='Openness',main="...jpeg", xlim=gdp_range,ylim=open_range,xaxt='n', # remove x-axis pch=19,cex=0.6)) # change the points symbols - for details google "pch symbols R"
xat <- seq(0,50000,10000) xlab <- xat / 1000 # we don't like huge numbers, that's why we plot the data in 1000s axis(side=1,at=xat,labels=xlab) # add your own x-axis grid(nx=NULL,ny=NULL,col='lightgrey') # by not specifying the number of grid-lines, R automatically alignes them with the tick marks
with(pwt2010,text(x=rgdpch,y=openc,labels=country.isocode,cex=0.7,pos=3)) # adding point lables to plot
dev.off()
#############################################################################################
Linear regression in R
#############################################################################################
?lm lm1 <- lm(rgdpch~openc,data=pwt2010) # fits a standard linear regression with implicit intercept term
lm1 <- lm(rgdpch~openc-1,data=pwt2010) # subtracting 1 in the formula removes the intercept summary(lm1)
attach(pwt2010) lm(rgdpch~openc) # if the data frame is attached, you don't need the 'data' argument of lm() detach(pwt2010)
lm1 # lm1 is a fitted model object str(lm1) # lm1 includes loads of 'hidden' information names(lm1)
coefficients(lm1) residuals(lm1) # extract the residuals from your regression result fitted.values(lm1) # fitted values. For linear models, predict(lm1) yields the same fitted(lm1) coef(lm1)
deviance(lm1) # returns the residual sum of square formula(lm1) # shows you just the formula that you estimated
summary(lm1) # shows a summary of your regression result
plot(lm1) # by default shows four diagnostic plots for the regression par(mfrow=c(2,2)) plot(lm1) # by default shows four diagnostic plots for the regression
with(pwt2010,abline(lm(openc ~ rgdpch),col=2,lty=3,lwd=2)) # adds a trendline to the previous scatterplot
lm2 <- lm(rgdpch~openc+POP+cc+cg+ci,data=pwt2010) # multivariate linear regression summary(lm2)
lm3 <- update(lm2,.~.-POP-ci) # adds or removes regression components from an existing fitted object summary(lm3)
anova(lm3) # compares the model with a basic model with only an intercept term
lm0 <- lm(rgdpch~1,data=pwt2010) # fits a regression with with only an intercept term summary(lm0)
anova(lm0,lm3) # the same as anova(lm3) above anova(lm1,lm3) # our more sophisticated model outperforms the plotted model
-- model selection --
AIC(lm1,lm3) BIC(lm1,lm3)
drop1(lm3) # goes from general to specific and tells you not to drop any of the variables add1(lm0,scope= ~ openc + I(openc^2) + ci + cg) # adds one at a time from the given scope
?nlm # non-linear models ?glm # general linear models, i.e. binomial, logistic, poisson regression ... ?lowess # local approximation model with(pwt2010,lowess(x=rgdpch,y=openc))
with(pwt2010,lines(lowess(x=rgdpch,y=openc),col=3,lty=2,lwd=2)) # add a locally-smoothed trendline to our previous graph
#############################################################################################
Exercise
#############################################################################################
Programming Tools
#############################################################################################
-- conditionals --
life.expectancy <- 70 if (life.expectancy == 70) print('Life expectancy has reached 70 years') # if ( condition is TRUE ) do_this
life.expectancy <- 68 if (life.expectancy == 70) print('Life expectancy has reached 70 years')
if (life.expectancy >= 70) print(paste('Life expectancy has reached ',life.expectancy,' years',sep='')) if (life.expectancy < 70) print(paste('Life expectancy has reached ',life.expectancy,' years',sep=''))
conditional statements can be long and complex but they then require {}
le.lag <- NULL # initialization if (life.expectancy < 70){ print(paste('Life expectancy has reached ',life.expectancy,' years',sep='')) le.lag <- round(life.expectancy,-1) - life.expectancy # How many years missing to 70? } le.lag
if statement can be complemented with 'else' in case condition is not met
life.expectancy <- 78 if (life.expectancy < 70){ # conditional statements can be long and complex but they then require {} print(paste('Life expectancy has reached ',life.expectancy,' years',sep='')) le.lag <- round(life.expectancy,-1) - life.expectancy # How many years missing to 70? } else {le.lag <- 'life expectancy is greater or equal to 70'} le.lag
-- repetitive execution --
countries <- c('DK','IT','FR') for (i in countries) print(i) # for each i (the counter) in sequence of characters do something for (i in 1:3) print(countries[i]) # for each i in sequence of numeric values do something
the statement to be executed repetitively can be more complex and spread over several lines
for (i in countries){ print(i) idat <- subset(dat,country==i) print(with(idat,table(sex))) }
with(dat,table(country,sex))
for-loops and if-statements can be combined
for (i in seq(2,8,2)){ if (i < 5){ print(i) print('this number is too small') } else { print(i) print('from here on we round up')} }
for (i in seq(2,8,2)){ if (i < 5){ some_statement <- 'apples' } else { some_statement <- 'oranges' } print(some_statement) }
you can set an additional counter apart from the loop-counter
counter <- 0 for (i in seq(0.1,0.5,0.1)){ counter <- counter + 1 print(paste('The value of my counter is ',counter,sep='')) print(paste('... although the loop-counter is only ',i,sep='')) }
loops become particularly useful when modeling time-series
h <- seq(from=1, to=8) s <- NULL
s[1] <- h[1] 10 # you could do this one by one s[2] <- h[2] 10
for(i in 3:10){ # or run a loop over it s[i] <- h[i] * 10 }
s
this can be done with huge numbers of repetitions in no time
nr.of.experiments <- 500 means.for.randoms <- NULL for (i in 1:nr.of.experiments) { random.numbers <- rnorm(100) # create a 100 normally distributed random numbers means.for.randoms[i] <- mean(random.numbers) # save the mean of those random numbers } hist(means.for.randoms) # create a histogram of those random means #############################################################################################
Exercises
#############################################################################################
Programming your own functions
#############################################################################################
hello <- function(){print('Hello World')} # function without argument always does the same hello()
hello_somebody <- function(name){print(paste('Hello ',name,sep=''))} # function with argument hello_somebody(name='Samir') hello_somebody(name=c('Samir','James'))
functions you programmed yourself work exactly like pre-programmed functions
fun1 <- function(arg1, arg2){ # define the necessary inputs of your function w <- arg1 ^ 2 # define what your function is supposed to do for each input value return(arg2 + w) # define what your function is supposed to return as a result }
fun1(arg1 = 3, arg2 = 5) # this is where the function is actually executed fun1(arg1 = 1:9, arg2 = 5) # functions can also be applied to entire vectors