Classroom handouts - Githubissues

ronuraltindag commented 1 year ago

Handouts and the solution codes will be posted here.

ronuraltindag commented 1 year ago

HANDOUT 1

###############################################################################
# list the packages we need and loads them, installs them automatically if we don't have them
# add any package that you need to the list  
need <- c('dplyr','ggplot2','tidyr','httr','wooldridge')

have <- need %in% rownames(installed.packages()) 
if(any(!have)) install.packages(need[!have]) 
invisible(lapply(need, library, character.only=T)) 

#Session -> set directory -> 
getwd()
setwd(getwd()) 

#this clears the workplace
rm(list = ls()) 
#this sets the random number generator seed to my birthday for replication
options(scipen=999)
###############################################################################
#see https://cran.r-project.org/web/packages/wooldridge/wooldridge.pdf
#get the data
data("wage1")
data("bwght")

mean.educ <- mean(wage1$educ)
min.educ <- min(wage1$educ)
max.educ <- max(wage1$educ)
mean.wage.1976  <- mean(wage1$wage)

CPI1976 <- 56.9 
CPI2013 <- 233

mean.wage.2013 <- mean.wage.1976/56.9*233
table(wage1$female)
mean(wage1$female)

bwght$smoker <- as.numeric(bwght$cigs>0)

table(bwght$smoker)
table(bwght$cigs,bwght$smoker)

mean(bwght$cigs)
median(bwght$cigs)
hist(bwght$cigs)

t1 <- bwght %>% 
      group_by(smoker) %>% 
      summarise(mean.cigs = mean(cigs))

mean(bwght$fatheduc, na.rm = T)

sum(is.na(bwght$fatheduc))
sum(!is.na(bwght$fatheduc))

mean(bwght$faminc)
sd(bwght$faminc)

ronuraltindag commented 1 year ago

HANDOUT 2

###############################################################################
# list the packages we need and loads them, installs them automatically if we don't have them
# add any package that you need to the list  
need <- c('glue', 'dplyr','readxl',  'ggplot2','tidyr','AER','scales','mvtnorm', 
          'stargazer','httr', 'repmis','gridExtra')

have <- need %in% rownames(installed.packages()) 
if(any(!have)) install.packages(need[!have]) 
invisible(lapply(need, library, character.only=T)) 

# Save the R script to the assignment 1 folder before this
# To set up the working directory
getwd()
setwd(getwd()) #change getwd() here is you need to set a different working directory

#this clears the workspace
rm(list = ls()) 
#this sets the random number generator seed to my birthday for replication

options(scipen=999)

data(anscombe)

stargazer(anscombe, type="text", digits=2)

cor(anscombe$x1,anscombe$y1)
cor(anscombe$x2,anscombe$y2)
cor(anscombe$x3,anscombe$y3)
cor(anscombe$x4,anscombe$y4)

g1 <- ggplot(data=anscombe) +
  geom_point(aes(x=x1,y=y1), size=3) + 
  theme_bw()

g2 <- ggplot(data=anscombe) +
  geom_point(aes(x=x2,y=y2), size=3) + 
  theme_bw()

g3 <- ggplot(data=anscombe) +
  geom_point(aes(x=x3,y=y3), size=3) + 
  theme_bw()

g4 <- ggplot(data=anscombe) +
  geom_point(aes(x=x4,y=y4), size=3) + 
  theme_bw()

gridExtra::grid.arrange(g1,g2,g3,g4)

df1 <- anscombe %>%
  select(x1,y1)

b0 <- 3.2
b1 <- 0.5

fit1 <- ggplot(data=df1) +
  geom_point(aes(x=x1,y=y1), size=3) + 
  geom_abline(intercept = b0, slope=b1, colour='red') + 
  xlim(0,15) + 
  ylim(0,15) + 
  theme_bw()

df1$y.hat <- b0 +  b1*df1$x1
df1$res <- df1$y1 - df1$y.hat
df1$res.sq <- (df1$res)^2

SSR <- sum(df1$res.sq)

mo1 <- lm(data=df1, formula = y1 ~ x1)
summary(mo1)

cov.xy <- cov(df1$y1,df1$x1)

var.x <- var(df1$x1)

b1.ols <- cov(df1$x1,df1$y1)/var(df1$x1)

b0.ols <- mean(df1$y1) - b1.ols*mean(df1$x1)

fit2 <- ggplot(data=df1) +
  geom_point(aes(x=x1,y=y1), size=3) + 
  geom_abline(intercept = b0.ols, slope=b1.ols, colour='red') + 
  xlim(0,15) + 
  ylim(0,15) + 
  theme_bw()

grid.arrange(fit1,fit2)

df1$y.hat.ols <- b0.ols +  b1.ols*df1$x1
df1$res.ols <- df1$y1 - df1$y.hat.ols
df1$res.sq.ols <- (df1$res.ols)^2

SSR.ols <- sum(df1$res.sq.ols)

df1$ts <- (df1$y1 - mean(df1$y1))^2

df1$es <- (df1$y.hat.ols - mean(df1$y1))^2

ESS <- sum(df1$es)
TSS <- sum(df1$ts)

r.sq <- ESS/TSS

mo1 <- lm(data=df1, formula = y1 ~ x1)
summary(mo1)
stargazer(mo1, type="text")

mean(df1$res.ols)
mean(df1$y.hat.ols)
mean(df1$y1)
sum(df1$res.ols*df1$x1)

print(SSR + ESS) 
print(TSS)

ronuraltindag commented 1 year ago

https://www.dropbox.com/s/fa36x3ayjhve330/Lec3.docx?dl=0

###############################################################################
# list the packages we need and loads them, installs them automatically if we don't have them
# add any package that you need to the list  
need <- c('glue', 'dplyr','readxl',  'ggplot2','tidyr','AER','scales','mvtnorm', 
          'stargazer','httr', 'repmis','gridExtra', 'wooldridge')

have <- need %in% rownames(installed.packages()) 
if(any(!have)) install.packages(need[!have]) 
invisible(lapply(need, library, character.only=T)) 

# Save the R script to the assignment 1 folder before this
# To set up the working directory
getwd()
setwd(getwd()) #change getwd() here is you need to set a different working directory

#this clears the workspace
rm(list = ls()) 
#this sets the random number generator seed to my birthday for replication

# Load the Wooldridge package
library(wooldridge)

# Load the k401k dataset
data('k401k')

# Display the first few rows of the dataset
head(k401k)

# Calculate the mean values for 'prate' and 'mrate'
mean_prate <- mean(k401k$prate)
mean_mrate <- mean(k401k$mrate)

# Print the mean values
cat("Mean prate:", mean_prate, "\n")
cat("Mean mrate:", mean_mrate, "\n\n")

# Run a regression of 'prate' on 'mrate'
model <- lm(prate ~ mrate, data = k401k)

# Display the coefficients (intercept and slope) of the regression
coefficients <- coef(model)
intercept <- coefficients[1]
slope <- coefficients[2]

# Print the intercept and slope
cat("Intercept:", intercept, "\n")
cat("Slope:", slope, "\n\n")

# Interpretation of the slope:
# For each unit increase in 'mrate', 'prate' is expected to increase/decrease by 'slope' units.

# Find the predicted 'prate' using 'mrate = 3.5'
new_mrate <- 3.5
predicted_prate <- predict(model, data.frame(mrate = new_mrate))

# Print the predicted 'prate'
cat("Predicted prate when mrate is 3.5:", predicted_prate, "\n")

ronuraltindag commented 1 year ago

Function form - LOG specifications

# Load the 'wage2' dataset from R's available datasets. This dataset contains information on individuals' wages and other attributes like IQ.
data("wage2")

# Display a summary statistics table for the 'wage2' dataset using the 'stargazer' package. The 'digits=2' argument formats the table to show numbers with two decimal places. The 'type="text"' argument specifies that the output should be in plain text format.
stargazer(wage2, digits=2, type="text")

# LEVEL-LEVEL MODEL
# This regression models the relationship between monthly wage (dependent variable) and IQ (independent variable) on a level-to-level basis.
# It implies that a one-point increase in IQ is associated with an $8.3 increase in monthly wage.
wage.reg1 <- lm(data=wage2, formula = wage ~ IQ)

# LOG-LEVEL MODEL
# First, this line creates a new variable in the 'wage2' dataset that is the natural logarithm of the wage variable. This transformation is common in economic analysis to help linearize relationships and interpret coefficients in percentage terms.
wage2$log.wage <- log(wage2$wage)
# This regression models the relationship between the log of wage (dependent variable) and IQ (independent variable).
# The interpretation is that a one-point increase in IQ is associated with a 0.9% increase in monthly wage, due to the log transformation of the dependent variable.
wage.reg2 <- lm(data=wage2, formula = log(wage) ~ IQ)

# LEVEL-LOG MODEL
# This regression models the relationship between monthly wage (dependent variable) and the natural logarithm of IQ (independent variable).
# The interpretation here involves elasticity. A 100% increase in IQ (which is a very large and mostly hypothetical change) is associated with a $778.6 increase in predicted monthly wage. For a more realistic change, a 1% increase in IQ is associated with a $7.786 increase in predicted wage. This shows how changes in the independent variable (IQ, in log form) relate to absolute changes in the dependent variable (wage).
wage.reg3 <- lm(data=wage2, formula = wage ~ log(IQ))

# LOG-LOG MODEL (Full Elasticity Estimation)
# This regression models the relationship between the log of wage (dependent variable) and the log of IQ (independent variable), making it a full elasticity model.
# The coefficient can be interpreted as the elasticity of the dependent variable with respect to the independent variable. Here, a 1% increase in IQ is associated with a 0.8% increase in predicted monthly wage. Similarly, a 10% increase in IQ would be associated with an 8% increase in predicted wage. This model allows us to understand proportional changes in both variables.
wage.reg4 <- lm(data=wage2, formula = log(wage) ~ log(IQ))

# Finally, this line uses 'stargazer' again to display a summary table for all four regression models created above. The 'digits=3' argument formats the table to show numbers with three decimal places, and 'type="text"' specifies the output format as plain text.
stargazer(wage.reg1, wage.reg2, wage.reg3, wage.reg4, type="text", digits=3)

ronuraltindag commented 1 year ago

https://www.dropbox.com/s/vffsxkrwm6qqm61/Lec4.docx?dl=0

ronuraltindag commented 1 year ago

# Generate data
set.seed(12334)
x <- rnorm(100000, mean = 10, sd = 2)
y <-  10 + 2*x + rnorm(100, mean = 0, sd = 1)

population.model <- lm(y~x)
stargazer::stargazer(population.model, type="text", digits = 2)

# Create empty vectors to store the slope coefficients
slopes <- rep(0, 10000)

# Run 10,000 regressions and store the slope coefficients
for (i in 1:10000) {
  samp_index <- sample(1:100000, 50, replace = TRUE)
  x_samp <- x[samp_index]
  y_samp <- y[samp_index]
  slopes[i] <- lm(y_samp ~ x_samp)$coefficients[2]
}

# Plot the distribution of slope coefficients
hist(slopes, breaks = 30, main = "Distribution of Slope Coefficients", 
     xlab = "Slope Coefficient", xlim = c(1.2, 2.8), col = "lightblue")

# Add a vertical line at the true slope value
abline(v = 2, lty = 2, col = "red")

# Add a normal curve to the histogram
curve(dnorm(x, mean = mean(slopes), sd = sd(slopes)), add = TRUE, col = "darkblue", lwd = 2)

library(wooldridge)
library(stargazer)

data('attend')

stargazer(attend, type="text")

mo1 <- lm(data=attend, final ~ missed + priGPA + ACT)

residuals <- residuals(mo1)
SSR <- sum(residuals^2)

sigma.hat.sq = SSR/676

SSTj <- sum((attend$missed - mean(attend$missed))^2)

#auxiliary reg 
mo.aux <- lm(data=attend, missed ~ priGPA + ACT)

stargazer(mo1, mo.aux, type="text")

r.sq.j <- as.numeric(summary(mo.aux)["r.squared"])

var.hat.b1 <- sigma.hat.sq/(SSTj*(1-r.sq.j))
se.hat.b1 <- sqrt(var.hat.b1)

ronuraltindag commented 5 months ago

https://www.dropbox.com/scl/fi/f89d3ej5aa7lchtju2bnj/Lec1.docx?rlkey=9z5iqqf1hai9gdxaktqfgzpto&dl=0

ronuraltindag commented 5 months ago

https://www.dropbox.com/scl/fi/wv0rsg3vampj6v4agr35n/Lec2.pdf?rlkey=ilrmeh0ba8h80suebyw9sl9pa&dl=0

ronuraltindag commented 2 months ago

# clear the workspace
rm(list=ls())
# Set the working directory to the directory where the currently running script is located.
# If using RStudio, you can set the working directory to the location of the currently open script.
if (interactive()) {
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
}

# Load the RData file named "discrim_v1" into the R environment.
load("discrim_v1.RData")

# Run a linear regression with log(psoda) as the dependent variable and prpblck, log(income), prppov as independent variables.
mo1 <- lm(log(psoda) ~ prpblck + log(income) + prppov, 
data = Data)

stargazer::stargazer(mo1, type = "text", digits = 3)

cor(Data$prppov, log(Data$income), use = "complete.obs")

round(pt(0.136/0.028, df = 376, lower.tail = FALSE),4)
round(pt(0.392/0.137, df = 376, lower.tail = FALSE),4)

mo2 <- lm(log(psoda) ~ prpblck + log(income) + prppov + log(hseval), 
data = Data)

stargazer::stargazer(mo1,mo2, type = "text", digits = 3)
round(pt(0.118/0.018, df = 376, lower.tail = FALSE),4)

# Unrestricted model with all predictors
unrestricted_model <- lm(log(psoda) ~ prpblck + log(income) + prppov + log(hseval), data = Data)

# Restricted model without prpblck and prppov
restricted_model <- lm(log(psoda) ~ log(income) + log(hseval), data = Data)

# Extract R-squared from both models
R2_UR <- summary(unrestricted_model)$r.squared
R2_R <- summary(restricted_model)$r.squared

# Number of observations
n <- length(unrestricted_model$residuals)  # or use nrow(Data)

# Number of predictors in the unrestricted model, including the intercept
k <- length(unrestricted_model$coefficients)

# Calculate the F-statistic
p <- 2  # Number of parameters set to zero
F_value <- ((R2_UR - R2_R) / p) / ((1 - R2_UR) / (n - k - 1))

# Calculate the p-value
p_value <- pf(F_value, p, n - k - 1, lower.tail = FALSE)

print(F_value)
print(p_value)

# Comparing the unrestricted and restricted models to test the joint significance of prpblck and prppov
model_comparison <- anova(restricted_model, unrestricted_model)
print(model_comparison)

ronuraltindag / EC282

Classroom handouts #2