ronuraltindag / EC282

Introduction to Econometrics EC282
2 stars 0 forks source link

Classroom code #1

Open ronuraltindag opened 1 year ago

ronuraltindag commented 1 year ago

I will share some reference code here.

ronuraltindag commented 6 months ago
# Minimal R setup for empirical analysis
# Load the wooldridge package. This package contains datasets used in Wooldridge's econometrics textbooks.
need <- c('dplyr', 'ggplot2', 'wooldridge')
have <- need %in% rownames(installed.packages())
if(any(!have)) install.packages(need[!have])
invisible(lapply(need, library, character.only = TRUE))

# Clear the workspace
rm(list = ls())

# Set random number generator seed for reproducibility
set.seed(12345)

# Set options
options(scipen = 999)

# Load the wage2 dataset from the wooldridge package.
# The wage2 dataset is a cross-sectional dataset, meaning it contains data on multiple subjects at a single point in time.
data("wage2")

# View the first few rows of the dataset.
# This helps to get a quick overview of the data, including variable names and types of data (e.g., numeric, factor).
head(wage2)

# Summary of the dataset.
# This provides a more detailed view of the dataset, including statistics like mean, median, and range for numeric variables,
# and counts for categorical variables. It's useful for getting a sense of the data's distribution and structure.
summary(wage2)

# The AirPassengers dataset is a classic time series dataset in R, which contains
# the monthly totals of international airline passengers from 1949 to 1960.

# View the first few entries of the AirPassengers dataset
# This dataset is already in a time series format.
data("AirPassengers")
head(AirPassengers)

# Summary of the dataset
# This provides a quick overview of the dataset, including key statistics.
summary(AirPassengers)

# Check the structure of the dataset
# This shows the start and end times, as well as the frequency of the time series data.
str(AirPassengers)

# Plotting the time series data
# This helps to visualize trends, seasonality, and other patterns in the data over time.
plot(AirPassengers, xlab = "Year", ylab = "Number of Passengers",
     main = "Monthly International Airline Passengers (1949-1960)")

# Load the fertil1 dataset
data("fertil1")

# View the first few rows of the dataset
# This helps to understand the structure of the data and the variables included.
head(fertil1)

# Summary of the dataset
# Provides a statistical overview of the variables.
summary(fertil1)

# Explore the structure of the dataset
# This is to check variables and data types.
str(fertil1)

# If you want to perform some basic analysis to showcase the pooled nature, 
# you can compare some statistics across years.
# For instance, comparing average number of children born to women (children) across different years.
fertil1 %>%
  group_by(year) %>%
  summarize(average_children = mean(kids))

 # Load the wagepan dataset
data("wagepan")

# View the first few rows of the dataset
# This helps to understand the structure of the panel data, showing repeated observations for the same individuals over time.
head(wagepan)

# Summary of the dataset
# Provides a statistical overview of the variables.
summary(wagepan)

# Explore the structure of the dataset
# This is to check variables, data types, and the panel structure.
str(wagepan)

# Basic panel data analysis example: examining average wages over time for a subset of individuals
# Here we calculate the average wage for each year.

average_wages <- wagepan %>%
  mutate(wage = exp(lwage)) %>%
  group_by(year) %>%
  summarize(average_wage = mean(wage, na.rm = TRUE))

average_wages

# You can also examine changes in wages for specific individuals over time.
# Selecting an individual and examining their wage progression.
individual_id <- unique(wagepan$nr)[1]
individual_wage_data <- wagepan[wagepan$nr == individual_id, ]
individual_wage_data 
ronuraltindag commented 6 months ago
# Load necessary libraries
# 'dplyr' is used for data manipulation, 'ggplot2' for plotting, and 'wooldridge' contains datasets from Wooldridge's econometrics textbooks.
need <- c('dplyr', 'ggplot2', 'wooldridge')
have <- need %in% rownames(installed.packages())
# Install any of the packages if they are not already installed
if(any(!have)) install.packages(need[!have])
# Load the packages into R session
invisible(lapply(need, library, character.only = TRUE))

# Clear the workspace
# This removes all objects from the current workspace to avoid conflicts
rm(list = ls())

# Set the seed for random number generation
# This ensures reproducibility of results that involve random processes
set.seed(12345)

# Set scientific notation options
# Prevents R from converting numbers into scientific notation
options(scipen = 999)

# Load the 'card' dataset from the 'wooldridge' package
# This dataset is often used for econometric analysis
data("card")

# Data preparation and transformation
# Select relevant columns: id, education, IQ, and wage
# Create a binary variable 'BA' indicating whether education is 16 years or more
card <- card %>% 
        select(id, educ, IQ, wage) %>% 
        mutate(BA = as.numeric(educ >= 16))

# Summary statistics
# Group data by the 'BA' variable and calculate mean IQ and wage for each group
t1 <- card %>% 
      group_by(BA) %>% 
      summarise(mean_IQ = mean(IQ, na.rm = TRUE), 
                mean_wage = mean(wage, na.rm = TRUE))

# Print the summary table
print(t1)

# Load the Anscombe's quartet dataset and select the first set of variables
data("anscombe")
df <- anscombe[, c("x1", "y1")]
colnames(df) <- c("x", "y")

# Fit a linear model to the data
model <- lm(y ~ x, data = df)

# Calculate predicted values and residuals
df$predicted_y <- predict(model)
df$residuals <- residuals(model)

# Create the plot
ggplot_object <- ggplot(df, aes(x = x, y = y)) +
  # Plot the observed values
  geom_point(color = "blue", size = 4, shape = 1) +
  # Plot the predicted values
 geom_point(aes(y = predicted_y), color = "red", shape = 4, size= 4) +
  # Add lines for residuals
  geom_segment(aes(xend = x, yend = predicted_y), linetype = "dotted") +
  # Add a regression line
 geom_smooth(method = "lm", se = FALSE, color = "green", linetype = "solid") +
  # Labels and theme
  ggtitle("Residuals for Anscombe's Quartet (x1, y1)") +
  theme_minimal() +
  labs(x = "X", y = "Y")

#
print(ggplot_object)

# Define the function
plot_ols_with_residuals <- function(beta_0, beta_1) {
  # Load the first dataset of Anscombe's quartet
  data("anscombe")
  df <- anscombe[, c(1,5)]
  colnames(df) <- c("x", "y")

  # Compute predicted values
  df$predicted_y <- beta_0 + beta_1 * df$x

  # Calculate residuals
  df$residuals <- df$y - df$predicted_y

  # Compute sum of squared residuals
  ssr <- sum(df$residuals^2)

  # Create scatter plot with regression line and display SSR and regression formula
  ggplot(df, aes(x = x, y = y)) +
    geom_point() +
    geom_abline(intercept = beta_0, slope = beta_1, color = "blue") +
    ggtitle(paste("y =", beta_0, "+", beta_1, "x"," | SSR:", round(ssr, 2))) +
    theme_bw()
}
# Example usage of the function
plot_ols_with_residuals(beta_0 = 2, beta_1 = 0.7)
ronuraltindag commented 4 months ago
residuals_cigs_from_faminc <- resid(lm(cigs ~ faminc, data = bwght))###############################################################################
# list the packages we need and loads them, installs them automatically if we don't have them
# add any package that you need to the list  
need <- c('dplyr','ggplot2','tidyr',
         'httr','wooldridge','stargazer')

have <- need %in% rownames(installed.packages()) 
if(any(!have)) install.packages(need[!have]) 
invisible(lapply(need, library, character.only=T)) 

#Session -> set directory -> 
getwd()
setwd(getwd()) 

#this clears the workplace
rm(list = ls()) 
#this sets the random number generator seed to my birthday for replication
options(scipen=999)
###############################################################################
#see https://cran.r-project.org/web/packages/wooldridge/wooldridge.pdf
#get the data

data("bwght")

# Without including faminc
model_without_faminc <- lm(bwght ~ cigs, data = bwght)
# Including faminc
model_with_faminc <- lm(bwght ~ cigs + faminc, data = bwght)

stargazer(model_without_faminc, model_with_faminc, type = "text")
cor(bwght$cigs, bwght$faminc, use = "complete.obs")

# Without including faminc residuals 
residuals_bwght_from_faminc <- resid(lm(bwght ~ faminc, data = bwght))

residuals_cigs_from_faminc <- resid(lm(cigs ~ faminc, data = bwght))
model_partialled_out <- lm(residuals_bwght_from_faminc ~ residuals_cigs_from_faminc)

stargazer(model_without_faminc, model_with_faminc,model_partialled_out, type = "text")
ronuraltindag commented 4 months ago
###############################################################################
# list the packages we need and loads them, installs them automatically if we don't have them
# add any package that you need to the list  
need <- c('dplyr','ggplot2','tidyr','httr','wooldridge','stargazer')

have <- need %in% rownames(installed.packages()) 
if(any(!have)) install.packages(need[!have]) 
invisible(lapply(need, library, character.only=T)) 

#Session -> set directory -> 
getwd()
setwd(getwd()) 

#this clears the workplace
rm(list = ls()) 
#this sets the random number generator seed to my birthday for replication
options(scipen=999)
###############################################################################
#see https://cran.r-project.org/web/packages/wooldridge/wooldridge.pdf
#get the data
set.seed(123)
data("bwght")

mo1 <- lm(bwght ~ cigs + motheduc, data = bwght)

#add a random variable uniform between 10 and 20

bwght$randvar <- runif(nrow(bwght),10,20)
mo2 <- lm(bwght ~ cigs + motheduc + randvar, data = bwght)

stargazer(mo1, mo2, type = "text", title = "Regression Results", align = TRUE)

mo3 <- lm(bwght ~ cigs , data = bwght)
mo4 <- lm(motheduc ~ cigs, data = bwght)

stargazer(mo1, mo3, mo4, type = "text", title = "Regression Results", align = TRUE)

# b0_hat + B2hat + S0 
biased_b0 <- 115.445 +  0.331*13.114
biased_b1 <- -0.486 +  0.331*-0.085
print(c(biased_b0, biased_b1))

# Load necessary libraries
library(wooldridge) # for the dataset
library(car)       # for VIF calculation

# Load the dataset
data(wage1)

# Examine the structure of the dataset (optional)
str(wage1)

# Compute and print the correlation matrix for educ, exper, and tenure
correlation_matrix <- cor(wage1[, c("educ", "exper", "tenure")])
print("Correlation Matrix:")
print(correlation_matrix)

# Fit a linear model: wage as a function of education, experience, and tenure
lm_model <- lm(wage ~ educ + exper + tenure, data = wage1)

# Compute and print Variance Inflation Factors (VIF) for the predictors
vif_values <- vif(lm_model)
print("VIF Values:")
print(vif_values)

# Interpretation:
# - Inspect the correlation matrix for any high correlations between predictors.
# - Check the VIF values; values greater than 5 (or 10, more conservatively) indicate significant multicollinearity.