Open ronuraltindag opened 1 year ago
# Minimal R setup for empirical analysis
# Load the wooldridge package. This package contains datasets used in Wooldridge's econometrics textbooks.
need <- c('dplyr', 'ggplot2', 'wooldridge')
have <- need %in% rownames(installed.packages())
if(any(!have)) install.packages(need[!have])
invisible(lapply(need, library, character.only = TRUE))
# Clear the workspace
rm(list = ls())
# Set random number generator seed for reproducibility
set.seed(12345)
# Set options
options(scipen = 999)
# Load the wage2 dataset from the wooldridge package.
# The wage2 dataset is a cross-sectional dataset, meaning it contains data on multiple subjects at a single point in time.
data("wage2")
# View the first few rows of the dataset.
# This helps to get a quick overview of the data, including variable names and types of data (e.g., numeric, factor).
head(wage2)
# Summary of the dataset.
# This provides a more detailed view of the dataset, including statistics like mean, median, and range for numeric variables,
# and counts for categorical variables. It's useful for getting a sense of the data's distribution and structure.
summary(wage2)
# The AirPassengers dataset is a classic time series dataset in R, which contains
# the monthly totals of international airline passengers from 1949 to 1960.
# View the first few entries of the AirPassengers dataset
# This dataset is already in a time series format.
data("AirPassengers")
head(AirPassengers)
# Summary of the dataset
# This provides a quick overview of the dataset, including key statistics.
summary(AirPassengers)
# Check the structure of the dataset
# This shows the start and end times, as well as the frequency of the time series data.
str(AirPassengers)
# Plotting the time series data
# This helps to visualize trends, seasonality, and other patterns in the data over time.
plot(AirPassengers, xlab = "Year", ylab = "Number of Passengers",
main = "Monthly International Airline Passengers (1949-1960)")
# Load the fertil1 dataset
data("fertil1")
# View the first few rows of the dataset
# This helps to understand the structure of the data and the variables included.
head(fertil1)
# Summary of the dataset
# Provides a statistical overview of the variables.
summary(fertil1)
# Explore the structure of the dataset
# This is to check variables and data types.
str(fertil1)
# If you want to perform some basic analysis to showcase the pooled nature,
# you can compare some statistics across years.
# For instance, comparing average number of children born to women (children) across different years.
fertil1 %>%
group_by(year) %>%
summarize(average_children = mean(kids))
# Load the wagepan dataset
data("wagepan")
# View the first few rows of the dataset
# This helps to understand the structure of the panel data, showing repeated observations for the same individuals over time.
head(wagepan)
# Summary of the dataset
# Provides a statistical overview of the variables.
summary(wagepan)
# Explore the structure of the dataset
# This is to check variables, data types, and the panel structure.
str(wagepan)
# Basic panel data analysis example: examining average wages over time for a subset of individuals
# Here we calculate the average wage for each year.
average_wages <- wagepan %>%
mutate(wage = exp(lwage)) %>%
group_by(year) %>%
summarize(average_wage = mean(wage, na.rm = TRUE))
average_wages
# You can also examine changes in wages for specific individuals over time.
# Selecting an individual and examining their wage progression.
individual_id <- unique(wagepan$nr)[1]
individual_wage_data <- wagepan[wagepan$nr == individual_id, ]
individual_wage_data
# Load necessary libraries
# 'dplyr' is used for data manipulation, 'ggplot2' for plotting, and 'wooldridge' contains datasets from Wooldridge's econometrics textbooks.
need <- c('dplyr', 'ggplot2', 'wooldridge')
have <- need %in% rownames(installed.packages())
# Install any of the packages if they are not already installed
if(any(!have)) install.packages(need[!have])
# Load the packages into R session
invisible(lapply(need, library, character.only = TRUE))
# Clear the workspace
# This removes all objects from the current workspace to avoid conflicts
rm(list = ls())
# Set the seed for random number generation
# This ensures reproducibility of results that involve random processes
set.seed(12345)
# Set scientific notation options
# Prevents R from converting numbers into scientific notation
options(scipen = 999)
# Load the 'card' dataset from the 'wooldridge' package
# This dataset is often used for econometric analysis
data("card")
# Data preparation and transformation
# Select relevant columns: id, education, IQ, and wage
# Create a binary variable 'BA' indicating whether education is 16 years or more
card <- card %>%
select(id, educ, IQ, wage) %>%
mutate(BA = as.numeric(educ >= 16))
# Summary statistics
# Group data by the 'BA' variable and calculate mean IQ and wage for each group
t1 <- card %>%
group_by(BA) %>%
summarise(mean_IQ = mean(IQ, na.rm = TRUE),
mean_wage = mean(wage, na.rm = TRUE))
# Print the summary table
print(t1)
# Load the Anscombe's quartet dataset and select the first set of variables
data("anscombe")
df <- anscombe[, c("x1", "y1")]
colnames(df) <- c("x", "y")
# Fit a linear model to the data
model <- lm(y ~ x, data = df)
# Calculate predicted values and residuals
df$predicted_y <- predict(model)
df$residuals <- residuals(model)
# Create the plot
ggplot_object <- ggplot(df, aes(x = x, y = y)) +
# Plot the observed values
geom_point(color = "blue", size = 4, shape = 1) +
# Plot the predicted values
geom_point(aes(y = predicted_y), color = "red", shape = 4, size= 4) +
# Add lines for residuals
geom_segment(aes(xend = x, yend = predicted_y), linetype = "dotted") +
# Add a regression line
geom_smooth(method = "lm", se = FALSE, color = "green", linetype = "solid") +
# Labels and theme
ggtitle("Residuals for Anscombe's Quartet (x1, y1)") +
theme_minimal() +
labs(x = "X", y = "Y")
#
print(ggplot_object)
# Define the function
plot_ols_with_residuals <- function(beta_0, beta_1) {
# Load the first dataset of Anscombe's quartet
data("anscombe")
df <- anscombe[, c(1,5)]
colnames(df) <- c("x", "y")
# Compute predicted values
df$predicted_y <- beta_0 + beta_1 * df$x
# Calculate residuals
df$residuals <- df$y - df$predicted_y
# Compute sum of squared residuals
ssr <- sum(df$residuals^2)
# Create scatter plot with regression line and display SSR and regression formula
ggplot(df, aes(x = x, y = y)) +
geom_point() +
geom_abline(intercept = beta_0, slope = beta_1, color = "blue") +
ggtitle(paste("y =", beta_0, "+", beta_1, "x"," | SSR:", round(ssr, 2))) +
theme_bw()
}
# Example usage of the function
plot_ols_with_residuals(beta_0 = 2, beta_1 = 0.7)
residuals_cigs_from_faminc <- resid(lm(cigs ~ faminc, data = bwght))###############################################################################
# list the packages we need and loads them, installs them automatically if we don't have them
# add any package that you need to the list
need <- c('dplyr','ggplot2','tidyr',
'httr','wooldridge','stargazer')
have <- need %in% rownames(installed.packages())
if(any(!have)) install.packages(need[!have])
invisible(lapply(need, library, character.only=T))
#Session -> set directory ->
getwd()
setwd(getwd())
#this clears the workplace
rm(list = ls())
#this sets the random number generator seed to my birthday for replication
options(scipen=999)
###############################################################################
#see https://cran.r-project.org/web/packages/wooldridge/wooldridge.pdf
#get the data
data("bwght")
# Without including faminc
model_without_faminc <- lm(bwght ~ cigs, data = bwght)
# Including faminc
model_with_faminc <- lm(bwght ~ cigs + faminc, data = bwght)
stargazer(model_without_faminc, model_with_faminc, type = "text")
cor(bwght$cigs, bwght$faminc, use = "complete.obs")
# Without including faminc residuals
residuals_bwght_from_faminc <- resid(lm(bwght ~ faminc, data = bwght))
residuals_cigs_from_faminc <- resid(lm(cigs ~ faminc, data = bwght))
model_partialled_out <- lm(residuals_bwght_from_faminc ~ residuals_cigs_from_faminc)
stargazer(model_without_faminc, model_with_faminc,model_partialled_out, type = "text")
###############################################################################
# list the packages we need and loads them, installs them automatically if we don't have them
# add any package that you need to the list
need <- c('dplyr','ggplot2','tidyr','httr','wooldridge','stargazer')
have <- need %in% rownames(installed.packages())
if(any(!have)) install.packages(need[!have])
invisible(lapply(need, library, character.only=T))
#Session -> set directory ->
getwd()
setwd(getwd())
#this clears the workplace
rm(list = ls())
#this sets the random number generator seed to my birthday for replication
options(scipen=999)
###############################################################################
#see https://cran.r-project.org/web/packages/wooldridge/wooldridge.pdf
#get the data
set.seed(123)
data("bwght")
mo1 <- lm(bwght ~ cigs + motheduc, data = bwght)
#add a random variable uniform between 10 and 20
bwght$randvar <- runif(nrow(bwght),10,20)
mo2 <- lm(bwght ~ cigs + motheduc + randvar, data = bwght)
stargazer(mo1, mo2, type = "text", title = "Regression Results", align = TRUE)
mo3 <- lm(bwght ~ cigs , data = bwght)
mo4 <- lm(motheduc ~ cigs, data = bwght)
stargazer(mo1, mo3, mo4, type = "text", title = "Regression Results", align = TRUE)
# b0_hat + B2hat + S0
biased_b0 <- 115.445 + 0.331*13.114
biased_b1 <- -0.486 + 0.331*-0.085
print(c(biased_b0, biased_b1))
# Load necessary libraries
library(wooldridge) # for the dataset
library(car) # for VIF calculation
# Load the dataset
data(wage1)
# Examine the structure of the dataset (optional)
str(wage1)
# Compute and print the correlation matrix for educ, exper, and tenure
correlation_matrix <- cor(wage1[, c("educ", "exper", "tenure")])
print("Correlation Matrix:")
print(correlation_matrix)
# Fit a linear model: wage as a function of education, experience, and tenure
lm_model <- lm(wage ~ educ + exper + tenure, data = wage1)
# Compute and print Variance Inflation Factors (VIF) for the predictors
vif_values <- vif(lm_model)
print("VIF Values:")
print(vif_values)
# Interpretation:
# - Inspect the correlation matrix for any high correlations between predictors.
# - Check the VIF values; values greater than 5 (or 10, more conservatively) indicate significant multicollinearity.
I will share some reference code here.