Classroom code - Githubissues

# Minimal R setup for empirical analysis
# Load the wooldridge package. This package contains datasets used in Wooldridge's econometrics textbooks.
need <- c('dplyr', 'ggplot2', 'wooldridge')
have <- need %in% rownames(installed.packages())
if(any(!have)) install.packages(need[!have])
invisible(lapply(need, library, character.only = TRUE))

# Clear the workspace
rm(list = ls())

# Set random number generator seed for reproducibility
set.seed(12345)

# Set options
options(scipen = 999)

# Load the wage2 dataset from the wooldridge package.
# The wage2 dataset is a cross-sectional dataset, meaning it contains data on multiple subjects at a single point in time.
data("wage2")

# View the first few rows of the dataset.
# This helps to get a quick overview of the data, including variable names and types of data (e.g., numeric, factor).
head(wage2)

# Summary of the dataset.
# This provides a more detailed view of the dataset, including statistics like mean, median, and range for numeric variables,
# and counts for categorical variables. It's useful for getting a sense of the data's distribution and structure.
summary(wage2)

# The AirPassengers dataset is a classic time series dataset in R, which contains
# the monthly totals of international airline passengers from 1949 to 1960.

# View the first few entries of the AirPassengers dataset
# This dataset is already in a time series format.
data("AirPassengers")
head(AirPassengers)

# Summary of the dataset
# This provides a quick overview of the dataset, including key statistics.
summary(AirPassengers)

# Check the structure of the dataset
# This shows the start and end times, as well as the frequency of the time series data.
str(AirPassengers)

# Plotting the time series data
# This helps to visualize trends, seasonality, and other patterns in the data over time.
plot(AirPassengers, xlab = "Year", ylab = "Number of Passengers",
     main = "Monthly International Airline Passengers (1949-1960)")

# Load the fertil1 dataset
data("fertil1")

# View the first few rows of the dataset
# This helps to understand the structure of the data and the variables included.
head(fertil1)

# Summary of the dataset
# Provides a statistical overview of the variables.
summary(fertil1)

# Explore the structure of the dataset
# This is to check variables and data types.
str(fertil1)

# If you want to perform some basic analysis to showcase the pooled nature, 
# you can compare some statistics across years.
# For instance, comparing average number of children born to women (children) across different years.
fertil1 %>%
  group_by(year) %>%
  summarize(average_children = mean(kids))

 # Load the wagepan dataset
data("wagepan")

# View the first few rows of the dataset
# This helps to understand the structure of the panel data, showing repeated observations for the same individuals over time.
head(wagepan)

# Summary of the dataset
# Provides a statistical overview of the variables.
summary(wagepan)

# Explore the structure of the dataset
# This is to check variables, data types, and the panel structure.
str(wagepan)

# Basic panel data analysis example: examining average wages over time for a subset of individuals
# Here we calculate the average wage for each year.

average_wages <- wagepan %>%
  mutate(wage = exp(lwage)) %>%
  group_by(year) %>%
  summarize(average_wage = mean(wage, na.rm = TRUE))

average_wages

# You can also examine changes in wages for specific individuals over time.
# Selecting an individual and examining their wage progression.
individual_id <- unique(wagepan$nr)[1]
individual_wage_data <- wagepan[wagepan$nr == individual_id, ]
individual_wage_data

# Load necessary libraries # 'dplyr' is used for data manipulation, 'ggplot2' for plotting, and 'wooldridge' contains datasets from Wooldridge's econometrics textbooks. need <- c('dplyr', 'ggplot2', 'wooldridge') have <- need %in% rownames(installed.packages()) # Install any of the packages if they are not already installed if(any(!have)) install.packages(need[!have]) # Load the packages into R session invisible(lapply(need, library, character.only = TRUE)) # Clear the workspace # This removes all objects from the current workspace to avoid conflicts rm(list = ls()) # Set the seed for random number generation # This ensures reproducibility of results that involve random processes set.seed(12345) # Set scientific notation options # Prevents R from converting numbers into scientific notation options(scipen = 999) # Load the 'card' dataset from the 'wooldridge' package # This dataset is often used for econometric analysis data("card") # Data preparation and transformation # Select relevant columns: id, education, IQ, and wage # Create a binary variable 'BA' indicating whether education is 16 years or more card <- card %>% select(id, educ, IQ, wage) %>% mutate(BA = as.numeric(educ >= 16)) # Summary statistics # Group data by the 'BA' variable and calculate mean IQ and wage for each group t1 <- card %>% group_by(BA) %>% summarise(mean_IQ = mean(IQ, na.rm = TRUE), mean_wage = mean(wage, na.rm = TRUE)) # Print the summary table print(t1) # Load the Anscombe's quartet dataset and select the first set of variables data("anscombe") df <- anscombe[, c("x1", "y1")] colnames(df) <- c("x", "y") # Fit a linear model to the data model <- lm(y ~ x, data = df) # Calculate predicted values and residuals df$predicted_y <- predict(model) df$residuals <- residuals(model) # Create the plot ggplot_object <- ggplot(df, aes(x = x, y = y)) + # Plot the observed values geom_point(color = "blue", size = 4, shape = 1) + # Plot the predicted values geom_point(aes(y = predicted_y), color = "red", shape = 4, size= 4) + # Add lines for residuals geom_segment(aes(xend = x, yend = predicted_y), linetype = "dotted") + # Add a regression line geom_smooth(method = "lm", se = FALSE, color = "green", linetype = "solid") + # Labels and theme ggtitle("Residuals for Anscombe's Quartet (x1, y1)") + theme_minimal() + labs(x = "X", y = "Y") # print(ggplot_object) # Define the function plot_ols_with_residuals <- function(beta_0, beta_1) { # Load the first dataset of Anscombe's quartet data("anscombe") df <- anscombe[, c(1,5)] colnames(df) <- c("x", "y") # Compute predicted values df$predicted_y <- beta_0 + beta_1 * df$x # Calculate residuals df$residuals <- df$y - df$predicted_y # Compute sum of squared residuals ssr <- sum(df$residuals^2) # Create scatter plot with regression line and display SSR and regression formula ggplot(df, aes(x = x, y = y)) + geom_point() + geom_abline(intercept = beta_0, slope = beta_1, color = "blue") + ggtitle(paste("y =", beta_0, "+", beta_1, "x"," | SSR:", round(ssr, 2))) + theme_bw() } # Example usage of the function plot_ols_with_residuals(beta_0 = 2, beta_1 = 0.7)

residuals_cigs_from_faminc <- resid(lm(cigs ~ faminc, data = bwght))############################################################################### # list the packages we need and loads them, installs them automatically if we don't have them # add any package that you need to the list need <- c('dplyr','ggplot2','tidyr', 'httr','wooldridge','stargazer') have <- need %in% rownames(installed.packages()) if(any(!have)) install.packages(need[!have]) invisible(lapply(need, library, character.only=T)) #Session -> set directory -> getwd() setwd(getwd()) #this clears the workplace rm(list = ls()) #this sets the random number generator seed to my birthday for replication options(scipen=999) ############################################################################### #see https://cran.r-project.org/web/packages/wooldridge/wooldridge.pdf #get the data data("bwght") # Without including faminc model_without_faminc <- lm(bwght ~ cigs, data = bwght) # Including faminc model_with_faminc <- lm(bwght ~ cigs + faminc, data = bwght) stargazer(model_without_faminc, model_with_faminc, type = "text") cor(bwght$cigs, bwght$faminc, use = "complete.obs") # Without including faminc residuals residuals_bwght_from_faminc <- resid(lm(bwght ~ faminc, data = bwght)) residuals_cigs_from_faminc <- resid(lm(cigs ~ faminc, data = bwght)) model_partialled_out <- lm(residuals_bwght_from_faminc ~ residuals_cigs_from_faminc) stargazer(model_without_faminc, model_with_faminc,model_partialled_out, type = "text")

############################################################################### # list the packages we need and loads them, installs them automatically if we don't have them # add any package that you need to the list need <- c('dplyr','ggplot2','tidyr','httr','wooldridge','stargazer') have <- need %in% rownames(installed.packages()) if(any(!have)) install.packages(need[!have]) invisible(lapply(need, library, character.only=T)) #Session -> set directory -> getwd() setwd(getwd()) #this clears the workplace rm(list = ls()) #this sets the random number generator seed to my birthday for replication options(scipen=999) ############################################################################### #see https://cran.r-project.org/web/packages/wooldridge/wooldridge.pdf #get the data set.seed(123) data("bwght") mo1 <- lm(bwght ~ cigs + motheduc, data = bwght) #add a random variable uniform between 10 and 20 bwght$randvar <- runif(nrow(bwght),10,20) mo2 <- lm(bwght ~ cigs + motheduc + randvar, data = bwght) stargazer(mo1, mo2, type = "text", title = "Regression Results", align = TRUE) mo3 <- lm(bwght ~ cigs , data = bwght) mo4 <- lm(motheduc ~ cigs, data = bwght) stargazer(mo1, mo3, mo4, type = "text", title = "Regression Results", align = TRUE) # b0_hat + B2hat + S0 biased_b0 <- 115.445 + 0.331*13.114 biased_b1 <- -0.486 + 0.331*-0.085 print(c(biased_b0, biased_b1)) # Load necessary libraries library(wooldridge) # for the dataset library(car) # for VIF calculation # Load the dataset data(wage1) # Examine the structure of the dataset (optional) str(wage1) # Compute and print the correlation matrix for educ, exper, and tenure correlation_matrix <- cor(wage1[, c("educ", "exper", "tenure")]) print("Correlation Matrix:") print(correlation_matrix) # Fit a linear model: wage as a function of education, experience, and tenure lm_model <- lm(wage ~ educ + exper + tenure, data = wage1) # Compute and print Variance Inflation Factors (VIF) for the predictors vif_values <- vif(lm_model) print("VIF Values:") print(vif_values) # Interpretation: # - Inspect the correlation matrix for any high correlations between predictors. # - Check the VIF values; values greater than 5 (or 10, more conservatively) indicate significant multicollinearity.

ronuraltindag / EC282

Classroom code #1