Implementation sketch:
library(etwfe)
library(data.table)
data("mpdta", package = "did")
fwrite(mpdta, "C:/Users/alexa/Documents/data/mpdta.csv")
head(mpdta)
data = mpdta
mod = etwfe(
    fml = lemp ~ 1,        # outcome ~ controls
    tvar = year,           # time variable
    gvar = first.treat,    # group variable
    data = data,           # dataset
    vcov = ~countyreal     # vcov adjustment (here: clustered)
)
summary(mod)
# OLS estimation, Dep. Var.: lemp
# Observations: 2,500
# Fixed-effects: first.treat: 4, year: 5
# Standard-errors: Clustered (countyreal)
# Estimate Std. Error t value Pr(>|t|)
# .Dtreat:first.treat::2004:year::2004 -0.010503 0.023363 -0.449562 0.65322178
# .Dtreat:first.treat::2004:year::2005 -0.070423 0.031134 -2.261910 0.02413241 *
# .Dtreat:first.treat::2004:year::2006 -0.137259 0.036612 -3.749051 0.00019829 ***
# .Dtreat:first.treat::2004:year::2007 -0.100811 0.034525 -2.919941 0.00365931 **
# .Dtreat:first.treat::2006:year::2004 0.006520 0.023439 0.278168 0.78099831
# .Dtreat:first.treat::2006:year::2005 0.003769 0.031493 0.119685 0.90478060
# .Dtreat:first.treat::2006:year::2006 -0.000825 0.033799 -0.024418 0.98052868
# .Dtreat:first.treat::2006:year::2007 -0.037455 0.035897 -1.043403 0.29726681
# .Dtreat:first.treat::2007:year::2004 0.030507 0.015106 2.019485 0.04397085 *
# .Dtreat:first.treat::2007:year::2005 0.027781 0.019638 1.414614 0.15780542
# .Dtreat:first.treat::2007:year::2006 -0.003306 0.024570 -0.134569 0.89300678
# .Dtreat:first.treat::2007:year::2007 -0.029361 0.026561 -1.105397 0.26952021
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RMSE: 1.48661 Adj. R2: 0.021348
# Within R2: 8.696e-5
mod |> coef() |> sort() |> unname()
# [1] -0.136078114 -0.104707472 -0.078319099 -0.043106033 -0.039192736
# [6] -0.019372364 0.002513862
import pandas as pd
import numpy as np
import pyfixest as pf
data = pd.read_csv("C:/Users/alexa/Documents/data/mpdta.csv")
data.rename(columns = {"first.treat":"first_treat"}, inplace = True)
tvar = "year"
gvar = "first_treat"
# Unique values
ug = data[gvar].unique()
ut = data[tvar].unique()
# Determine gref: the reference (never-treated) cohort is coded either
# above max(tvar) or below min(tvar) (in mpdta, first_treat == 0)
gref = ug[ug > max(ut)]
gref_min_flag = False
if len(gref) == 0:
    gref = ug[ug < min(ut)]
    gref_min_flag = True
gref = gref[0]
# Construct rhs (right-hand side)
rhs = ""
rhs = f".Dtreat : {rhs}"
ref_string = f", ref = {gref}"
rhs = f"{rhs}i({gvar}, {tvar}{ref_string})"
# Treatment dummy: treated once the cohort's start year is reached
# (reference cohort excluded)
data[".Dtreat"] = (data[tvar] >= data[gvar]) & (data[gvar] != gref)
if not gref_min_flag:
    data[".Dtreat"] = np.where(data[tvar] < gref, data[".Dtreat"], np.nan)
else:
    data[".Dtreat"] = np.where(data[tvar] > gref, data[".Dtreat"], np.nan)
# turn tvar into categorical
data[tvar] = pd.Categorical(data[tvar])
lhs = "lemp"
fixef = "first_treat + year"
fml = f"{lhs} ~ {rhs} | {fixef}"
fit = pf.feols(fml, data = data)
fit.summary()
# ValueError: The second variable in the i() syntax cannot be of type
# "category" or "object", but it is of type category.
Problem: interacting two categoricals via i() currently leads to the error above.
Alternative: don't use the i() syntax and go with C() interactions directly.
rhs = ""
rhs = f".Dtreat : {rhs}"
rhs2 = f"{rhs}C({gvar}, contr.treatment({gref})):C({tvar})"
lhs = "lemp"
fixef = "first_treat + year"
fml2 = f"{lhs} ~ {rhs2} | {fixef}"
fit2 = pf.feols(fml2, data = data)
np.sort(fit2.coef().values)
#array([-0.13607811, -0.10470747, -0.0783191 , -0.04310603, -0.03919274,
# -0.01937236, 0.00251386])
This matches the etwfe output above.
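As a quick sanity check, the sorted pyfixest coefficients can be compared against the etwfe values printed above (values copied from the R output; the tolerance is an arbitrary choice):

etwfe_coefs = np.array([
    -0.136078114, -0.104707472, -0.078319099, -0.043106033,
    -0.039192736, -0.019372364, 0.002513862,
])
np.testing.assert_allclose(np.sort(fit2.coef().values), etwfe_coefs, atol = 1e-6)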
Basic implementation without covariates:
import pandas as pd
import numpy as np
import pyfixest as pf
def etwfe(fml, tvar, gvar, data, cgroup = "notyet"):
    fml = fml.replace(" ", "")
    lhs, rhs = fml.split("~")
    if rhs != "1":
        raise NotImplementedError("Covariates are not yet implemented.")
    if cgroup not in ["notyet"]:
        raise NotImplementedError(f"cgroup must be 'notyet' but is {cgroup}.")

    ug = data[gvar].unique()
    ut = data[tvar].unique()

    # reference (never-treated) cohort: coded above max(tvar) or below min(tvar)
    gref = ug[ug > max(ut)]
    gref_min_flag = False
    if gref.size == 0:
        gref = ug[ug < min(ut)]
        gref_min_flag = True
    gref = gref[0]

    # saturated treatment-cohort x time interactions, reference cohort as base
    rhs = f".Dtreat : C({gvar}, contr.treatment({gref})):C({tvar})"
    fixef = f"{gvar} + {tvar}"
    fml = f"{lhs} ~ {rhs} | {fixef}"

    # treatment dummy: treated once the cohort's start year is reached
    # (reference cohort excluded)
    data[".Dtreat"] = (data[tvar] >= data[gvar]) & (data[gvar] != gref)
    if not gref_min_flag:
        data[".Dtreat"] = np.where(data[tvar] < gref, data[".Dtreat"], np.nan)
    else:
        data[".Dtreat"] = np.where(data[tvar] > gref, data[".Dtreat"], np.nan)

    print("fml", fml)
    fit = pf.feols(fml, data = data)
    return fit
data = pd.read_csv("C:/Users/alexa/Documents/data/mpdta.csv")
data.rename(columns = {"first.treat":"first_treat"}, inplace = True)
fit = etwfe(fml = "lpop ~ 1", tvar = "year", gvar = "first_treat", data = data)
fit.coef().sort_values()
#Coefficient
#.Dtreat:C(first_treat, contr.treatment(0))[T.2004]:C(year)[T.2006] -0.136078
#.Dtreat:C(first_treat, contr.treatment(0))[T.2004]:C(year)[T.2007] -0.104707
#.Dtreat:C(first_treat, contr.treatment(0))[T.2004]:C(year)[T.2005] -0.078319
#.Dtreat:C(first_treat, contr.treatment(0))[T.2007]:C(year)[T.2007] -0.043106
#.Dtreat:C(first_treat, contr.treatment(0))[T.2006]:C(year)[T.2007] -0.039193
#.Dtreat:C(first_treat, contr.treatment(0))[T.2004]:C(year)[T.2004] -0.019372
#.Dtreat:C(first_treat, contr.treatment(0))[T.2006]:C(year)[T.2006] 0.002514
#Name: Estimate, dtype: float64
Next steps:

- emfx
- cgroup = "never", gref, etc.

emfx would look something like this:
from marginaleffects import slopes
def emfx(fit, data, tvar, gvar):
    predict = "response" # could also be link
    wts = data.groupby([tvar, gvar]).size()
    wts.name = "N"
    wts = wts.reset_index()
    data = data.merge(wts, on = [tvar, gvar], how = "left")
    by_var = ".Dtreat"
    return slopes(
        fit,
        newdata = data,
        wts = "N",
        variables = ".Dtreat",
        by = by_var,
    )
See the R emfx function.
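Usage would look roughly like this (a hedged sketch reusing fit and data from the etwfe() example above; slopes() aggregates the .Dtreat effect using the cell counts as weights):

ame = emfx(fit, data, tvar = "year", gvar = "first_treat")
print(ame)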
This would be an uncompressed version of the Mundlak regression, right?
You mean these two?
Implement an extended two-way fixed effects (ETWFE) estimator à la Wooldridge (2021), following the etwfe R package.
Some Background
See the Stata docs for an intro to the estimator without covariates (bottom of page 18): link, or Wooldridge's paper linked above.
This is also explained in @vincentarelbundock's blog post. Basically, we fit a regression of the outcome on treatment-cohort-by-time interaction dummies plus cohort and time fixed effects. With covariates, we additionally interact these treatment terms with covariates that have been demeaned within treatment cohort; see the sketch below.
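A rough sketch of the two specifications in my own notation (following Wooldridge 2021 and the etwfe docs; the version with covariates omits some of the additional covariate-by-cohort and covariate-by-time terms of the fully flexible specification):

$$
y_{it} = \alpha + \sum_{g} \sum_{s \ge g} \tau_{gs}\, D_{it}\,\mathbf{1}\{G_i = g\}\,\mathbf{1}\{t = s\} + \lambda_{G_i} + \gamma_t + \varepsilon_{it}
$$

and, with covariates $X_{it}$,

$$
y_{it} = \alpha + \sum_{g} \sum_{s \ge g} \left(\tau_{gs} + \dot{X}_{it}'\beta_{gs}\right) D_{it}\,\mathbf{1}\{G_i = g\}\,\mathbf{1}\{t = s\} + X_{it}'\kappa + \lambda_{G_i} + \gamma_t + \varepsilon_{it}
$$

with $\dot{X}_{it} = X_{it} - \bar{X}_{G_i}$, i.e. the covariates demeaned within treatment cohort.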
Note that for residualizing the covariates X by cohort, we can use pyfixest's demean function, roughly as sketched below.
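A minimal sketch, assuming demean can be imported from pyfixest.estimation (the import path, the argument types, and the helper name demean_by_cohort are assumptions, not confirmed pyfixest API usage):

import numpy as np
import pandas as pd
from pyfixest.estimation import demean  # assumed import path

def demean_by_cohort(data, covars, gvar):
    # residualize the covariate columns `covars` within cohorts defined by `gvar`
    x = data[covars].to_numpy(dtype = float)
    flist = pd.factorize(data[gvar])[0].reshape(-1, 1)  # integer-coded cohort ids
    weights = np.ones(len(data))
    x_dm, converged = demean(x, flist, weights)
    out = data.copy()
    out[[f"{c}_dm" for c in covars]] = x_dm
    return out

# plain-pandas equivalent of the same operation:
# data[covars] - data.groupby(gvar)[covars].transform("mean")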
To do

- Implement an API similar to R's etwfe() function.
- Marginal effects can be computed via py-marginaleffects.