py-econometrics / pyfixest

Fast High-Dimensional Fixed Effects Regression in Python following fixest-syntax
https://py-econometrics.github.io/pyfixest/
MIT License
176 stars 35 forks source link

Extended TWFE a la Wooldridge (2021) #718

Open s3alfisc opened 6 days ago

s3alfisc commented 6 days ago

Implement an extended TWFE estimator a la Wooldridge 2021 following the etwfe R package.

Some Background

See the Stata docs for an intro to the estimator without covariates (bottom of page 18): link or Wooldridge's paper linked above: image

This is also explained in @vincentarelbundock 's blog post. Basically, we fit

etwfe = feols(y ~ treat : factor(time_to_treat) : factor(cohort) | firm + year, data = dat)

With covariates, we instead get

image

with

image

Note that for the residualizing of covariates X by cohort, we can use pyfixest's demean function as

from pyfixest.estimation.demean_ import demean
X = X - demean(X, cohort) 

To do

Implement an API similar to R's etwfe() function.

mod =
  etwfe(
    fml  = lemp ~ lpop, # outcome ~ controls
    tvar = year,        # time variable
    gvar = first.treat, # group variable
    data = mpdta,       # dataset
    vcov = ~countyreal  # vcov adjustment (here: clustered)
    )

Marginal effects can be computed via py-marginaleffects.

s3alfisc commented 6 days ago

Implementation sketch:

# Reference implementation in R: fit etwfe() on the mpdta panel and
# export the data so the Python port below can read the same file.
library(etwfe)
library(data.table)

data("mpdta", package = "did")
# export for the Python snippets below
fwrite(mpdta, "C:/Users/alexa/Documents/data/mpdta.csv")

head(mpdta)

data = mpdta

# Intercept-only RHS: no covariates, cluster-robust vcov by county.
mod =
  etwfe(
    fml  = lemp ~ 1, # outcome ~ controls
    tvar = year,        # time variable
    gvar = first.treat, # group variable
    data = data,       # dataset
    vcov = ~countyreal,  # vcov adjustment (here: clustered)
  )

summary(mod)

# OLS estimation, Dep. Var.: lemp
# Observations: 2,500
# Fixed-effects: first.treat: 4,  year: 5
# Standard-errors: Clustered (countyreal) 
#                                       Estimate Std. Error   t value   Pr(>|t|)    
# .Dtreat:first.treat::2004:year::2004 -0.010503   0.023363 -0.449562 0.65322178    
# .Dtreat:first.treat::2004:year::2005 -0.070423   0.031134 -2.261910 0.02413241 *  
# .Dtreat:first.treat::2004:year::2006 -0.137259   0.036612 -3.749051 0.00019829 ***
# .Dtreat:first.treat::2004:year::2007 -0.100811   0.034525 -2.919941 0.00365931 ** 
# .Dtreat:first.treat::2006:year::2004  0.006520   0.023439  0.278168 0.78099831    
# .Dtreat:first.treat::2006:year::2005  0.003769   0.031493  0.119685 0.90478060    
# .Dtreat:first.treat::2006:year::2006 -0.000825   0.033799 -0.024418 0.98052868    
# .Dtreat:first.treat::2006:year::2007 -0.037455   0.035897 -1.043403 0.29726681    
# .Dtreat:first.treat::2007:year::2004  0.030507   0.015106  2.019485 0.04397085 *  
# .Dtreat:first.treat::2007:year::2005  0.027781   0.019638  1.414614 0.15780542    
# .Dtreat:first.treat::2007:year::2006 -0.003306   0.024570 -0.134569 0.89300678    
# .Dtreat:first.treat::2007:year::2007 -0.029361   0.026561 -1.105397 0.26952021    
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RMSE: 1.48661     Adj. R2: 0.021348
#                 Within R2: 8.696e-5

mod |> coef() |> sort() |> unname()
mod |> coef() |> sort() |> unname()
# [1] -0.136078114 -0.104707472 -0.078319099 -0.043106033 -0.039192736
# [6] -0.019372364  0.002513862
import pandas as pd 
import numpy as np
import pyfixest as pf

# mpdta: county-level teen employment panel exported by the R snippet above.
data = pd.read_csv("C:/Users/alexa/Documents/data/mpdta.csv")
# "." in a column name clashes with formula syntax, so rename it.
data.rename(columns = {"first.treat":"first_treat"}, inplace = True)

tvar = "year"         # time variable
gvar = "first_treat"  # treatment cohort (first treatment period)

# Unique values
ug = data[gvar].unique()
ut = data[tvar].unique()

# Determine gref: the reference cohort, coded either above or below the
# observed time range (0 in mpdta, hence the fallback branch fires here).
gref = ug[ug > max(ut)]
if len(gref) == 0:
    gref = ug[ug < min(ut)]
gref = gref[0]  
# NOTE(review): this is True even when gref came from the "ug > max(ut)"
# branch above — presumably it should be False in that case; verify.
gref_min_flag = True

# Construct rhs (right-hand side): .Dtreat interacted with i(gvar, tvar),
# dropping the reference cohort via ref=.
rhs = ""
rhs = f".Dtreat : {rhs}"
ref_string = f", ref = {gref}"
rhs = f"{rhs}i({gvar}, {tvar}{ref_string})"

# Treatment dummy: treated once the cohort period is reached; the
# reference cohort is never treated. Rows outside the window become NaN.
data[".Dtreat"] = (data[tvar] >= data[gvar]) & (data[gvar] != gref)
if not gref_min_flag:
    data[".Dtreat"] = np.where(data[tvar] < gref, data[".Dtreat"], np.nan)
else:
    data[".Dtreat"] = np.where(data[tvar] > gref, data[".Dtreat"], np.nan)

# turn gvar (the cohort) into a categorical so it enters as a factor
data[gvar] = pd.Categorical(data[gvar])

lhs = "lemp"
fixef = "first_treat + year"
fml = f"{lhs} ~ {rhs} | {fixef}"

# This raises the ValueError quoted below: i() rejects a categorical
# second argument, which motivates the C() workaround in the next comment.
fit = pf.feols(fml, data = data)
fit.summary()

#    raise ValueError(
#ValueError: 
#                        The second variable in the i() syntax cannot be of type #"category" or "object", but
#                        but it is of type category.

Problem: interacting two categoricals via i() currently leads to the error above.

s3alfisc commented 6 days ago

Alternative: don't use i() syntax and go with C() interactions directly.

# Reuses data, gvar, tvar, gref from the previous snippet.
rhs = ""
rhs = f".Dtreat : {rhs}"
# C(...) with contr.treatment(gref) drops the reference cohort directly,
# sidestepping the i() limitation on categorical-by-categorical terms.
rhs2 = f"{rhs}C({gvar}, contr.treatment({gref})):C({tvar})"
lhs = "lemp"
fixef = "first_treat + year"
fml2 = f"{lhs} ~ {rhs2} | {fixef}"

fit2 = pf.feols(fml2, data = data)
# sorted coefficients, to compare against the R etwfe output above
np.sort(fit2.coef().values)
#array([-0.13607811, -0.10470747, -0.0783191 , -0.04310603, -0.03919274,
#       -0.01937236,  0.00251386])
#array([-0.13607811, -0.10470747, -0.0783191 , -0.04310603, -0.03919274,
#       -0.01937236,  0.00251386])

Matches the etwfe output above.

s3alfisc commented 6 days ago

Basic implementation without covariates:

import pandas as pd 
import numpy as np
import pyfixest as pf

def etwfe(fml, tvar, gvar, data, cgroup = "notyet"):
    """Estimate an extended TWFE model a la Wooldridge (2021).

    Fits the outcome on saturated cohort-by-time interactions of a
    treatment dummy, with cohort and time fixed effects, mirroring the
    R ``etwfe`` package (no covariates yet).

    Parameters
    ----------
    fml : str
        Formula "outcome ~ controls". Only "outcome ~ 1" is supported.
    tvar : str
        Name of the time variable in ``data``.
    gvar : str
        Name of the cohort variable (first treatment period) in ``data``.
    data : pandas.DataFrame
        The data set. A copy is modified internally; the caller's frame
        is left untouched.
    cgroup : str, optional
        Comparison group; only "notyet" (not-yet-treated) is implemented.

    Returns
    -------
    A fitted pyfixest model (result of ``pf.feols``).

    Raises
    ------
    NotImplementedError
        If covariates are supplied or ``cgroup`` is not "notyet".
    """
    fml = fml.replace(" ", "")
    lhs, rhs = fml.split("~")

    # Covariates (the X_i terms in Wooldridge's extended TWFE) are not
    # implemented yet; reject them explicitly instead of silently dropping.
    if rhs != "1":
        raise NotImplementedError("Covariates are not yet implemented.")

    if cgroup not in ["notyet"]: 
        raise NotImplementedError(f"cgroup must be 'notyet' but is {cgroup}.")

    # Work on a copy so the caller's DataFrame is not mutated.
    data = data.copy()

    ug = data[gvar].unique()
    ut = data[tvar].unique()

    # Reference cohort: prefer a "never treated" code above the observed
    # time range; otherwise fall back to a code below it (e.g. 0 in mpdta).
    # bugfix: the flag was unconditionally True before, inverting the
    # NaN-masking window whenever gref came from the ">" branch.
    gref = ug[ug > max(ut)]
    gref_min_flag = False
    if gref.size == 0:
        gref = ug[ug < min(ut)]
        gref_min_flag = True
    gref = gref[0]

    # Treatment dummy: a unit is treated once its cohort period is reached;
    # the reference cohort is never treated.
    data[".Dtreat"] = (data[tvar] >= data[gvar]) & (data[gvar] != gref)
    if not gref_min_flag:
        # gref lies above the sample window: keep rows before it.
        data[".Dtreat"] = np.where(data[tvar] < gref, data[".Dtreat"], np.nan)
    else:
        # gref lies below the sample window: keep rows after it.
        data[".Dtreat"] = np.where(data[tvar] > gref, data[".Dtreat"], np.nan)

    # Saturated cohort x time interactions of the treatment dummy, dropping
    # the reference cohort via contr.treatment; cohort and time enter as
    # fixed effects. (bugfix: lhs and fixef were previously hard-coded to
    # "lemp" and "first_treat + year" from the mpdta example; they are now
    # derived from the function arguments.)
    est_rhs = f".Dtreat:C({gvar}, contr.treatment({gref})):C({tvar})"
    fixef = f"{gvar} + {tvar}"
    est_fml = f"{lhs} ~ {est_rhs} | {fixef}"

    fit = pf.feols(est_fml, data = data)
    return fit

data = pd.read_csv("C:/Users/alexa/Documents/data/mpdta.csv")
data.rename(columns = {"first.treat":"first_treat"}, inplace = True)

fit = etwfe(fml = "lpop ~ 1", tvar = "year", gvar = "first_treat", data = data)
fit.coef().sort_values()

#Coefficient
#.Dtreat:C(first_treat, contr.treatment(0))[T.2004]:C(year)[T.2006]   -0.136078
#.Dtreat:C(first_treat, contr.treatment(0))[T.2004]:C(year)[T.2007]   -0.104707
#.Dtreat:C(first_treat, contr.treatment(0))[T.2004]:C(year)[T.2005]   -0.078319
#.Dtreat:C(first_treat, contr.treatment(0))[T.2007]:C(year)[T.2007]   -0.043106
#.Dtreat:C(first_treat, contr.treatment(0))[T.2006]:C(year)[T.2007]   -0.039193
#.Dtreat:C(first_treat, contr.treatment(0))[T.2004]:C(year)[T.2004]   -0.019372
#.Dtreat:C(first_treat, contr.treatment(0))[T.2006]:C(year)[T.2006]    0.002514
#Name: Estimate, dtype: float64

Next steps:

s3alfisc commented 6 days ago

emfx would look something like this:

from marginaleffects import slopes  
def emfx(fit, data, tvar, gvar):
    """Compute averaged marginal treatment effects for an etwfe fit.

    Mirrors the R ``etwfe::emfx`` sketch: weight each (time, cohort)
    cell by its observation count, then average the slopes of the
    treatment dummy via py-marginaleffects.

    Parameters
    ----------
    fit
        A fitted pyfixest model returned by ``etwfe``.
    data : pandas.DataFrame
        The estimation data (must contain ``tvar``, ``gvar`` and ".Dtreat").
        Not mutated; ``merge`` returns a new frame.
    tvar, gvar : str
        Names of the time and cohort variables.

    Returns
    -------
    The ``marginaleffects.slopes`` result aggregated by ".Dtreat".
    """
    # NOTE(review): the original sketch set predict = "response" (vs "link")
    # but never passed it to slopes(); confirm whether it should be wired
    # through before removing the option entirely.

    # Cell weights: number of observations per (time, cohort) cell, so the
    # averaged slopes reproduce group-time effects (mirrors R's emfx()).
    wts = data.groupby([tvar, gvar]).size()
    wts.name = "N"
    wts = wts.reset_index()

    data = data.merge(wts, on = [tvar, gvar], how = "left")

    return slopes(
        fit,
        newdata = data,
        wts = "N",
        variables = ".Dtreat",
        by = ".Dtreat",
    )

See the R emfx function.

apoorvalal commented 1 hour ago

this would be an uncompressed version of the mundlak regression right?

s3alfisc commented 22 minutes ago

this would be an uncompressed version of the mundlak regression right?

You mean these two?

image