grantmcdermott / etwfe

Extended two-way fixed effects
https://grantmcdermott.com/etwfe/
Other
50 stars 11 forks source link

Question about simple ATT using never treated control group #37

Closed paulofelipe closed 6 months ago

paulofelipe commented 1 year ago

Hi @grantmcdermott ,

First of all, thank you for providing this valuable package. It has been instrumental in helping me understand the implementation details of the Extended Two-Way Fixed Effects method developed by Wooldridge.

I do, however, have a question regarding the computation of the simple ATT using a never treated control group. In my experiments, I have noticed significant differences between the results obtained from the etwfe function and those obtained from the Callaway-Sanntanna estimator, as well as when I manually run the etwfe procedure.

After executing the etwfe function, I computed the simple ATT using the marginaleffects::slopes() function and obtained an estimate of -0.0404, whereas the estimate I obtained using emfx() was -0.00166.

I must be overlooking some important detail. To provide further context, I have attached the details of my experiments below.

Thank you in advance!

library(did)
library(etwfe)
library(broom)
library(stringr)
library(marginaleffects)
library(fixest)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

options(knitr.kable.NA = "-")

# Data -------------------------------------------------------------------------
mpdta <- did::mpdta %>%
  mutate(
    .Dtreat = treat * (year >= first.treat),
    time_to_event = ifelse(first.treat != 0, year - first.treat, -1000)
  )

head(mpdta)
#>     year countyreal     lpop     lemp first.treat treat .Dtreat time_to_event
#> 866 2003       8001 5.896761 8.461469        2007     1       0            -4
#> 841 2004       8001 5.896761 8.336870        2007     1       0            -3
#> 842 2005       8001 5.896761 8.340217        2007     1       0            -2
#> 819 2006       8001 5.896761 8.378161        2007     1       0            -1
#> 827 2007       8001 5.896761 8.487352        2007     1       1             0
#> 937 2003       8019 2.232377 4.997212        2007     1       0            -4

# Callaway-Sant'Anna - Never Treated Control -----------------------------------

cs_never <- att_gt(
  yname = "lemp",
  tname = "year",
  idname = "countyreal",
  gname = "first.treat",
  data = mpdta,
  control_group = "nevertreated"
)

cs_never_simple <- aggte(cs_never, type = "simple") %>%
  tidy() %>%
  select(type, cs_never = estimate)

cs_never_dynamic <- aggte(cs_never, type = "dynamic") %>%
  tidy() %>%
  select(type, event = event.time, cs_never = estimate)

cs_never <- tidy(cs_never) %>%
  mutate(type = "group_time") %>%
  select(type, group, time, cs_never = estimate) %>%
  mutate(
    group = as.character(group),
    time = as.character(time)
  )

cs_never_res <- bind_rows(cs_never, cs_never_simple, cs_never_dynamic)

# Callaway-Sant'Anna - Not Yet Treated Control ---------------------------------

cs_notyet <- att_gt(
  yname = "lemp",
  tname = "year",
  idname = "countyreal",
  gname = "first.treat",
  data = mpdta,
  control_group = "notyettreated"
)

cs_notyet_simple <- aggte(cs_notyet, type = "simple") %>%
  tidy() %>%
  select(type, cs_notyet = estimate)

cs_notyet_dynamic <- aggte(cs_notyet, type = "dynamic") %>%
  tidy() %>%
  select(type, event = event.time, cs_notyet = estimate)

cs_notyet <- tidy(cs_notyet) %>%
  mutate(type = "group_time") %>%
  select(type, group, time, cs_notyet = estimate) %>%
  mutate(
    group = as.character(group),
    time = as.character(time)
  )

cs_notyet_res <- bind_rows(cs_notyet, cs_notyet_simple, cs_notyet_dynamic)

# Extended TWFE - Never Treated Control ----------------------------------------

etwfe_never <- etwfe(
  fml = lemp ~ 1,
  tvar = year,
  gvar = first.treat,
  data = mpdta,
  ivar = countyreal,
  cgroup = "never"
)

# compute simple ATT manually
slopes(
  model = etwfe_never,
  newdata = mpdta %>% filter(.Dtreat == 1),
  variable = ".Dtreat",
  by = ".Dtreat"
) %>%
  print()
#> 
#>     Term                 Contrast .Dtreat Estimate Std. Error     z Pr(>|z|)
#>  .Dtreat mean(TRUE) - mean(FALSE)       1  -0.0404     0.0196 -2.06   0.0396
#>    S   2.5 %   97.5 %
#>  4.7 -0.0789 -0.00193
#> 
#> Columns: term, contrast, .Dtreat, estimate, std.error, statistic, p.value, s.value, conf.low, conf.high, predicted, predicted_hi, predicted_lo

etwfe_never_simple <- emfx(etwfe_never, "simple") %>%
  tidy() %>%
  mutate(type = "simple") %>%
  select(type, etwfe_never = estimate)

etwfe_never_dynamic <- emfx(etwfe_never, "event") %>%
  tidy() %>%
  mutate(type = "dynamic") %>%
  select(type, event, etwfe_never = estimate) %>%
  filter(event < 2000)

etwfe_never <- tidy(etwfe_never) %>%
  mutate(
    type = "group_time",
    group = str_extract(term, "(?<=::)[0-9]{4}(?=:year)"),
    time = str_extract(term, "(?<=year::)[0-9]{4}")
  ) %>%
  select(type, group, time, etwfe_never = estimate)

etwfe_never_res <- bind_rows(
  etwfe_never, etwfe_never_simple, etwfe_never_dynamic
)

# Extended TWFE - Not Yet Treated Control --------------------------------------

etwfe_notyet <- etwfe(
  fml = lemp ~ 1,
  tvar = year,
  gvar = first.treat,
  data = mpdta,
  ivar = countyreal,
  cgroup = "notyet"
)

etwfe_notyet_simple <- emfx(etwfe_notyet, "simple") %>%
  tidy() %>%
  mutate(type = "simple") %>%
  select(type, etwfe_notyet = estimate)

etwfe_notyet_dynamic <- emfx(etwfe_notyet, "event") %>%
  tidy() %>%
  mutate(type = "dynamic") %>%
  select(type, event, etwfe_notyet = estimate) %>%
  filter(event < 2000)

etwfe_notyet <- tidy(etwfe_notyet) %>%
  mutate(
    type = "group_time",
    group = str_extract(term, "(?<=::)[0-9]{4}(?=:year)"),
    time = str_extract(term, "(?<=year::)[0-9]{4}")
  ) %>%
  select(type, group, time, etwfe_notyet = estimate)

etwfe_notyet_res <- bind_rows(
  etwfe_notyet, etwfe_notyet_simple, etwfe_notyet_dynamic
)

# Extended TWFE Manual - Never Treated Control ---------------------------------

manual_never <- feols(
  lemp ~
    treat:i(first.treat, i.time_to_event, ref = 0, ref2 = -1) |
      first.treat + year,
  data = mpdta,
  cluster = ~countyreal
)

manual_never_simple <- slopes(
  model = manual_never,
  newdata = mpdta %>% filter(time_to_event >= 0),
  variables = "treat",
  by = "treat",
) %>%
  tidy() %>%
  mutate(type = "simple") %>%
  select(type, manual_never = estimate)

manual_never_dynamic <- slopes(
  model = manual_never,
  newdata = mpdta %>% filter(treat == 1),
  variables = "treat",
  by = "time_to_event",
) %>%
  tidy() %>%
  mutate(type = "dynamic") %>%
  select(type, event = time_to_event, manual_never = estimate)

manual_never <- tidy(manual_never) %>%
  mutate(
    type = "group_time",
    group = str_extract(term, "(?<=::)[0-9]{4}(?=:time)"),
    time = str_extract(term, "(?<=time_to_event::)[-0-9]{1,2}"),
    time = as.numeric(group) + as.numeric(time),
    time = as.character(time)
  ) %>%
  select(type, group, time, manual_never = estimate)

manual_never_res <- bind_rows(
  manual_never, manual_never_simple, manual_never_dynamic
)

# Extended TWFE Manual - Not Yet Treated Control -------------------------------

manual_notyet <- feols(
  lemp ~
    .Dtreat:i(first.treat, i.time_to_event, ref = 0, ref2 = -1) |
      first.treat + year,
  data = mpdta,
  cluster = ~countyreal
)
#> The variables '.Dtreat:first.treat::2006:time_to_event::-3', '.Dtreat:first.treat::2006:time_to_event::-2' and three others have been removed because of collinearity (see $collin.var).

manual_notyet_simple <- slopes(
  model = manual_notyet,
  newdata = mpdta %>% filter(.Dtreat == 1),
  variables = ".Dtreat",
  by = ".Dtreat",
) %>%
  tidy() %>%
  mutate(type = "simple") %>%
  select(type, manual_notyet = estimate)

manual_notyet_dynamic <- slopes(
  model = manual_notyet,
  newdata = mpdta %>% filter(.Dtreat == 1),
  variables = ".Dtreat",
  by = "time_to_event",
) %>%
  tidy() %>%
  mutate(type = "dynamic") %>%
  select(type, event = time_to_event, manual_notyet = estimate)

manual_notyet <- tidy(manual_notyet) %>%
  mutate(
    type = "group_time",
    group = str_extract(term, "(?<=::)[0-9]{4}(?=:time)"),
    time = str_extract(term, "(?<=time_to_event::)[-0-9]{1,2}"),
    time = as.numeric(group) + as.numeric(time),
    time = as.character(time)
  ) %>%
  select(type, group, time, manual_notyet = estimate)

manual_notyet_res <- bind_rows(
  manual_notyet, manual_notyet_simple, manual_notyet_dynamic
)

# Combine ----------------------------------------------------------------------

results <- full_join(
  x = cs_never_res,
  y = cs_notyet_res,
  by = join_by(type, group, time, event)
) %>%
  full_join(
    y = etwfe_never_res,
    by = join_by(type, group, time, event)
  ) %>%
  full_join(
    y = etwfe_notyet_res,
    by = join_by(type, group, time, event)
  ) %>%
  full_join(
    y = manual_never_res,
    by = join_by(type, group, time, event)
  ) %>%
  full_join(
    y = manual_notyet_res,
    by = join_by(type, group, time, event)
  )

# Simple ATT -------------------------------------------------------------------

results %>%
  filter(type == "simple") %>%
  select(cs_never:manual_notyet, -event) %>%
  knitr::kable(digits = 4, format = "markdown")
cs_never cs_notyet etwfe_never etwfe_notyet manual_never manual_notyet
-0.04 -0.0398 -0.0017 -0.0477 -0.04 -0.0477

# Dynamic ATT ------------------------------------------------------------------

results %>%
  filter(type == "dynamic") %>%
  select(event, cs_never:manual_notyet) %>%
  arrange(event) %>%
  knitr::kable(digits = 4, format = "markdown")
event cs_never cs_notyet etwfe_never etwfe_notyet manual_never manual_notyet
-4 - - 0.0000 - 0.0033 -
-3 0.0305 0.0298 0.0234 - 0.0250 -
-2 -0.0006 -0.0024 0.0228 - 0.0245 -
-1 -0.0245 -0.0243 -0.0015 - 0.0000 -
0 -0.0199 -0.0189 -0.0214 -0.0311 -0.0199 -0.0311
1 -0.0510 -0.0536 -0.0484 -0.0522 -0.0510 -0.0522
2 -0.1373 -0.1363 -0.1373 -0.1361 -0.1373 -0.1361
3 -0.1008 -0.1008 -0.1008 -0.1047 -0.1008 -0.1047

# Group-Time ATT ---------------------------------------------------------------

results %>%
  filter(type == "group_time") %>%
  select(group, time, cs_never:manual_notyet, -event) %>%
  knitr::kable(digits = 4, format = "markdown")
group time cs_never cs_notyet etwfe_never etwfe_notyet manual_never manual_notyet
2004 2004 -0.0105 -0.0194 -0.0105 -0.0194 -0.0105 -0.0194
2004 2005 -0.0704 -0.0783 -0.0704 -0.0783 -0.0704 -0.0783
2004 2006 -0.1373 -0.1363 -0.1373 -0.1361 -0.1373 -0.1361
2004 2007 -0.1008 -0.1008 -0.1008 -0.1047 -0.1008 -0.1047
2006 2004 0.0065 -0.0026 0.0065 - 0.0028 -
2006 2005 -0.0028 -0.0019 0.0038 - - -
2006 2006 -0.0046 0.0047 -0.0008 0.0025 -0.0046 0.0025
2006 2007 -0.0412 -0.0412 -0.0375 -0.0392 -0.0412 -0.0392
2007 2004 0.0305 0.0298 0.0305 - 0.0338 -
2007 2005 -0.0027 -0.0024 0.0278 - 0.0311 -
2007 2006 -0.0311 -0.0311 -0.0033 - - -
2007 2007 -0.0261 -0.0261 -0.0294 -0.0431 -0.0261 -0.0431
2006 2003 - - - - -0.0038 -
2007 2003 - - - - 0.0033 -

Created on 2023-06-21 with reprex v2.0.2

grantmcdermott commented 7 months ago

Hi @paulofelipe, sorry this completely slipped through the cracks. I only saw it again now after responding to another issue. There's a lot going on in the above code. Can you simplify it and just highlight the key discrepancy that you're wondering about? That will make it easier for me to help you.

paulofelipe commented 7 months ago

Hi, @grantmcdermott. I apologize for not being concise in my initial question. My doubt pertains to the value obtained for the Simple ATT using the following code:

library(did)
library(etwfe)
library(broom)
library(stringr)
library(marginaleffects)
library(fixest)
library(dplyr)

mpdta <- did::mpdta %>%
  mutate(
    .Dtreat = treat * (year >= first.treat),
    time_to_event = ifelse(first.treat != 0, year - first.treat, -1000)
  )

etwfe_never <- etwfe(
  fml = lemp ~ 1,
  tvar = year,
  gvar = first.treat,
  data = mpdta,
  ivar = countyreal,
  cgroup = "never"
)

emfx(etwfe_never, "simple") %>%
  tidy() %>%
  print()
#> # A tibble: 1 × 13
#>   term    contrast .Dtreat estimate std.error statistic p.value s.value conf.low
#>   <chr>   <chr>    <lgl>      <dbl>     <dbl>     <dbl>   <dbl>   <dbl>    <dbl>
#> 1 .Dtreat mean(TR… TRUE    -0.00166   0.00489    -0.339   0.735   0.445  -0.0112
#> # ℹ 4 more variables: conf.high <dbl>, predicted <dbl>, predicted_hi <dbl>,
#> #   predicted_lo <dbl>

slopes(
  model = etwfe_never,
  newdata = mpdta %>% filter(.Dtreat == 1),
  variable = ".Dtreat",
  by = ".Dtreat"
) %>%
  print()
#> 
#>     Term                 Contrast .Dtreat Estimate Std. Error     z Pr(>|z|)
#>  .Dtreat mean(TRUE) - mean(FALSE)       1  -0.0404     0.0196 -2.06   0.0396
#>    S   2.5 %   97.5 %
#>  4.7 -0.0789 -0.00193
#> 
#> Columns: term, contrast, .Dtreat, estimate, std.error, statistic, p.value, s.value, conf.low, conf.high, predicted, predicted_hi, predicted_lo 

The obtained result (-0.00166) using the “never treated” group as the control diverges significantly from that obtained using the “not yet treated” control group (-0.0477) and from the manually calculated simple ATT values (-0.0404) using the marginaleffects package. Perhaps the code I used with the slopes() function is incorrect, but I can replicate the results (using emfx and slopes) when I change to cgroup = "notyet" in the etwfe() function.

Thanks in advance.

grantmcdermott commented 6 months ago

Ah, I see. Thanks @paulofelipe. I think this might be a subsetting bug for the never-treated case. Let me dig a little deeper and confirm.

grantmcdermott commented 6 months ago

Hi again @paulofelipe. Thanks again for reporting this and bearing with my tawdry response. It was indeed a bug, so I'm very grateful that you brought it to my attention. It should be fixed now if you grab the latest development version.

Quick MWE based off of your example.

library(etwfe)
data("mpdta", package = "did")

etwfe_never = etwfe(
  fml = lemp ~ 1,
  tvar = year,
  gvar = first.treat,
  data = mpdta,
  ivar = countyreal,
  cgroup = "never"
)

emfx(etwfe_never, "simple")
#> 
#>     Term                 Contrast .Dtreat Estimate Std. Error     z Pr(>|z|)
#>  .Dtreat mean(TRUE) - mean(FALSE)    TRUE    -0.04     0.0118 -3.39   <0.001
#>     S   2.5 %  97.5 %
#>  10.5 -0.0631 -0.0168
#> 
#> Columns: term, contrast, .Dtreat, estimate, std.error, statistic, p.value, s.value, conf.low, conf.high, predicted_lo, predicted_hi, predicted 
#> Type:  response
emfx(etwfe_never, "event")
#> 
#>     Term                 Contrast event Estimate Std. Error      z Pr(>|z|)
#>  .Dtreat mean(TRUE) - mean(FALSE)    -4  0.00331     0.0246  0.135  0.89289
#>  .Dtreat mean(TRUE) - mean(FALSE)    -3  0.02502     0.0182  1.378  0.16812
#>  .Dtreat mean(TRUE) - mean(FALSE)    -2  0.02446     0.0143  1.714  0.08646
#>  .Dtreat mean(TRUE) - mean(FALSE)    -1  0.00000         NA     NA       NA
#>  .Dtreat mean(TRUE) - mean(FALSE)     0 -0.01993     0.0119 -1.681  0.09277
#>  .Dtreat mean(TRUE) - mean(FALSE)     1 -0.05096     0.0169 -3.020  0.00252
#>  .Dtreat mean(TRUE) - mean(FALSE)     2 -0.13726     0.0366 -3.751  < 0.001
#>  .Dtreat mean(TRUE) - mean(FALSE)     3 -0.10081     0.0345 -2.922  0.00348
#>     S   2.5 %   97.5 %
#>   0.2 -0.0448  0.05143
#>   2.6 -0.0106  0.06060
#>   3.5 -0.0035  0.05242
#>    NA      NA       NA
#>   3.4 -0.0432  0.00331
#>   8.6 -0.0840 -0.01789
#>  12.5 -0.2090 -0.06554
#>   8.2 -0.1684 -0.03318
#> 
#> Columns: term, contrast, event, estimate, std.error, statistic, p.value, s.value, conf.low, conf.high, predicted_lo, predicted_hi, predicted 
#> Type:  response

Created on 2024-02-25 with reprex v2.1.0

paulofelipe commented 6 months ago

Hi, @grantmcdermott

Nice! Thank you for providing the new version!

grantmcdermott commented 6 months ago

Fixed version (v0.4.0) is up on CRAN now too.