MatthieuStigler / multiDiff

Multi-period diff and diff
Other
3 stars 0 forks source link

anova? #21

Open MatthieuStigler opened 1 year ago

MatthieuStigler commented 1 year ago

feols() residuals is muuuch faster!

very important to make sure the vars are as factor!

library(multiDiff)

## sim dat
dat_did_df <- sim_dat_common(timing_treatment = 5, seed = 97) |> 
  dplyr::mutate(across(c(Time, unit), as.factor))
head(dat_did_df)
#> # A tibble: 6 × 5
#>   unit  Time  treat_group    tr      y
#>   <fct> <fct> <chr>       <dbl>  <dbl>
#> 1 1     1     treated         0 -0.746
#> 2 1     2     treated         0 -1.30 
#> 3 1     3     treated         0 -1.25 
#> 4 1     4     treated         0 -0.991
#> 5 1     5     treated         1 -1.06 
#> 6 1     6     treated         0 -1.27

## get total var
var(dat_did_df$y)
#> [1] 2.713643
v_full <- var(residuals(fixest::feols(y~1, data=dat_did_df)))
all.equal(v_full, var(dat_did_df$y))
#> [1] TRUE

## get conditioanl vars
var2 <- function(x) drop(crossprod(x)/length(x))
res_FE_T <- residuals(fixest::feols(y~1|Time, data=dat_did_df))
v_FE_T <- var(res_FE_T)
res_FE_N <- residuals(fixest::feols(y~1|unit, data=dat_did_df))
v_FE_N <- var(res_FE_N)
v_FE_NT <- var(residuals(fixest::feols(y~1|Time+unit, data=dat_did_df)))

v_FE_T_then_N <- var(residuals(fixest::feols(res~1|unit, data=dat_did_df |> 
                                               dplyr::mutate(res = res_FE_T))))
v_FE_N_then_T <- var(residuals(fixest::feols(res~1|Time, data=dat_did_df |> 
                                               dplyr::mutate(res = res_FE_N))))

ano_manu_lm <- c(v_full=v_full, v_FE_T=v_FE_T, v_FE_N=v_FE_N, v_FE_NT=v_FE_NT,
                 v_FE_T_then_N = v_FE_T_then_N, v_FE_N_then_T=v_FE_N_then_T)
ano_manu_lm
#>        v_full        v_FE_T        v_FE_N       v_FE_NT v_FE_T_then_N 
#>     2.7136432     2.0314706     1.5900935     0.9079209     0.9079209 
#> v_FE_N_then_T 
#>     0.9079209
ano_manu_lm/ano_manu_lm[1]
#>        v_full        v_FE_T        v_FE_N       v_FE_NT v_FE_T_then_N 
#>     1.0000000     0.7486137     0.5859626     0.3345764     0.3345764 
#> v_FE_N_then_T 
#>     0.3345764

all.equal(ano_manu_lm[["v_FE_T_then_N"]],
          ano_manu_lm[["v_FE_NT"]])
#> [1] TRUE

## anova total SSR
SSR_tot_manu <- crossprod(dat_did_df$y-mean(dat_did_df$y)) |> drop()
SSR_tot_anova <- aov(y~Time, data =dat_did_df) |> broom::tidy() |> 
  dplyr::pull(sumsq) |> sum()
all.equal(SSR_tot_anova, SSR_tot_manu)
#> [1] TRUE

##
get_ssr_aov <- function(aov){
  aov |>
    broom::tidy() |> 
    dplyr::select(term, sumsq) |> 
    dplyr::mutate(perc = 100* sumsq/sum(sumsq)) 
}

## by Y
AOV_T <- aov(y~Time, data =dat_did_df) |> get_ssr_aov()
AOV_T
#> # A tibble: 2 × 3
#>   term       sumsq  perc
#>   <chr>      <dbl> <dbl>
#> 1 Time       6821.  25.1
#> 2 Residuals 20313.  74.9
all.equal(AOV_T$perc[1],
          100*(1-v_FE_T/v_full))
#> [1] TRUE

AOV_N <- aov(y~unit, data =dat_did_df) |> get_ssr_aov()
AOV_N
#> # A tibble: 2 × 3
#>   term       sumsq  perc
#>   <chr>      <dbl> <dbl>
#> 1 unit      11234.  41.4
#> 2 Residuals 15899.  58.6
all.equal(AOV_N$perc[1], 100*(1-v_FE_N/v_full))
#> [1] TRUE

AOV_NT <- aov(y~unit+Time, data =dat_did_df) |> get_ssr_aov()
AOV_TN <- aov(y~Time +unit, data =dat_did_df) |> get_ssr_aov()
all.equal(AOV_NT$perc[1:2], AOV_TN$perc[2:1])
#> [1] TRUE
AOV_NT
#> # A tibble: 3 × 3
#>   term       sumsq  perc
#>   <chr>      <dbl> <dbl>
#> 1 unit      11234.  41.4
#> 2 Time       6821.  25.1
#> 3 Residuals  9078.  33.5
AOV_TN
#> # A tibble: 3 × 3
#>   term       sumsq  perc
#>   <chr>      <dbl> <dbl>
#> 1 Time       6821.  25.1
#> 2 unit      11234.  41.4
#> 3 Residuals  9078.  33.5
all.equal(sum(AOV_NT$perc[1:2]), 100*(1-v_FE_NT/v_full))
#> [1] TRUE

Created on 2023-10-15 with reprex v2.0.2

MatthieuStigler commented 1 year ago

If variables not stored as factor, doesn't work for anova!!

library(multiDiff)

## sim dat
dat_did_df <- sim_dat_common(timing_treatment = 5, seed = 97) 
head(dat_did_df)
#> # A tibble: 6 × 5
#>    unit  Time treat_group    tr      y
#>   <int> <int> <chr>       <dbl>  <dbl>
#> 1     1     1 treated         0 -0.746
#> 2     1     2 treated         0 -1.30 
#> 3     1     3 treated         0 -1.25 
#> 4     1     4 treated         0 -0.991
#> 5     1     5 treated         1 -1.06 
#> 6     1     6 treated         0 -1.27

## get total var
var(dat_did_df$y)
#> [1] 2.713643
v_full <- var(residuals(fixest::feols(y~1, data=dat_did_df)))
all.equal(v_full, var(dat_did_df$y))
#> [1] TRUE

## get conditioanl vars
var2 <- function(x) drop(crossprod(x)/length(x))
res_FE_T <- residuals(fixest::feols(y~1|Time, data=dat_did_df))
v_FE_T <- var(res_FE_T)
res_FE_N <- residuals(fixest::feols(y~1|unit, data=dat_did_df))
v_FE_N <- var(res_FE_N)
v_FE_NT <- var(residuals(fixest::feols(y~1|Time+unit, data=dat_did_df)))

v_FE_T_then_N <- var(residuals(fixest::feols(res~1|unit, data=dat_did_df |> 
                                               dplyr::mutate(res = res_FE_T))))
v_FE_N_then_T <- var(residuals(fixest::feols(res~1|Time, data=dat_did_df |> 
                                               dplyr::mutate(res = res_FE_N))))

ano_manu_lm <- c(v_full=v_full, v_FE_T=v_FE_T, v_FE_N=v_FE_N, v_FE_NT=v_FE_NT,
                 v_FE_T_then_N = v_FE_T_then_N, v_FE_N_then_T=v_FE_N_then_T)
ano_manu_lm
#>        v_full        v_FE_T        v_FE_N       v_FE_NT v_FE_T_then_N 
#>     2.7136432     2.0314706     1.5900935     0.9079209     0.9079209 
#> v_FE_N_then_T 
#>     0.9079209
ano_manu_lm/ano_manu_lm[1]
#>        v_full        v_FE_T        v_FE_N       v_FE_NT v_FE_T_then_N 
#>     1.0000000     0.7486137     0.5859626     0.3345764     0.3345764 
#> v_FE_N_then_T 
#>     0.3345764

all.equal(ano_manu_lm[["v_FE_T_then_N"]],
          ano_manu_lm[["v_FE_NT"]])
#> [1] TRUE

## anova total SSR
SSR_tot_manu <- crossprod(dat_did_df$y-mean(dat_did_df$y)) |> drop()
SSR_tot_anova <- aov(y~Time, data =dat_did_df) |> broom::tidy() |> 
  dplyr::pull(sumsq) |> sum()
all.equal(SSR_tot_anova, SSR_tot_manu)
#> [1] TRUE

##
get_ssr_aov <- function(aov){
  aov |>
    broom::tidy() |> 
    dplyr::select(term, sumsq) |> 
    dplyr::mutate(perc = 100* sumsq/sum(sumsq)) 
}

## by Y
AOV_T <- aov(y~Time, data =dat_did_df) |> get_ssr_aov()
AOV_T
#> # A tibble: 2 × 3
#>   term       sumsq  perc
#>   <chr>      <dbl> <dbl>
#> 1 Time       1579.  5.82
#> 2 Residuals 25554. 94.2
all.equal(AOV_T$perc[1],
          100*(1-v_FE_T/v_full))
#> [1] "Mean relative difference: 3.318918"

AOV_N <- aov(y~unit, data =dat_did_df) |> get_ssr_aov()
AOV_N
#> # A tibble: 2 × 3
#>   term         sumsq     perc
#>   <chr>        <dbl>    <dbl>
#> 1 unit          4.49   0.0165
#> 2 Residuals 27129.   100.
all.equal(AOV_N$perc[1], 100*(1-v_FE_N/v_full))
#> [1] "Mean relative difference: 2500.989"

AOV_NT <- aov(y~unit+Time, data =dat_did_df) |> get_ssr_aov()
AOV_TN <- aov(y~Time +unit, data =dat_did_df) |> get_ssr_aov()
all.equal(AOV_NT$perc[1:2], AOV_TN$perc[2:1])
#> [1] TRUE
AOV_NT
#> # A tibble: 3 × 3
#>   term         sumsq    perc
#>   <chr>        <dbl>   <dbl>
#> 1 unit          4.49  0.0165
#> 2 Time       1579.    5.82  
#> 3 Residuals 25550.   94.2
AOV_TN
#> # A tibble: 3 × 3
#>   term         sumsq    perc
#>   <chr>        <dbl>   <dbl>
#> 1 Time       1579.    5.82  
#> 2 unit          4.49  0.0165
#> 3 Residuals 25550.   94.2
all.equal(sum(AOV_NT$perc[1:2]), 100*(1-v_FE_NT/v_full))
#> [1] "Mean relative difference: 10.39984"

Created on 2023-10-15 with reprex v2.0.2