PSLmodels / OG-Core

An overlapping generations model framework for evaluating fiscal policies.
https://pslmodels.github.io/OG-Core/
Creative Commons Zero v1.0 Universal
68 stars 119 forks source link

Monotonic spline tax function estimation not working #883

Open jdebacker opened 1 year ago

jdebacker commented 1 year ago

I've recently run into issues trying to estimate tax functions using the mono and mono2D options, which estimate spline functions in 1D and 2D (respectively), with constraints that these functions are weakly monotonically increasing.

In order to investigate whether this is an issue with the underlying data (I ran into this issue my work in OG-USA PR #73) or something related to the tax function estimation routines, I tested the estimation with a cached file output from the taxcalc model. I report the results below, but it appears that the spline functions do seem to work (with the mono2D requiring one to build pyGAM from the source code).

Mono 1D

Here's a minimal example:

import ogcore.txfunc
from ogcore.utils import safe_read_pickle
micro_data = safe_read_pickle("./tests/test_io_data/micro_data_dict_for_tests.pkl")
tax_params = ogcore.txfunc.tax_func_estimate(micro_data, 1, 80, 20, 100, start_year=2030, tax_func_type="mono")

Which seems to run fine:

year= 2030 age= all ages
Finished tax function loop through 1 years and 1 ages per year.
Tax function estimation time: 3.113 sec

Mono 2D

Here's a minimal example:

import ogcore.txfunc
from ogcore.utils import safe_read_pickle
micro_data = safe_read_pickle("./tests/test_io_data/micro_data_dict_for_tests.pkl")
tax_params = ogcore.txfunc.tax_func_estimate(micro_data, 1, 80, 20, 100, start_year=2030, tax_func_type="mono2D")

Which seems to run into an error showing the conflict between more recent versions of Numpy and pyGAM 0.9.0:

BW =  1 begin year =  2030 end year =  2030
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[8], line 1
----> 1 tax_params = ogcore.txfunc.tax_func_estimate(micro_data, 1, 80, 20, 100, start_year=2030, tax_func_type="mono2D")

File ~/repos/OG-Core/ogcore/txfunc.py:1520, in tax_func_estimate(micro_data, BW, S, starting_age, ending_age, start_year, baseline, analytical_mtrs, tax_func_type, age_specific, reform, data, desc_data, graph_data, graph_est, client, num_workers, tax_func_path)
   1518     results = client.gather(futures)
   1519 else:
-> 1520     results = results = compute(
   1521         *lazy_values,
   1522         scheduler=dask.multiprocessing.get,
   1523         num_workers=num_workers,
   1524     )
   1526 # Garbage collection
   1527 del micro_data

File ~/anaconda3/envs/ogcore-dev/lib/python3.10/site-packages/dask/base.py:595, in compute(traverse, optimize_graph, scheduler, get, *args, **kwargs)
    592     keys.append(x.__dask_keys__())
    593     postcomputes.append(x.__dask_postcompute__())
--> 595 results = schedule(dsk, keys, **kwargs)
    596 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])

File ~/anaconda3/envs/ogcore-dev/lib/python3.10/site-packages/dask/multiprocessing.py:233, in get(dsk, keys, num_workers, func_loads, func_dumps, optimize_graph, pool, initializer, chunksize, **kwargs)
    228 # Note former versions used a multiprocessing Manager to share
    229 # a Queue between parent and workers, but this is fragile on Windows
    230 # (issue #1652).
    231 try:
    232     # Run
--> 233     result = get_async(
    234         pool.submit,
    235         pool._max_workers,
    236         dsk3,
    237         keys,
    238         get_id=_process_get_id,
    239         dumps=dumps,
    240         loads=loads,
    241         pack_exception=pack_exception,
    242         raise_exception=reraise,
    243         chunksize=chunksize,
    244         **kwargs,
    245     )
    246 finally:
    247     if cleanup:

File ~/anaconda3/envs/ogcore-dev/lib/python3.10/site-packages/dask/local.py:511, in get_async(submit, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, chunksize, **kwargs)
    509         _execute_task(task, data)  # Re-execute locally
    510     else:
--> 511         raise_exception(exc, tb)
    512 res, worker_id = loads(res_info)
    513 state["cache"][key] = res

File ~/anaconda3/envs/ogcore-dev/lib/python3.10/site-packages/dask/local.py:319, in reraise(exc, tb)
    317 if exc.__traceback__ is not tb:
    318     raise exc.with_traceback(tb)
--> 319 raise exc

File ~/anaconda3/envs/ogcore-dev/lib/python3.10/site-packages/dask/local.py:224, in execute_task()
    222 try:
    223     task, data = loads(task_info)
--> 224     result = _execute_task(task, data)
    225     id = get_id()
    226     result = dumps((result, id))

File ~/anaconda3/envs/ogcore-dev/lib/python3.10/site-packages/dask/core.py:121, in _execute_task()
    117     func, args = arg[0], arg[1:]
    118     # Note: Don't assign the subtask results to a variable. numpy detects
    119     # temporaries by their reference count and can execute certain
    120     # operations in-place.
--> 121     return func(*(_execute_task(a, cache) for a in args))
    122 elif not ishashable(arg):
    123     return arg

File ~/repos/OG-Core/ogcore/txfunc.py:1169, in tax_func_loop()
   1162     pp.gen_3Dscatters_hist(df_etr, s, t, output_dir)
   1164 # Estimate effective tax rate function ETR(x,y)
   1165 (
   1166     etrparams,
   1167     etr_wsumsq_arr[s - s_min],
   1168     etr_obs_arr[s - s_min],
-> 1169 ) = txfunc_est(
   1170     df_etr,
   1171     s,
   1172     t,
   1173     "etr",
   1174     tax_func_type,
   1175     numparams,
   1176     output_dir,
   1177     graph_est,
   1178 )
   1179 etrparam_list[s - s_min] = etrparams
   1180 del df_etr

File ~/repos/OG-Core/ogcore/txfunc.py:811, in txfunc_est()
    809 elif tax_func_type == "mono2D":
    810     obs = df.shape[0]
--> 811     mono_interp, _, wsse_cstr, _, _ = monotone_spline(
    812         # df[["total_labinc", "total_capinc"]].values,
    813         # df["etr"].values,
    814         # df["weight"].values,
    815         np.vstack((X, Y)).T,
    816         # X, Y,
    817         txrates,
    818         wgts,
    819         bins=[100, 100],
    820         method="pygam",
    821         splines=[100, 100],
    822     )
    823     wsse = wsse_cstr
    824     params = [mono_interp]

File ~/repos/OG-Core/ogcore/txfunc.py:1904, in monotone_spline()
   1901         tempUncstr += s(i, n_splines=splines[i])
   1903 # fit data
-> 1904 gamCstr = LinearGAM(tempCstr).fit(x_binned, y_binned, weights_binned)
   1905 y_cstr = gamCstr.predict(x_binned)
   1906 wsse_cstr = (weights_binned * ((y_cstr - y_binned) ** 2)).sum()

File ~/anaconda3/envs/ogcore-dev/lib/python3.10/site-packages/pygam/pygam.py:920, in fit()
    917 self.statistics_['m_features'] = X.shape[1]
    919 # optimize
--> 920 self._pirls(X, y, weights)
    921 # if self._opt == 0:
    922 #     self._pirls(X, y, weights)
    923 # if self._opt == 1:
    924 #     self._pirls_naive(X, y)
    925 return self

File ~/anaconda3/envs/ogcore-dev/lib/python3.10/site-packages/pygam/pygam.py:686, in _pirls()
    669 def _pirls(self, X, Y, weights):
    670     """
    671     Performs stable PIRLS iterations to estimate GAM coefficients
    672
   (...)
    684     None
    685     """
--> 686     modelmat = self._modelmat(X) # build a basis matrix for the GLM
    687     n, m = modelmat.shape
    689     # initialize GLM coefficients if model is not yet fitted

File ~/anaconda3/envs/ogcore-dev/lib/python3.10/site-packages/pygam/pygam.py:459, in _modelmat()
    437 """
    438 Builds a model matrix, B, out of the spline basis for each feature
    439
   (...)
    453     containing model matrix of the spline basis for selected features
    454 """
    455 X = check_X(X, n_feats=self.statistics_['m_features'],
    456             edge_knots=self.edge_knots_, dtypes=self.dtype,
    457             features=self.feature, verbose=self.verbose)
--> 459 return self.terms.build_columns(X, term=term)

File ~/anaconda3/envs/ogcore-dev/lib/python3.10/site-packages/pygam/terms.py:1719, in build_columns()
   1717 columns = []
   1718 for term_id in term:
-> 1719     columns.append(self._terms[term_id].build_columns(X, verbose=verbose))
   1720 return sp.sparse.hstack(columns, format='csc')

File ~/anaconda3/envs/ogcore-dev/lib/python3.10/site-packages/pygam/terms.py:784, in build_columns()
    768 """construct the model matrix columns for the term
    769
    770 Parameters
   (...)
    780 scipy sparse array with n rows
    781 """
    782 X[:, self.feature][:, np.newaxis]
--> 784 splines = b_spline_basis(X[:, self.feature],
    785                          edge_knots=self.edge_knots_,
    786                          spline_order=self.spline_order,
    787                          n_splines=self.n_splines,
    788                          sparse=True,
    789                          periodic=self.basis in ['cp'],
    790                          verbose=verbose)
    792 if self.by is not None:
    793     splines = splines.multiply(X[:, self.by][:, np.newaxis])

File ~/anaconda3/envs/ogcore-dev/lib/python3.10/site-packages/pygam/utils.py:649, in b_spline_basis()
    646 aug_knots[-1] += 1e-9 # want last knot inclusive
    648 # prepare Haar Basis
--> 649 bases = (x >= aug_knots[:-1]).astype(np.int) * \
    650         (x < aug_knots[1:]).astype(np.int)
    651 bases[-1] = bases[-2][::-1] # force symmetric bases at 0 and 1
    653 # do recursion from Hastie et al. vectorized

File ~/anaconda3/envs/ogcore-dev/lib/python3.10/site-packages/numpy/__init__.py:305, in __getattr__()
    300     warnings.warn(
    301         f"In the future `np.{attr}` will be defined as the "
    302         "corresponding NumPy scalar.", FutureWarning, stacklevel=2)
    304 if attr in __former_attrs__:
--> 305     raise AttributeError(__former_attrs__[attr])
    307 # Importing Tester requires importing all of UnitTest which is not a
    308 # cheap import Since it is mainly used in test suits, we lazy import it
    309 # here to save on the order of 10 ms of import time for most users
    310 #
    311 # The previous way Tester was imported also had a side effect of adding
    312 # the full `numpy.testing` namespace
    313 if attr == 'testing':

AttributeError: module 'numpy' has no attribute 'int'.
`np.int` was a deprecated alias for the builtin `int`. To avoid this error in existing code, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

I have also installed pyGAM from source (the master branch includes a fix to the np.int() issue. With this change, the above example appears to run fine (after a bunch of warnings):

Finished tax function loop through 1 years and 1 ages per year.
Tax function estimation time: 3 min, 36.1 sec

cc @rickecon @prrathi