I've recently run into issues trying to estimate tax functions using the mono and mono2D options, which estimate spline functions in 1D and 2D (respectively), with constraints that these functions are weakly monotonically increasing.
In order to investigate whether this is an issue with the underlying data (I ran into this issue my work in OG-USA PR #73) or something related to the tax function estimation routines, I tested the estimation with a cached file output from the taxcalc model. I report the results below, but it appears that the spline functions do seem to work (with the mono2D requiring one to build pyGAM from the source code).
Which seems to run into an error showing the conflict between more recent versions of Numpy and pyGAM 0.9.0:
BW = 1 begin year = 2030 end year = 2030
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In[8], line 1
----> 1 tax_params = ogcore.txfunc.tax_func_estimate(micro_data, 1, 80, 20, 100, start_year=2030, tax_func_type="mono2D")
File ~/repos/OG-Core/ogcore/txfunc.py:1520, in tax_func_estimate(micro_data, BW, S, starting_age, ending_age, start_year, baseline, analytical_mtrs, tax_func_type, age_specific, reform, data, desc_data, graph_data, graph_est, client, num_workers, tax_func_path)
1518 results = client.gather(futures)
1519 else:
-> 1520 results = results = compute(
1521 *lazy_values,
1522 scheduler=dask.multiprocessing.get,
1523 num_workers=num_workers,
1524 )
1526 # Garbage collection
1527 del micro_data
File ~/anaconda3/envs/ogcore-dev/lib/python3.10/site-packages/dask/base.py:595, in compute(traverse, optimize_graph, scheduler, get, *args, **kwargs)
592 keys.append(x.__dask_keys__())
593 postcomputes.append(x.__dask_postcompute__())
--> 595 results = schedule(dsk, keys, **kwargs)
596 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
File ~/anaconda3/envs/ogcore-dev/lib/python3.10/site-packages/dask/multiprocessing.py:233, in get(dsk, keys, num_workers, func_loads, func_dumps, optimize_graph, pool, initializer, chunksize, **kwargs)
228 # Note former versions used a multiprocessing Manager to share
229 # a Queue between parent and workers, but this is fragile on Windows
230 # (issue #1652).
231 try:
232 # Run
--> 233 result = get_async(
234 pool.submit,
235 pool._max_workers,
236 dsk3,
237 keys,
238 get_id=_process_get_id,
239 dumps=dumps,
240 loads=loads,
241 pack_exception=pack_exception,
242 raise_exception=reraise,
243 chunksize=chunksize,
244 **kwargs,
245 )
246 finally:
247 if cleanup:
File ~/anaconda3/envs/ogcore-dev/lib/python3.10/site-packages/dask/local.py:511, in get_async(submit, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, chunksize, **kwargs)
509 _execute_task(task, data) # Re-execute locally
510 else:
--> 511 raise_exception(exc, tb)
512 res, worker_id = loads(res_info)
513 state["cache"][key] = res
File ~/anaconda3/envs/ogcore-dev/lib/python3.10/site-packages/dask/local.py:319, in reraise(exc, tb)
317 if exc.__traceback__ is not tb:
318 raise exc.with_traceback(tb)
--> 319 raise exc
File ~/anaconda3/envs/ogcore-dev/lib/python3.10/site-packages/dask/local.py:224, in execute_task()
222 try:
223 task, data = loads(task_info)
--> 224 result = _execute_task(task, data)
225 id = get_id()
226 result = dumps((result, id))
File ~/anaconda3/envs/ogcore-dev/lib/python3.10/site-packages/dask/core.py:121, in _execute_task()
117 func, args = arg[0], arg[1:]
118 # Note: Don't assign the subtask results to a variable. numpy detects
119 # temporaries by their reference count and can execute certain
120 # operations in-place.
--> 121 return func(*(_execute_task(a, cache) for a in args))
122 elif not ishashable(arg):
123 return arg
File ~/repos/OG-Core/ogcore/txfunc.py:1169, in tax_func_loop()
1162 pp.gen_3Dscatters_hist(df_etr, s, t, output_dir)
1164 # Estimate effective tax rate function ETR(x,y)
1165 (
1166 etrparams,
1167 etr_wsumsq_arr[s - s_min],
1168 etr_obs_arr[s - s_min],
-> 1169 ) = txfunc_est(
1170 df_etr,
1171 s,
1172 t,
1173 "etr",
1174 tax_func_type,
1175 numparams,
1176 output_dir,
1177 graph_est,
1178 )
1179 etrparam_list[s - s_min] = etrparams
1180 del df_etr
File ~/repos/OG-Core/ogcore/txfunc.py:811, in txfunc_est()
809 elif tax_func_type == "mono2D":
810 obs = df.shape[0]
--> 811 mono_interp, _, wsse_cstr, _, _ = monotone_spline(
812 # df[["total_labinc", "total_capinc"]].values,
813 # df["etr"].values,
814 # df["weight"].values,
815 np.vstack((X, Y)).T,
816 # X, Y,
817 txrates,
818 wgts,
819 bins=[100, 100],
820 method="pygam",
821 splines=[100, 100],
822 )
823 wsse = wsse_cstr
824 params = [mono_interp]
File ~/repos/OG-Core/ogcore/txfunc.py:1904, in monotone_spline()
1901 tempUncstr += s(i, n_splines=splines[i])
1903 # fit data
-> 1904 gamCstr = LinearGAM(tempCstr).fit(x_binned, y_binned, weights_binned)
1905 y_cstr = gamCstr.predict(x_binned)
1906 wsse_cstr = (weights_binned * ((y_cstr - y_binned) ** 2)).sum()
File ~/anaconda3/envs/ogcore-dev/lib/python3.10/site-packages/pygam/pygam.py:920, in fit()
917 self.statistics_['m_features'] = X.shape[1]
919 # optimize
--> 920 self._pirls(X, y, weights)
921 # if self._opt == 0:
922 # self._pirls(X, y, weights)
923 # if self._opt == 1:
924 # self._pirls_naive(X, y)
925 return self
File ~/anaconda3/envs/ogcore-dev/lib/python3.10/site-packages/pygam/pygam.py:686, in _pirls()
669 def _pirls(self, X, Y, weights):
670 """
671 Performs stable PIRLS iterations to estimate GAM coefficients
672
(...)
684 None
685 """
--> 686 modelmat = self._modelmat(X) # build a basis matrix for the GLM
687 n, m = modelmat.shape
689 # initialize GLM coefficients if model is not yet fitted
File ~/anaconda3/envs/ogcore-dev/lib/python3.10/site-packages/pygam/pygam.py:459, in _modelmat()
437 """
438 Builds a model matrix, B, out of the spline basis for each feature
439
(...)
453 containing model matrix of the spline basis for selected features
454 """
455 X = check_X(X, n_feats=self.statistics_['m_features'],
456 edge_knots=self.edge_knots_, dtypes=self.dtype,
457 features=self.feature, verbose=self.verbose)
--> 459 return self.terms.build_columns(X, term=term)
File ~/anaconda3/envs/ogcore-dev/lib/python3.10/site-packages/pygam/terms.py:1719, in build_columns()
1717 columns = []
1718 for term_id in term:
-> 1719 columns.append(self._terms[term_id].build_columns(X, verbose=verbose))
1720 return sp.sparse.hstack(columns, format='csc')
File ~/anaconda3/envs/ogcore-dev/lib/python3.10/site-packages/pygam/terms.py:784, in build_columns()
768 """construct the model matrix columns for the term
769
770 Parameters
(...)
780 scipy sparse array with n rows
781 """
782 X[:, self.feature][:, np.newaxis]
--> 784 splines = b_spline_basis(X[:, self.feature],
785 edge_knots=self.edge_knots_,
786 spline_order=self.spline_order,
787 n_splines=self.n_splines,
788 sparse=True,
789 periodic=self.basis in ['cp'],
790 verbose=verbose)
792 if self.by is not None:
793 splines = splines.multiply(X[:, self.by][:, np.newaxis])
File ~/anaconda3/envs/ogcore-dev/lib/python3.10/site-packages/pygam/utils.py:649, in b_spline_basis()
646 aug_knots[-1] += 1e-9 # want last knot inclusive
648 # prepare Haar Basis
--> 649 bases = (x >= aug_knots[:-1]).astype(np.int) * \
650 (x < aug_knots[1:]).astype(np.int)
651 bases[-1] = bases[-2][::-1] # force symmetric bases at 0 and 1
653 # do recursion from Hastie et al. vectorized
File ~/anaconda3/envs/ogcore-dev/lib/python3.10/site-packages/numpy/__init__.py:305, in __getattr__()
300 warnings.warn(
301 f"In the future `np.{attr}` will be defined as the "
302 "corresponding NumPy scalar.", FutureWarning, stacklevel=2)
304 if attr in __former_attrs__:
--> 305 raise AttributeError(__former_attrs__[attr])
307 # Importing Tester requires importing all of UnitTest which is not a
308 # cheap import Since it is mainly used in test suits, we lazy import it
309 # here to save on the order of 10 ms of import time for most users
310 #
311 # The previous way Tester was imported also had a side effect of adding
312 # the full `numpy.testing` namespace
313 if attr == 'testing':
AttributeError: module 'numpy' has no attribute 'int'.
`np.int` was a deprecated alias for the builtin `int`. To avoid this error in existing code, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
I have also installed pyGAM from source (the master branch includes a fix to the np.int() issue. With this change, the above example appears to run fine (after a bunch of warnings):
Finished tax function loop through 1 years and 1 ages per year.
Tax function estimation time: 3 min, 36.1 sec
I've recently run into issues trying to estimate tax functions using the
mono
andmono2D
options, which estimate spline functions in 1D and 2D (respectively), with constraints that these functions are weakly monotonically increasing.In order to investigate whether this is an issue with the underlying data (I ran into this issue my work in OG-USA PR #73) or something related to the tax function estimation routines, I tested the estimation with a cached file output from the
taxcalc
model. I report the results below, but it appears that the spline functions do seem to work (with themono2D
requiring one to buildpyGAM
from the source code).Mono 1D
Here's a minimal example:
Which seems to run fine:
Mono 2D
Here's a minimal example:
Which seems to run into an error showing the conflict between more recent versions of Numpy and pyGAM 0.9.0:
I have also installed
pyGAM
from source (the master branch includes a fix to thenp.int()
issue. With this change, the above example appears to run fine (after a bunch of warnings):cc @rickecon @prrathi