stefan-jansen / machine-learning-for-trading

Code for Machine Learning for Algorithmic Trading, 2nd edition.
https://ml4trading.io
12.57k stars 4.03k forks source link

05_predicting_stock_returns_with_linear_regression.ipynb || ValueError: cannot reindex on an axis with duplicate labels #272

Closed cconw closed 1 year ago

cconw commented 1 year ago

I've been at this for a few hours trying to debug it myself and understand this error, and I simply cannot figure it out. It's all the more perplexing to me because the ridge_scores visualization has a virtually identical call to seaborn earlier in the notebook, and that one works. If anyone has any ideas, I would appreciate it!

this specific line:

ax = sns.lineplot(x='alpha', y='ic', data=lasso_scores, estimator=np.mean, label='Mean', ax=axes[0])

is generating this error:


ValueError Traceback (most recent call last) Cell In [144], line 6 4 best_alpha_mean = scores_by_alpha['mean'].idxmax() 5 best_alpha_median = scores_by_alpha['median'].idxmax() ----> 6 ax = sns.lineplot(x='alpha', 7 y='ic', data=lasso_scores, 8 estimator=np.mean, label='Mean', ax=axes[0]) 9 scores_by_alpha['median'].plot(logx=True, ax=axes[0], label='Median') 11 axes[0].axvline(best_alpha_mean, ls='--', c='k', lw=1, label='Max. Mean')

File c:\Users\cc\miniforge3\envs\ml4t\lib\site-packages\seaborn\relational.py:639, in lineplot(data, x, y, hue, size, style, units, palette, hue_order, hue_norm, sizes, size_order, size_norm, dashes, markers, style_order, estimator, errorbar, n_boot, seed, orient, sort, err_style, err_kws, legend, ci, ax, **kwargs) 636 color = kwargs.pop("color", kwargs.pop("c", None)) 637 kwargs["color"] = _default_color(ax.plot, hue, color, kwargs) --> 639 p.plot(ax, kwargs) 640 return ax

File c:\Users\cc\miniforge3\envs\ml4t\lib\site-packages\seaborn\relational.py:423, in _LinePlotter.plot(self, ax, kws) 415 # TODO How to handle NA? We don't want NA to propagate through to the 416 # estimate/CI when some values are present, but we would also like 417 # matplotlib to show "gaps" in the line when all values are missing. (...) 420 421 # Loop over the semantic subsets and add to the plot 422 grouping_vars = "hue", "size", "style" --> 423 for sub_vars, sub_data in self.iter_data(grouping_vars, from_comp_data=True): 425 if self.sort: 426 sort_vars = ["units", orient, other]

File c:\Users\cc\miniforge3\envs\ml4t\lib\site-packages\seaborn\_oldcore.py:1028, in VectorPlotter.iter_data(self, grouping_vars, reverse, from_comp_data, by_facet, allow_empty, dropna) 1023 grouping_vars = [ 1024 var for var in grouping_vars if var in self.variables 1025 ] 1027 if from_comp_data: -> 1028 data = self.comp_data 1029 else: 1030 data = self.plot_data

File c:\Users\cc\miniforge3\envs\ml4t\lib\site-packages\seaborn\_oldcore.py:1134, in VectorPlotter.comp_data(self) 1132 else: 1133 comp_col = pd.Series(dtype=float, name=var) -> 1134 comp_data.insert(0, var, comp_col) 1136 self._comp_data = comp_data 1138 return self._comp_data

File c:\Users\cc\miniforge3\envs\ml4t\lib\site-packages\pandas\core\frame.py:4447, in DataFrame.insert(self, loc, column, value, allow_duplicates) 4444 if not isinstance(loc, int): 4445 raise TypeError("loc must be int") -> 4447 value = self._sanitize_column(value) 4448 self._mgr.insert(loc, column, value)

File c:\Users\cc\miniforge3\envs\ml4t\lib\site-packages\pandas\core\frame.py:4535, in DataFrame._sanitize_column(self, value) 4533 # We can get there through loc single_block_path 4534 if isinstance(value, (DataFrame, Series)): -> 4535 return _reindex_for_setitem(value, self.index) 4537 if is_list_like(value): 4538 com.require_length_match(value, self.index)

File c:\Users\cc\miniforge3\envs\ml4t\lib\site-packages\pandas\core\frame.py:11008, in _reindex_for_setitem(value, index) 11004 except ValueError as err: 11005 # raised in MultiIndex.from_tuples, see test_insert_error_msmgs 11006 if not value.index.is_unique: 11007 # duplicate axis

-> 11008 raise err 11010 raise TypeError( 11011 "incompatible index of inserted column with frame index" 11012 ) from err 11013 return reindexed_value

File c:\Users\cc\miniforge3\envs\ml4t\lib\site-packages\pandas\core\frame.py:11003, in _reindex_for_setitem(value, index) 11001 # GH#4107 11002 try:

-> 11003 reindexed_value = value.reindex(index)._values 11004 except ValueError as err: 11005 # raised in MultiIndex.from_tuples, see test_insert_error_msmgs 11006 if not value.index.is_unique: 11007 # duplicate axis

File c:\Users\cc\miniforge3\envs\ml4t\lib\site-packages\pandas\core\series.py:4672, in Series.reindex(self, *args, **kwargs) 4668 raise TypeError( 4669 "'index' passed as both positional and keyword argument" 4670 ) 4671 kwargs.update({"index": index}) -> 4672 return super().reindex(**kwargs)

File c:\Users\cc\miniforge3\envs\ml4t\lib\site-packages\pandas\core\generic.py:4966, in NDFrame.reindex(self, *args, **kwargs) 4963 return self._reindex_multi(axes, copy, fill_value) 4965 # perform the reindex on the axes -> 4966 return self._reindex_axes( 4967 axes, level, limit, tolerance, method, fill_value, copy 4968 ).__finalize__(self, method="reindex")

File c:\Users\cc\miniforge3\envs\ml4t\lib\site-packages\pandas\core\generic.py:4986, in NDFrame._reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy) 4981 new_index, indexer = ax.reindex( 4982 labels, level=level, limit=limit, tolerance=tolerance, method=method 4983 ) 4985 axis = self._get_axis_number(a) -> 4986 obj = obj._reindex_with_indexers( 4987 {axis: [new_index, indexer]}, 4988 fill_value=fill_value, 4989 copy=copy, 4990 allow_dups=False, 4991 ) 4992 # If we've made a copy once, no need to make another one 4993 copy = False

File c:\Users\cc\miniforge3\envs\ml4t\lib\site-packages\pandas\core\generic.py:5032, in NDFrame._reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups) 5029 indexer = ensure_platform_int(indexer) 5031 # TODO: speed up on homogeneous DataFrame objects (see _reindex_multi) -> 5032 new_data = new_data.reindex_indexer( 5033 index, 5034 indexer, 5035 axis=baxis, 5036 fill_value=fill_value, 5037 allow_dups=allow_dups, 5038 copy=copy, 5039 ) 5040 # If we've made a copy once, no need to make another one 5041 copy = False

File c:\Users\cc\miniforge3\envs\ml4t\lib\site-packages\pandas\core\internals\managers.py:676, in BaseBlockManager.reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy, consolidate, only_slice, use_na_proxy) 674 # some axes don't allow reindexing with dups 675 if not allow_dups: --> 676 self.axes[axis]._validate_can_reindex(indexer) 678 if axis >= self.ndim: 679 raise IndexError("Requested axis not found in manager")

File c:\Users\cc\miniforge3\envs\ml4t\lib\site-packages\pandas\core\indexes\base.py:4121, in Index._validate_can_reindex(self, indexer) 4119 # trying to reindex on an axis with duplicates 4120 if not self._index_as_unique and len(indexer): -> 4121 raise ValueError("cannot reindex on an axis with duplicate labels")

ValueError: cannot reindex on an axis with duplicate labels

stefan-jansen commented 1 year ago

The error typically points to duplicate values in the index, which you can remove with something like df = df[~df.index.duplicated()].

jytan023 commented 1 year ago

I received the same error, and removing the duplicated index entries did not work. Trying lasso_scores2 = lasso_scores.reset_index() and using

ax = sns.lineplot(x='alpha',
y='ic', data=lasso_scores2,
estimator=np.mean, label='Mean', ax=axes[0])

worked. Still have no idea why the error occurred for lasso_scores but not ridge_scores.

XuZekai1129 commented 4 months ago

I just encountered the same problem. Your reset_index() method works well. But after careful inspection, I found that the 'ic' column has NaN values, which is possibly the reason for the ValueError. I tried running: ax = sns.lineplot(x='alpha', y='ic', data=lasso_scores.dropna(), estimator=np.mean, label='Mean', ax=axes[0]) and it works well.