Error in dimensions when using InteractionTransformer

ghost commented 3 years ago

Dear jlevy44,

I wanted to use the InteractionTransformer in combination with the XGBClassifier. Following your demo on GitHub, I run:

from xgboost import XGBClassifier
transformer=InteractionTransformer(untrained_model=XGBClassifier(random_state=42, tree_method='hist'),max_train_test_samples=1000,mode_interaction_extract=int(np.sqrt(X_train.shape[1])))
transformer.fit(X_train,y_train)

Where my X_train and y_train are dataframes with shape (700000,39) and (700000,1), respectively.

I get the following error: --------------------------------------------------------------------------------- ValueError Traceback (most recent call last) C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in create_block_manager_from_blocks(blocks, axes) 1661 blocks = [ -> 1662 make_block(values=blocks[0], placement=slice(0, len(axes[0]))) 1663 ]

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in make_block(values, placement, klass, ndim, dtype) 2721 -> 2722 return klass(values, ndim=ndim, placement=placement) 2723

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in init(self, values, placement, ndim) 129 if self._validate_ndim and self.ndim and len(self.mgr_locs) != len(self.values): --> 130 raise ValueError( 131 f"Wrong number of items passed {len(self.values)}, "

ValueError: Wrong number of items passed 1, placement implies 39

During handling of the above exception, another exception occurred:

ValueError Traceback (most recent call last)

in 1 from xgboost import XGBClassifier 2 transformer=InteractionTransformer(untrained_model=XGBClassifier(random_state=42),max_train_test_samples=1000,mode_interaction_extract=int(np.sqrt(X_train.shape[1]))) # mode_interaction_extract='sqrt' ----> 3 transformer.fit(X_train,y_train) ~\InteractionTransformer.py in fit(self, X, y) 204 # import pickle 205 # pickle.dump(shap_vals,open('shap_test.pkl','wb')) --> 206 true_top_interactions=self.get_top_interactions(shap_vals) 207 #print(true_top_interactions) 208 self.design_terms='+'.join((np.core.defchararray.add(np.vectorize(lambda x: "Q('{}')*".format(x))(true_top_interactions.iloc[:,0]),np.vectorize(lambda x: "Q('{}')".format(x))(true_top_interactions.iloc[:,1]))).tolist()) ~\InteractionTransformer.py in get_top_interactions(self, shap_vals) 223 224 """ --> 225 interaction_matrix=pd.DataFrame(shap_vals.mean(0),columns=self.features,index=self.features)#reduce(lambda x,y:x+y,shap_vals)/len(shap_vals) 226 interation_matrix_self_interact_removed=interaction_matrix.copy() 227 if not self.self_interactions: C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy) 495 mgr = init_dict({data.name: data}, index, columns, dtype=dtype) 496 else: --> 497 mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) 498 499 # For data is list-like, or Iterable (will consume into list) C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\construction.py in init_ndarray(values, index, columns, dtype, copy) 232 block_values = [values] 233 --> 234 return create_block_manager_from_blocks(block_values, [columns, index]) 235 236 C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in create_block_manager_from_blocks(blocks, axes) 1670 blocks = [getattr(b, "values", b) for b in blocks] 1671 tot_items = sum(b.shape[0] for b in blocks) -> 1672 raise construction_error(tot_items, blocks[0].shape[1:], axes, e) 1673 1674 ValueError: Shape of 
passed values is (39, 1), indices imply (39, 39)
---------------------------------------------------------------------------------

I then tried it with the data provided in your demo and everything worked fine. Do you know what could possibly go wrong?

Thanks in advance, Hassan

jlevy44 commented 3 years ago

Have you tried installing the package directly from github?

pip install git+https://github.com/jlevy44/InteractionTransformer

ghost commented 3 years ago

I downloaded the script from your GitHub and imported it into my notebook. If I install it directly from GitHub, I get the following error:

PicklingError Traceback (most recent call last)

in 1 transformer=InteractionTransformer(untrained_model=XGBClassifier(random_state=42),max_train_test_samples=1000,mode_interaction_extract=int(np.sqrt(X_train.shape[1]))) # mode_interaction_extract='sqrt' ----> 2 transformer.fit(X_train,y_train) ~\InteractionTransformer.py in fit(self, X, y) C:\ProgramData\Anaconda3\lib\site-packages\dask\base.py in compute(*args, **kwargs) 450 postcomputes.append(x.__dask_postcompute__()) 451 --> 452 results = schedule(dsk, keys, **kwargs) 453 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)]) 454 C:\ProgramData\Anaconda3\lib\site-packages\dask\multiprocessing.py in get(dsk, keys, num_workers, func_loads, func_dumps, optimize_graph, pool, **kwargs) 216 try: 217 # Run --> 218 result = get_async( 219 pool.apply_async, 220 len(pool._pool), C:\ProgramData\Anaconda3\lib\site-packages\dask\local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs) 484 _execute_task(task, data) # Re-execute locally 485 else: --> 486 raise_exception(exc, tb) 487 res, worker_id = loads(res_info) 488 state["cache"][key] = res C:\ProgramData\Anaconda3\lib\site-packages\dask\local.py in reraise(exc, tb) 314 if exc.__traceback__ is not tb: 315 raise exc.with_traceback(tb) --> 316 raise exc 317 318 C:\ProgramData\Anaconda3\lib\site-packages\dask\multiprocessing.py in pack_exception() 124 tb = _pack_traceback(exc_traceback) 125 try: --> 126 result = dumps((e, tb)) 127 except BaseException as e: 128 exc_type, exc_value, exc_traceback = sys.exc_info() C:\ProgramData\Anaconda3\lib\site-packages\cloudpickle\cloudpickle_fast.py in dumps() 71 file, protocol=protocol, buffer_callback=buffer_callback 72 ) ---> 73 cp.dump(obj) 74 return file.getvalue() 75 C:\ProgramData\Anaconda3\lib\site-packages\cloudpickle\cloudpickle_fast.py in dump() 561 def dump(self, obj): 562 try: --> 563 return Pickler.dump(self, obj) 564 except RuntimeError as e: 
565 if "recursion" in e.args[0]:

PicklingError: Can't pickle : import of module 'xgboost.libpath' failed

I find this strange, as I had already imported xgboost before running the InteractionTransformer.

jlevy44 commented 3 years ago

I believe this could be due to your version of XGBoost having issues with pickling the model. If memory serves, downgrading XGBoost to 1.0.0 works; alternatively, you can specify "single-threaded" for the dask_scheduler option, which should not pickle XGBoost but will take longer to run.

ghost commented 3 years ago

I am using version 1.0.0 of XGBoost. I solved the pickle error; basically, a file was missing. However, whether or not I install the package directly from GitHub, I again run into the first error (Shape of passed values is (39, 1), indices imply (39, 39)).

jlevy44 commented 3 years ago

Hmm.. this should not happen. Are you running a multiclass problem? Before updating the package, can you also try deleting the "dist" directory if you are cloning the repo? This can cause issues.

ghost commented 3 years ago

It is a binary problem. I implemented your suggestion and it still does not run.

ghost commented 3 years ago

I experimented a bit. The code runs without problems for regression problems, but the aforementioned error occurs when a user opts for a binary problem

jvdboogaard commented 3 years ago

I have the same issue as the above. Did you already find the solution to the problem @jlevy44 ?

I have also tried it with your PythonDemo (example), but I get the same error. My code is as follows:

Which gives me the following error:

Shap Interaction Size: (240, 55)

ValueError Traceback (most recent call last) ~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/internals/managers.py in create_block_manager_from_blocks(blocks, axes) 1670 blocks = [ -> 1671 make_block(values=blocks[0], placement=slice(0, len(axes[0]))) 1672 ]

~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/internals/blocks.py in make_block(values, placement, klass, ndim, dtype) 2743 -> 2744 return klass(values, ndim=ndim, placement=placement) 2745

~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/internals/blocks.py in init(self, values, placement, ndim) 130 raise ValueError( --> 131 f"Wrong number of items passed {len(self.values)}, " 132 f"placement implies {len(self.mgr_locs)}"

ValueError: Wrong number of items passed 1, placement implies 55

During handling of the above exception, another exception occurred:

ValueError Traceback (most recent call last)

in 1 from xgboost import XGBClassifier 2 transformer=InteractionTransformer(untrained_model=XGBClassifier(random_state=42),max_train_test_samples=1000,mode_interaction_extract=int(np.sqrt(X_train.shape[1]))) # mode_interaction_extract='sqrt' ----> 3 transformer.fit(X_train,y_train) ~/anaconda3/envs/python3/lib/python3.6/site-packages/interactiontransformer/InteractionTransformer.py in fit(self, X, y) 204 # import pickle 205 # pickle.dump(shap_vals,open('shap_test.pkl','wb')) --> 206 true_top_interactions=self.get_top_interactions(shap_vals) 207 #print(true_top_interactions) 208 self.design_terms='+'.join((np.core.defchararray.add(np.vectorize(lambda x: "Q('{}')*".format(x))(true_top_interactions.iloc[:,0]),np.vectorize(lambda x: "Q('{}')".format(x))(true_top_interactions.iloc[:,1]))).tolist()) ~/anaconda3/envs/python3/lib/python3.6/site-packages/interactiontransformer/InteractionTransformer.py in get_top_interactions(self, shap_vals) 223 224 """ --> 225 interaction_matrix=pd.DataFrame(shap_vals.mean(0),columns=self.features,index=self.features)#reduce(lambda x,y:x+y,shap_vals)/len(shap_vals) 226 interation_matrix_self_interact_removed=interaction_matrix.copy() 227 if not self.self_interactions: ~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy) 495 mgr = init_dict({data.name: data}, index, columns, dtype=dtype) 496 else: --> 497 mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) 498 499 # For data is list-like, or Iterable (will consume into list) ~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/internals/construction.py in init_ndarray(values, index, columns, dtype, copy) 232 block_values = [values] 233 --> 234 return create_block_manager_from_blocks(block_values, [columns, index]) 235 236 ~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/internals/managers.py in create_block_manager_from_blocks(blocks, axes) 1679 blocks = [getattr(b, "values", 
b) for b in blocks] 1680 tot_items = sum(b.shape[0] for b in blocks) -> 1681 raise construction_error(tot_items, blocks[0].shape[1:], axes, e) 1682 1683 ValueError: Shape of passed values is (55, 1), indices imply (55, 55)

Can you please help me with this error? I am writing my thesis for a master's degree in quantitative finance (in the Netherlands), and I am using the interaction transformer from your paper as part of my research. I am very interested to know whether it works on my data set (credit scoring), but I can't fix this issue.

Wkr, Jeroen

jlevy44 commented 2 years ago

Hi all, were you able to work through these issues? Forking the repo and testing? I can get to this soon if there is still need.

jlevy44 / InteractionTransformer

Error in dimensions when using InteractionTransformer #5

Shap Interaction Size: (240, 55)