skorch-dev / skorch

A scikit-learn compatible neural network library that wraps PyTorch
BSD 3-Clause "New" or "Revised" License

Error Encountered When Wrapping PyTorch Model with FastText Embeddings: Skorch Internal Code Operates on Raw X Data Instead of Embeddings #1052

Closed amine759 closed 4 months ago

amine759 commented 4 months ago

I'm trying to wrap a PyTorch model that expects FastText embeddings. I want the wrapped model to compute the embeddings internally, for a future purpose. Here is my code:

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
import torch
import skorch

class FastTextEmbedding(BaseEstimator, TransformerMixin):
    def __init__(self, ftext_model):
        self.ftext_model = ftext_model
        self.label_encoder = LabelEncoder()

    def fit(self, X, y=None):
        self.label_encoder.fit(y)
        return self

    def transform(self, X, y=None):
        embeddings = [self.ftext_model[text] for text in X]
        if y is not None:
            y_encoded = self.label_encoder.transform(y)
            return (torch.tensor(embeddings, dtype=torch.float32),
                    torch.tensor(y_encoded, dtype=torch.float32))
        else:
            return torch.tensor(embeddings, dtype=torch.float32)

pipeline = Pipeline([
    ('embedding', FastTextEmbedding(ftext)),
    ('nn', skorch.NeuralNetClassifier(
        BiLSTMClassifier,
        optimizer=torch.optim.Adam,
        lr=best_config['lr'],
        max_epochs=n_epochs,
    )),
])
pipeline.fit(train_texts, train_labels)

I get the following error. Although I embed my X data, some internal code works on the raw X data rather than on the embeddings returned by the transform method I have overridden. What am I missing here? Any help would be highly appreciated.

TypeError                                 Traceback (most recent call last)
<ipython-input-112-57f21e6419ed> in <cell line: 2>()
      1 get_ipython().run_line_magic('time', '')
----> 2 pipeline.fit(train_texts, train_labels)
      3 #y_proba = pipeline.predict_proba(X)

17 frames
/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    403             if self._final_estimator != "passthrough":
    404                 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 405                 self._final_estimator.fit(Xt, y, **fit_params_last_step)
    406 
    407         return self

/usr/local/lib/python3.10/dist-packages/skorch/classifier.py in fit(self, X, y, **fit_params)
    163         # this is actually a pylint bug:
    164         # https://github.com/PyCQA/pylint/issues/1085
--> 165         return super(NeuralNetClassifier, self).fit(X, y, **fit_params)
    166 
    167     def predict_proba(self, X):

/usr/local/lib/python3.10/dist-packages/skorch/net.py in fit(self, X, y, **fit_params)
   1317             self.initialize()
   1318 
-> 1319         self.partial_fit(X, y, **fit_params)
   1320         return self
   1321 

/usr/local/lib/python3.10/dist-packages/skorch/net.py in partial_fit(self, X, y, classes, **fit_params)
   1276         self.notify('on_train_begin', X=X, y=y)
   1277         try:
-> 1278             self.fit_loop(X, y, **fit_params)
   1279         except KeyboardInterrupt:
   1280             pass

/usr/local/lib/python3.10/dist-packages/skorch/net.py in fit_loop(self, X, y, epochs, **fit_params)
   1188             self.notify('on_epoch_begin', **on_epoch_kwargs)
   1189 
-> 1190             self.run_single_epoch(iterator_train, training=True, prefix="train",
   1191                                   step_fn=self.train_step, **fit_params)
   1192 

/usr/local/lib/python3.10/dist-packages/skorch/net.py in run_single_epoch(self, iterator, training, prefix, step_fn, **fit_params)
   1224         for batch in iterator:
   1225             self.notify("on_batch_begin", batch=batch, training=training)
-> 1226             step = step_fn(batch, **fit_params)
   1227             self.history.record_batch(prefix + "_loss", step["loss"].item())
   1228             batch_size = (get_len(batch[0]) if isinstance(batch, (tuple, list))

/usr/local/lib/python3.10/dist-packages/skorch/net.py in train_step(self, batch, **fit_params)
   1103             return step['loss']
   1104 
-> 1105         self._step_optimizer(step_fn)
   1106         return step_accumulator.get_step()
   1107 

/usr/local/lib/python3.10/dist-packages/skorch/net.py in _step_optimizer(self, step_fn)
   1058                 optimizer.step()
   1059             else:
-> 1060                 optimizer.step(step_fn)
   1061 
   1062     def train_step(self, batch, **fit_params):

/usr/local/lib/python3.10/dist-packages/torch/optim/optimizer.py in wrapper(*args, **kwargs)
    383                             )
    384 
--> 385                 out = func(*args, **kwargs)
    386                 self._optimizer_step_code()
    387 

/usr/local/lib/python3.10/dist-packages/torch/optim/optimizer.py in _use_grad(self, *args, **kwargs)
     74             torch.set_grad_enabled(self.defaults['differentiable'])
     75             torch._dynamo.graph_break()
---> 76             ret = func(self, *args, **kwargs)
     77         finally:
     78             torch._dynamo.graph_break()

/usr/local/lib/python3.10/dist-packages/torch/optim/adam.py in step(self, closure)
    144         if closure is not None:
    145             with torch.enable_grad():
--> 146                 loss = closure()
    147 
    148         for group in self.param_groups:

/usr/local/lib/python3.10/dist-packages/skorch/net.py in step_fn()
   1092         def step_fn():
   1093             self._zero_grad_optimizer()
-> 1094             step = self.train_step_single(batch, **fit_params)
   1095             step_accumulator.store_step(step)
   1096 

/usr/local/lib/python3.10/dist-packages/skorch/net.py in train_step_single(self, batch, **fit_params)
    992         Xi, yi = unpack_data(batch)
    993         y_pred = self.infer(Xi, **fit_params)
--> 994         loss = self.get_loss(y_pred, yi, X=Xi, training=True)
    995         loss.backward()
    996         return {

/usr/local/lib/python3.10/dist-packages/skorch/classifier.py in get_loss(self, y_pred, y_true, *args, **kwargs)
    148             eps = torch.finfo(y_pred.dtype).eps
    149             y_pred = torch.log(y_pred + eps)
--> 150         return super().get_loss(y_pred, y_true, *args, **kwargs)
    151 
    152     # pylint: disable=signature-differs

/usr/local/lib/python3.10/dist-packages/skorch/net.py in get_loss(self, y_pred, y_true, X, training)
   1662 
   1663         """
-> 1664         y_true = to_tensor(y_true, device=self.device)
   1665         return self.criterion_(y_pred, y_true)
   1666 

/usr/local/lib/python3.10/dist-packages/skorch/utils.py in to_tensor(X, device, accept_sparse)
    102         return {key: to_tensor_(val) for key, val in X.items()}
    103     if isinstance(X, (list, tuple)):
--> 104         return [to_tensor_(x) for x in X]
    105     if np.isscalar(X):
    106         return torch.as_tensor(X, device=device)

/usr/local/lib/python3.10/dist-packages/skorch/utils.py in <listcomp>(.0)
    102         return {key: to_tensor_(val) for key, val in X.items()}
    103     if isinstance(X, (list, tuple)):
--> 104         return [to_tensor_(x) for x in X]
    105     if np.isscalar(X):
    106         return torch.as_tensor(X, device=device)

/usr/local/lib/python3.10/dist-packages/skorch/utils.py in to_tensor(X, device, accept_sparse)
    104         return [to_tensor_(x) for x in X]
    105     if np.isscalar(X):
--> 106         return torch.as_tensor(X, device=device)
    107     if isinstance(X, Sequence):
    108         return torch.as_tensor(np.array(X), device=device)

TypeError: new(): invalid data type 'str'
BenjaminBossan commented 4 months ago

So the problem is not your X but your y, as you can see here: y_true = to_tensor(y_true, device=self.device). I assume that your y contains string labels.

Without digging too deep, I think the problem is an assumption in your custom transformer. You expect to be able to transform the y in there too, but transformers are only passed X. If possible, I would recommend label-encoding your y before passing it to the pipeline. If that doesn't work, you could subclass Pipeline and make it label-encode your y, since the pipeline sees both X and y.
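
For reference, a minimal sketch of the first suggestion, assuming train_labels is an array-like of string labels (the variable names and the explicit int64 cast, which matches the integer targets skorch's default classification criterion expects, are assumptions on my part):

from sklearn.preprocessing import LabelEncoder

# Encode the string labels once, outside the pipeline, so that
# skorch only ever sees integer class targets.
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels).astype("int64")

pipeline.fit(train_texts, train_labels_encoded)

# label_encoder.inverse_transform(...) recovers the original string labels
# from the model's integer predictions.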

amine759 commented 4 months ago

@BenjaminBossan Hi, I actually had to subclass Pipeline. Thanks for the help!
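
A minimal sketch of what such a subclass might look like (the class name and the int64 cast are assumptions, not the exact code used here):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

class LabelEncodingPipeline(Pipeline):
    """Pipeline that label-encodes y before fitting its steps.

    Unlike transformers, the pipeline itself receives both X and y,
    so this is a valid place to encode string labels.
    """

    def fit(self, X, y=None, **fit_params):
        self.label_encoder_ = LabelEncoder()
        # Integer (int64) targets are what skorch's default
        # classification criterion expects.
        y_encoded = self.label_encoder_.fit_transform(y).astype("int64")
        return super().fit(X, y_encoded, **fit_params)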