Closed: huiwengoh closed this issue 2 years ago
Re: ordering. Let's have this example appear right before the current Amazon Reviews FastText one? I feel like this one is really nice now thanks to you, and it's highly related to that one.
Just for reference, here was the original version of this code back when CleanLearning only supported array datasets:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
from cleanlab.classification import CleanLearning
from transformers import AutoTokenizer, TFAutoModel
import os
from sklearn.metrics import accuracy_score
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

def build_model(model_name: str, max_len: int, n_classes: int):
    # 1 - Input ids and attention mask as inputs to the NN
    input_ids = tf.keras.layers.Input(
        shape=(max_len,), dtype='int32', name='input_ids')
    attention_mask = tf.keras.layers.Input(
        shape=(max_len,), dtype='int32', name='attention_mask')
    # 2 - Get the BERT main layer and add it to the NN, passing input ids and attention mask as inputs
    bert_layer = TFAutoModel.from_pretrained(model_name)
    layer = bert_layer(input_ids=input_ids, attention_mask=attention_mask)[1]
    # embeddings = tf.keras.layers.Embedding(30_522 + 1, 128)(input_ids)
    # flatten = tf.keras.layers.Flatten()(embeddings)
    output_layer = tf.keras.layers.Dense(
        n_classes, activation='sigmoid')(layer)
    # model instance
    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output_layer)
    model.summary()
    return model
MODEL_NAME = "bert-base-uncased"
train = tfds.load('imdb_reviews', split='train', shuffle_files=True)
val = tfds.load('imdb_reviews', split='test', shuffle_files=True)
train_df = tfds.as_dataframe(train)[0:1000]
train_df['text'] = train_df['text'].apply(lambda x: x.decode('utf-8'))
val_df = tfds.as_dataframe(val)[0:1000]
val_df['text'] = val_df['text'].apply(lambda x: x.decode('utf-8'))
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
train_input = tokenizer(
    train_df['text'].to_list(),
    padding='max_length',
    truncation=True,
    max_length=20,
    return_tensors='tf')
val_input = tokenizer(
    val_df['text'].to_list(),
    padding='max_length',
    truncation=True,
    max_length=20,
    return_tensors='tf',
    return_token_type_ids=False)

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy', mode='max', verbose=1, patience=3, restore_best_weights=True)

val_dataset = tf.data.Dataset.from_tensor_slices(
    (
        dict(val_input),
        val_df['label'],
    )
).batch(32)
model = HuggingfaceKerasClassifier(
    # --- model function parameters ---
    model=build_model,
    model_name=MODEL_NAME,
    max_len=20,
    n_classes=1,
    # --- HuggingfaceKerasClassifier parameters ---
    seq_len=20,
    train_input=dict(train_input),
    # --- TF training parameters ---
    optimizer=tf.keras.optimizers.Adam(2e-5),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
    epochs=10,
    batch_size=64,
    shuffle=True,
    callbacks=[early_stopping],
    verbose=True
)
print('Model Created.')
lnl = CleanLearning(clf=model, cv_n_folds=3)
print('Training CleanLearning model...')
training_ids = np.array(list(range(len(train_input['input_ids']))))
# Note that the validation dataset provided in the clf_kwargs is not related
# to the validation set used internally by cleanlab. In this case, the validation
# set is required by the early-stopping callback used during training.
lnl.fit(training_ids, np.array(train_df['label']), clf_kwargs={'validation_data': val_dataset})
print('Model trained.')
predictions = lnl.predict(test_input=val_input)
print('Accuracy on val data: ', accuracy_score(val_df['label'], predictions))
where the old HuggingfaceKerasClassifier class looked like this:
from typing import Dict
from scikeras.wrappers import KerasClassifier
from sklearn.preprocessing import FunctionTransformer
import tensorflow as tf
import numpy as np
class HuggingfaceKerasClassifier(KerasClassifier):
    def __init__(self, train_input: Dict, seq_len: int, **kwargs):
        """
        Basic scikeras does not directly support multi-input models,
        i.e. the input must be in the form (num_samples, feature_size).
        In our case, the multiple inputs are:
        - token ids (`input_ids`);
        - indices specifying which tokens to attend to (`attention_mask`).
        Here we work around this limitation by adding the `feature_encoder` property to our class,
        which returns a scikit-learn `FunctionTransformer` (an extension of `BaseEstimator`).

        Example of use:

        * ``model = HuggingfaceKerasClassifier(``
        * ``    # --- model function parameters ---``
        * ``    model=model_fn,``
        * ``    n_classes=2,``
        * ``    # --- HuggingfaceKerasClassifier parameters ---``
        * ``    train_input=dict(train_input),``
        * ``    seq_len=20,``
        * ``    # --- TF training parameters ---``
        * ``    optimizer=tf.keras.optimizers.Adam(2e-5),``
        * ``    loss=tf.keras.losses.BinaryCrossentropy(),``
        * ``    metrics=['accuracy'],``
        * ``    epochs=10,``
        * ``    batch_size=64,``
        * ``    shuffle=True,``
        * ``    callbacks=[early_stopping],``
        * ``    verbose=True``
        * ``)``
        * ``lnl = CleanLearning(clf=model, cv_n_folds=3)``
        * ``lnl.fit(training_ids, train_labels, clf_kwargs={'validation_data': val_dataset})``
        * ``predictions = lnl.predict(test_input=test_input)``
        * ``print('Accuracy on test data: ', (predictions == test_y).sum() / len(test_y))``

        References:
        - https://towardsdatascience.com/scikeras-tutorial-a-multi-input-multi-output-wrapper-for-capsnet-hyperparameter-tuning-with-keras-3127690f7f28
        - https://www.adriangb.com/scikeras/stable/notebooks/DataTransformers.html#4.-Multiple-inputs

        Parameters
        ----------
        train_input : dictionary or pandas DataFrame
            Tokenized input data. Must contain the following keys/columns (for a more detailed explanation, refer to the `Huggingface documentation <https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizer.__call__>`_):
            - `input_ids`: a list or tensorflow tensor of token ids to be fed to the model. Shape = (num_samples, sequence_len).
            - `attention_mask`: a list or tensorflow tensor of indices specifying which tokens should be attended to by the model. Shape = (num_samples, sequence_len).
        seq_len : int
            Length of the tokenized sequences.
        kwargs : optional
            Optional arguments used to fit a TensorFlow model. Refer to the `TensorFlow documentation <https://www.tensorflow.org/api_docs/python/tf/keras/Model#fit>`_ for a detailed list of arguments.
        """
        super().__init__(**kwargs)
        self.train_input = train_input
        self.seq_len = seq_len
    def split_input(self, X):
        splitted_X = [
            X[:, : self.seq_len],  # input_ids
            X[:, self.seq_len :],  # attention_mask
        ]
        return splitted_X

    @property
    def feature_encoder(self):
        return FunctionTransformer(
            func=self.split_input,
        )

    def _get_tf_input(self, ids, train_input):
        indexed_input_ids = np.array(tf.gather(train_input["input_ids"], indices=ids))
        indexed_attention_mask = np.array(tf.gather(train_input["attention_mask"], indices=ids))
        return np.hstack([indexed_input_ids, indexed_attention_mask])
    def fit(self, ids, y, sample_weight=None, **kwargs):
        """
        Constructs and fits a new model using the given data.

        Parameters
        ----------
        ids : array-like of shape (n_samples,)
            Ids of the training samples used to train the model.
        y : array-like of shape (n_samples,)
            True labels.
        sample_weight : array-like of shape (n_samples,), default=None
            Array of weights that are assigned to individual samples.
            If not provided, each sample is given unit weight.
        **kwargs : Dict[str, Any]
            Extra arguments to route to TensorFlow `Model.fit`.
        """
        X = self._get_tf_input(ids, self.train_input)
        return super().fit(X, y, sample_weight=sample_weight, **kwargs)
    def predict_proba(self, ids=None, test_input=None, **kwargs):
        """
        Returns class probability estimates for a particular set of examples.

        Parameters
        ----------
        ids : optional array-like of shape (n_samples,)
            Indices of training examples for which to produce predictions.
            Exactly one of `ids` or `test_input` must be provided.
        test_input : optional Dict or pandas DataFrame
            Tokenized test data for which to produce predictions.
            Must be in the same format as `train_input`.
            Exactly one of `ids` or `test_input` must be provided.
        **kwargs : Dict[str, Any]
            Additional arguments to route to TensorFlow `Model.predict`.
        """
        if ids is not None and test_input is not None:
            raise ValueError("Only one of `ids` or `test_input` may be provided")
        if ids is not None:
            X = self._get_tf_input(ids, self.train_input)
        elif test_input is not None:
            X = self._get_tf_input(np.arange(len(test_input["input_ids"])), test_input)
        else:
            raise ValueError("One of `ids` or `test_input` must be provided")
        return super().predict_proba(X, **kwargs)
    def predict(self, ids=None, test_input=None, **kwargs):
        """
        Returns predictions for a particular set of examples.

        Parameters
        ----------
        ids : optional array-like of shape (n_samples,)
            Indices of training examples for which to produce predictions.
            Exactly one of `ids` or `test_input` must be provided.
        test_input : optional Dict or pandas DataFrame
            Tokenized test data for which to produce predictions.
            Must be in the same format as `train_input`.
            Exactly one of `ids` or `test_input` must be provided.
        **kwargs : Dict[str, Any]
            Additional arguments to route to TensorFlow `Model.predict`.
        """
        if ids is not None and test_input is not None:
            raise ValueError("Only one of `ids` or `test_input` may be provided")
        if ids is not None:
            X = self._get_tf_input(ids, self.train_input)
        elif test_input is not None:
            X = self._get_tf_input(np.arange(len(test_input["input_ids"])), test_input)
        else:
            raise ValueError("One of `ids` or `test_input` must be provided")
        return super().predict(X, **kwargs)
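To make the multi-input workaround in the class above easier to follow: `_get_tf_input` packs `input_ids` and `attention_mask` side by side into a single 2D array (the flat `(num_samples, feature_size)` shape scikeras expects), and `split_input` (applied through the `feature_encoder` FunctionTransformer) splits that array back into the two inputs before they reach the Keras model. A minimal standalone sketch of that round trip, using plain numpy with made-up toy values:

```python
import numpy as np

seq_len = 4
# toy tokenized batch of 2 examples (values are arbitrary)
input_ids = np.array([[101, 2023, 2003, 102],
                      [101, 2307, 0, 0]])
attention_mask = np.array([[1, 1, 1, 1],
                           [1, 1, 0, 0]])

# what _get_tf_input does: stack the two inputs side by side into one
# (num_samples, 2 * seq_len) array so scikeras sees a single flat input
X = np.hstack([input_ids, attention_mask])
assert X.shape == (2, 2 * seq_len)

# what split_input does (via the feature_encoder FunctionTransformer):
# recover the two separate inputs right before the multi-input Keras model
recovered_ids, recovered_mask = X[:, :seq_len], X[:, seq_len:]
assert (recovered_ids == input_ids).all()
assert (recovered_mask == attention_mask).all()
```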
Add a notebook showcasing the use of KerasWrapperModel (added in https://github.com/cleanlab/cleanlab/pull/434), which makes Keras models compatible with CleanLearning.
Adapted from the example code of this comment in issue https://github.com/cleanlab/cleanlab/issues/372.
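For the notebook itself, something like the following minimal sketch could be the starting point. It uses a simple single-input Keras model on toy text data so it stays self-contained; the `KerasWrapperModel` keyword names used here (`model_kwargs`, `compile_kwargs`) and the exact import path are assumptions on my part and should be double-checked against the wrapper actually added in https://github.com/cleanlab/cleanlab/pull/434:

```python
import numpy as np
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from cleanlab.classification import CleanLearning
from cleanlab.models.keras import KerasWrapperModel  # wrapper from PR #434 (import path assumed)

def build_dense_model(num_features, num_classes):
    # plain single-input Keras model: (n_samples, num_features) -> class probabilities
    return tf.keras.Sequential([
        tf.keras.layers.Input(shape=(num_features,)),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(num_classes, activation="softmax"),
    ])

texts = ["great movie", "terrible plot", "loved it", "awful acting"]  # toy data
labels = np.array([1, 0, 1, 0])
X = TfidfVectorizer().fit_transform(texts).toarray().astype("float32")

# KerasWrapperModel is meant to make the Keras model look like a scikit-learn
# classifier, so CleanLearning can cross-validate it to find label issues.
clf = KerasWrapperModel(
    model=build_dense_model,
    model_kwargs={"num_features": X.shape[1], "num_classes": 2},  # assumed keyword
    compile_kwargs={                                              # assumed keyword
        "optimizer": "adam",
        "loss": tf.keras.losses.SparseCategoricalCrossentropy(),
        "metrics": ["accuracy"],
    },
)

cl = CleanLearning(clf=clf, cv_n_folds=2)
# clf_kwargs are forwarded by CleanLearning to the wrapped classifier's fit(),
# assuming the wrapper in turn passes them on to keras Model.fit
cl.fit(X, labels, clf_kwargs={"epochs": 5, "verbose": 0})
label_issues = cl.get_label_issues()
predictions = cl.predict(X)
```

The actual notebook would of course swap the toy TF-IDF features for the IMDB/BERT pipeline shown in the reference code above.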