Closed ahmedkotb98 closed 3 years ago
You can find out how to create a data set in the examples or in the testing utils. For unlabeled data, simply use -1
instead of the actual label.
when i update transformers_multiclass_classification.py script
and you can find code and error below:
"""Example of a transformer-based active learning multi class text classification.
"""
import logging
import numpy as np
import pandas as pd
from transformers import AutoTokenizer
from small_text.active_learner import PoolBasedActiveLearner
from small_text.initialization import random_initialization_balanced
from small_text.integrations.transformers import TransformerModelArguments
from small_text.integrations.transformers.classifiers.factories import TransformerBasedClassificationFactory
from small_text.query_strategies import PoolExhaustedException, EmptyPoolException
from small_text.query_strategies import RandomSampling
from examples.data.corpus_twenty_news import get_twenty_newsgroups_corpus
from examples.data.example_data_transformers import preprocess_data
from examples.shared import evaluate
TRANSFORMER_MODEL = TransformerModelArguments('distilroberta-base')
data = pd.read_csv("/content/Descriptions - Sheet1.csv")
data = data[['DescriptionEnglish','ScoreEnglish']]
data = data.sample(frac = 1)
def main():
# Active learning parameters
classifier_kwargs = dict({'device': 'cuda'})
clf_factory = TransformerBasedClassificationFactory(TRANSFORMER_MODEL, kwargs=classifier_kwargs)
query_strategy = RandomSampling()
train = data[:500]
test = data[500:]
tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL.model, cache_dir='.cache/')
x_train = preprocess_data(tokenizer, train['DescriptionEnglish'], train['ScoreEnglish'])
y_train = train['ScoreEnglish']
x_test = preprocess_data(tokenizer, test['DescriptionEnglish'], test['ScoreEnglish'])
# Active learner
active_learner = PoolBasedActiveLearner(clf_factory, query_strategy, x_train)
labeled_indices = initialize_active_learner(active_learner, y_train)
try:
perform_active_learning(active_learner, x_train, labeled_indices, x_test)
except PoolExhaustedException:
print('Error! Not enough samples left to handle the query.')
except EmptyPoolException:
print('Error! No more samples left. (Unlabeled pool is empty)')
def perform_active_learning(active_learner, train, labeled_indices, test):
# Perform 10 iterations of active learning...
for i in range(10):
# ...where each iteration consists of labelling 20 samples
q_indices = active_learner.query(num_samples=20)
# Simulate user interaction here. Replace this for real-world usage.
y = train.y[q_indices]
# Return the labels for the current query to the active learner.
active_learner.update(y)
labeled_indices = np.concatenate([q_indices, labeled_indices])
print('Iteration #{:d} ({} samples)'.format(i, len(labeled_indices)))
evaluate(active_learner, train[labeled_indices], test)
def initialize_active_learner(active_learner, y_train):
x_indices_initial = random_initialization_balanced(y_train)
y_initial = np.array([y_train[i] for i in x_indices_initial])
num_classes = 5
active_learner.initialize_data(x_indices_initial, y_initial)
return x_indices_initial
if __name__ == '__main__':
logging.getLogger('small_text').setLevel(logging.INFO)
logging.getLogger('transformers.modeling_utils').setLevel(logging.ERROR)
main()
errorrrr: Downloading: 100% 480/480 [00:00<00:00, 441kB/s] Downloading: 100% 878k/878k [00:00<00:00, 1.84MB/s] Downloading: 100% 446k/446k [00:00<00:00, 1.12MB/s] Downloading: 100% 1.29M/1.29M [00:00<00:00, 2.71MB/s] Downloading: 100% 480/480 [00:00<00:00, 418kB/s] Traceback (most recent call last): File "/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py", line 2898, in get_loc return self._engine.get_loc(casted_key) File "pandas/_libs/index.pyx", line 70, in pandas._libs.index.IndexEngine.get_loc File "pandas/_libs/index.pyx", line 101, in pandas._libs.index.IndexEngine.get_loc File "pandas/_libs/hashtable_class_helper.pxi", line 1032, in pandas._libs.hashtable.Int64HashTable.get_item File "pandas/_libs/hashtable_class_helper.pxi", line 1039, in pandas._libs.hashtable.Int64HashTable.get_item KeyError: 4
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/content/small-text/examples/transformers_multiclass_classification.py", line 89, in
The error is here:
x_train = preprocess_data(tokenizer, train['DescriptionEnglish'], train['ScoreEnglish'])
Make your you pass the same number of texts and labels here. I cannot inspect your DataFrame from the distance, but if you pass two lists with the same amount of items, it should work. (Also I doubt the underlying function from the transformers library supports Pandas at this point, but I am not sure about this.)
The error is here:
x_train = preprocess_data(tokenizer, train['DescriptionEnglish'], train['ScoreEnglish'])
Make your you pass the same number of texts and labels here. I cannot inspect your DataFrame from the distance, but if you pass two lists with the same amount of items, it should work. (Also I doubt the underlying function from the transformers library supports Pandas at this point, but I am not sure about this.) firstly i did shuffle for dataset and split it to test and train but you mean that number of labels = number of example wit label and not label or number of labels = number of examples of label data only and how i make data labeling for non label values?
texts = ['this is document 1', 'this is another document', 'more text']
labels = [0, 1, -1]
x_train = preprocess_data(tokenizer, texts, labels)
In this example the last document would be unlabeled (because the assigned label is -1
).
texts = ['this is document 1', 'this is another document', 'more text'] labels = [0, 1, -1] x_train = preprocess_data(tokenizer, texts, labels)
In this example the last document would be unlabeled (because the assigned label is
-1
).
same error when i use fillna function with -1
Traceback (most recent call last): File "/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py", line 2898, in get_loc return self._engine.get_loc(casted_key) File "pandas/_libs/index.pyx", line 70, in pandas._libs.index.IndexEngine.get_loc File "pandas/_libs/index.pyx", line 101, in pandas._libs.index.IndexEngine.get_loc File "pandas/_libs/hashtable_class_helper.pxi", line 1032, in pandas._libs.hashtable.Int64HashTable.get_item File "pandas/_libs/hashtable_class_helper.pxi", line 1039, in pandas._libs.hashtable.Int64HashTable.get_item KeyError: 12
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/content/small-text/examples/transformers_multiclass_classification.py", line 90, in
thanks for you now i could turn on it and the last question is after training unlabeled values will be labeled?
after training a while i got an error again in train_labels and my train_labels is not nan , i shouldn't put any value in train_labels with -1?
Traceback (most recent call last):
File "/content/small-text/examples/transformers_multiclass_classification.py", line 101, in
Unfortunately, I cannot reproduce this problem. A label value of -1
means "no label" and is fine in this context. I did the same and did not encounter such errors. So I suspect the error is somewhere in your input data. Try to inspect your data again (for example by using a debugger). The error message clearly says that there is a None value somewhere among your labels.
As a side note: Your pasted code and error messages are hard to read. For this I recommend the use of github markdown for this.
if I have a dataset and part of the data set is labeled and other is not how I can path this dataset to label it