Closed loretoparisi closed 2 years ago
Could be this issue related to this SF answer?
The dataset looks like
Dataset({
features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
num_rows: 8256315
})
and features
{'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
'label': ClassLabel(num_classes=403, names=['cmn', 'deu', 'rus', 'fra', 'eng', 'jpn', 'spa', 'ita', 'kor', 'vie', 'nld', 'epo', 'por', 'tur', 'heb', 'hun', 'ell', 'ind', 'ara', 'arz', 'fin', 'bul', 'yue', 'swe', 'ukr', 'bel', 'que', 'ces', 'swh', 'nno', 'wuu', 'nob', 'zsm', 'est', 'kat', 'pol', 'lat', 'urd', 'sqi', 'isl', 'fry', 'afr', 'ron', 'fao', 'san', 'bre', 'tat', 'yid', 'uig', 'uzb', 'srp', 'qya', 'dan', 'pes', 'slk', 'eus', 'cycl', 'acm', 'tgl', 'lvs', 'kaz', 'hye', 'hin', 'lit', 'ben', 'cat', 'bos', 'hrv', 'tha', 'orv', 'cha', 'mon', 'lzh', 'scn', 'gle', 'mkd', 'slv', 'frm', 'glg', 'vol', 'ain', 'jbo', 'tok', 'ina', 'nds', 'mal', 'tlh', 'roh', 'ltz', 'oss', 'ido', 'gla', 'mlt', 'sco', 'ast', 'jav', 'oci', 'ile', 'ota', 'xal', 'tel', 'sjn', 'nov', 'khm', 'tpi', 'ang', 'aze', 'tgk', 'tuk', 'chv', 'hsb', 'dsb', 'bod', 'sme', 'cym', 'mri', 'ksh', 'kmr', 'ewe', 'kab', 'ber', 'tpw', 'udm', 'lld', 'pms', 'lad', 'grn', 'mlg', 'xho', 'pnb', 'grc', 'hat', 'lao', 'npi', 'cor', 'nah', 'avk', 'mar', 'guj', 'pan', 'kir', 'myv', 'prg', 'sux', 'crs', 'ckt', 'bak', 'zlm', 'hil', 'cbk', 'chr', 'nav', 'lkt', 'enm', 'arq', 'lin', 'abk', 'pcd', 'rom', 'gsw', 'tam', 'zul', 'awa', 'wln', 'amh', 'bar', 'hbo', 'mhr', 'bho', 'mrj', 'ckb', 'osx', 'pfl', 'mgm', 'sna', 'mah', 'hau', 'kan', 'nog', 'sin', 'glv', 'dng', 'kal', 'liv', 'vro', 'apc', 'jdt', 'fur', 'che', 'haw', 'yor', 'crh', 'pdc', 'ppl', 'kin', 'shs', 'mnw', 'tet', 'sah', 'kum', 'ngt', 'nya', 'pus', 'hif', 'mya', 'moh', 'wol', 'tir', 'ton', 'lzz', 'oar', 'lug', 'brx', 'non', 'mww', 'hak', 'nlv', 'ngu', 'bua', 'aym', 'vec', 'ibo', 'tkl', 'bam', 'kha', 'ceb', 'lou', 'fuc', 'smo', 'gag', 'lfn', 'arg', 'umb', 'tyv', 'kjh', 'oji', 'cyo', 'urh', 'kzj', 'pam', 'srd', 'lmo', 'swg', 'mdf', 'gil', 'snd', 'tso', 'sot', 'zza', 'tsn', 'pau', 'som', 'egl', 'ady', 'asm', 'ori', 'dtp', 'cho', 'max', 'kam', 'niu', 'sag', 'ilo', 'kaa', 'fuv', 'nch', 'hoc', 'iba', 'gbm', 'sun', 'war', 'mvv', 'pap', 'ary', 'kxi', 'csb', 'pag', 'cos', 'rif', 'kek', 'krc', 'aii', 'ban', 'ssw', 'tvl', 'mfe', 'tah', 'bvy', 'bcl', 'hnj', 'nau', 'nst', 'afb', 'quc', 'min', 'tmw', 'mad', 'bjn', 'mai', 'cjy', 'got', 'hsn', 'gan', 'tzl', 'dws', 'ldn', 'afh', 'sgs', 'krl', 'vep', 'rue', 'tly', 'mic', 'ext', 'izh', 'sma', 'jam', 'cmo', 'mwl', 'kpv', 'koi', 'bis', 'ike', 'run', 'evn', 'ryu', 'mnc', 'aoz', 'otk', 'kas', 'aln', 'akl', 'yua', 'shy', 'fkv', 'gos', 'fij', 'thv', 'zgh', 'gcf', 'cay', 'xmf', 'tig', 'div', 'lij', 'rap', 'hrx', 'cpi', 'tts', 'gaa', 'tmr', 'iii', 'ltg', 'bzt', 'syc', 'emx', 'gom', 'chg', 'osp', 'stq', 'frr', 'fro', 'nys', 'toi', 'new', 'phn', 'jpa', 'rel', 'drt', 'chn', 'pli', 'laa', 'bal', 'hdn', 'hax', 'mik', 'ajp', 'xqa', 'pal', 'crk', 'mni', 'lut', 'ayl', 'ood', 'sdh', 'ofs', 'nus', 'kiu', 'diq', 'qxq', 'alt', 'bfz', 'klj', 'mus', 'srn', 'guc', 'lim', 'zea', 'shi', 'mnr', 'bom', 'sat', 'szl'], id=None),
'text': Value(dtype='string', id=None),
'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}
[UPDATE] I have changed the tokenize function like
def tokenize_function(batch):
tokens = tokenizer(batch['text'], padding="max_length", truncation=True, max_length=128)
tokens['label'] = features["label"].str2int(batch['label'])
return tokens
tokenized_datasets = sentences.map(tokenize_function, batched=True)
and removed the mapping as defined above, but now I'm facing a None
label issue:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
[<ipython-input-39-3f04e6ec6f6e>](https://localhost:8080/#) in <module>()
14 tokens['label'] = features["label"].str2int(batch['label']) if batch["label"] is not None else None
15 return tokens
---> 16 tokenized_datasets = sentences.map(tokenize, batched=True)
10 frames
[/usr/local/lib/python3.7/dist-packages/datasets/features/features.py](https://localhost:8080/#) in str2int(self, values)
852 if value not in self._str2int:
853 value = str(value).strip()
--> 854 output.append(self._str2int[str(value)])
855 else:
856 # No names provided, try to integerize
KeyError: 'None'
Solved filtering None
rows
sentences = sentences.filter(lambda example: example['label'] is not None and example['text'] is not None)
and slightly changing the tokenizer
from datasets import load_dataset,Features,Value,ClassLabel
class_names = ["cmn","deu","rus","fra","eng","jpn","spa","ita","kor","vie","nld","epo","por","tur","heb","hun","ell","ind","ara","arz","fin","bul","yue","swe","ukr","bel","que","ces","swh","nno","wuu","nob","zsm","est","kat","pol","lat","urd","sqi","isl","fry","afr","ron","fao","san","bre","tat","yid","uig","uzb","srp","qya","dan","pes","slk","eus","cycl","acm","tgl","lvs","kaz","hye","hin","lit","ben","cat","bos","hrv","tha","orv","cha","mon","lzh","scn","gle","mkd","slv","frm","glg","vol","ain","jbo","tok","ina","nds","mal","tlh","roh","ltz","oss","ido","gla","mlt","sco","ast","jav","oci","ile","ota","xal","tel","sjn","nov","khm","tpi","ang","aze","tgk","tuk","chv","hsb","dsb","bod","sme","cym","mri","ksh","kmr","ewe","kab","ber","tpw","udm","lld","pms","lad","grn","mlg","xho","pnb","grc","hat","lao","npi","cor","nah","avk","mar","guj","pan","kir","myv","prg","sux","crs","ckt","bak","zlm","hil","cbk","chr","nav","lkt","enm","arq","lin","abk","pcd","rom","gsw","tam","zul","awa","wln","amh","bar","hbo","mhr","bho","mrj","ckb","osx","pfl","mgm","sna","mah","hau","kan","nog","sin","glv","dng","kal","liv","vro","apc","jdt","fur","che","haw","yor","crh","pdc","ppl","kin","shs","mnw","tet","sah","kum","ngt","nya","pus","hif","mya","moh","wol","tir","ton","lzz","oar","lug","brx","non","mww","hak","nlv","ngu","bua","aym","vec","ibo","tkl","bam","kha","ceb","lou","fuc","smo","gag","lfn","arg","umb","tyv","kjh","oji","cyo","urh","kzj","pam","srd","lmo","swg","mdf","gil","snd","tso","sot","zza","tsn","pau","som","egl","ady","asm","ori","dtp","cho","max","kam","niu","sag","ilo","kaa","fuv","nch","hoc","iba","gbm","sun","war","mvv","pap","ary","kxi","csb","pag","cos","rif","kek","krc","aii","ban","ssw","tvl","mfe","tah","bvy","bcl","hnj","nau","nst","afb","quc","min","tmw","mad","bjn","mai","cjy","got","hsn","gan","tzl","dws","ldn","afh","sgs","krl","vep","rue","tly","mic","ext","izh","sma","jam","cmo","mwl","kpv","koi","bis","ike","run","evn","ryu","mnc","aoz","otk","kas","aln","akl","yua","shy","fkv","gos","fij","thv","zgh","gcf","cay","xmf","tig","div","lij","rap","hrx","cpi","tts","gaa","tmr","iii","ltg","bzt","syc","emx","gom","chg","osp","stq","frr","fro","nys","toi","new","phn","jpa","rel","drt","chn","pli","laa","bal","hdn","hax","mik","ajp","xqa","pal","crk","mni","lut","ayl","ood","sdh","ofs","nus","kiu","diq","qxq","alt","bfz","klj","mus","srn","guc","lim","zea","shi","mnr","bom","sat","szl"]
features = Features({ 'label': ClassLabel(names=class_names), 'text': Value('string')})
num_labels = features['label'].num_classes
data_files = { "train": "train.csv", "test": "test.csv" }
sentences = load_dataset(
"loretoparisi/tatoeba-sentences",
data_files=data_files,
delimiter='\t',
column_names=['label', 'text'],
download_mode="force_redownload")
sentences = sentences.filter(lambda example: example['label'] is not None and example['text'] is not None)
sentences = sentences.shuffle()
from transformers import AutoTokenizer
model_name = 'microsoft/xtremedistil-l6-h256-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
def tokenize(batch):
tokens = tokenizer(batch['text'], padding="max_length", truncation=True, max_length=128)
tokens['label'] = features["label"].str2int(batch['label'])
return tokens
tokenized_datasets = sentences.map(tokenize, batched=True)
full_train_dataset = tokenized_datasets["train"]
full_eval_dataset = tokenized_datasets["test"]
import torch
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(device)
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
model = model.to(device)
import numpy as np
from datasets import load_metric
metric = load_metric("accuracy")
def compute_metrics(eval_pred):
print(eval_pred)
logits, labels = eval_pred
predictions = np.argmax(logits, axis=-1)
return metric.compute(predictions=predictions, references=labels)
from transformers import TrainingArguments
training_args = TrainingArguments("test_trainer",
per_device_train_batch_size=128,
num_train_epochs=24,learning_rate=3e-05,
evaluation_strategy="epoch")
from transformers import Trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=full_train_dataset,
eval_dataset=full_eval_dataset,
compute_metrics=compute_metrics,
)
System Info
Who can help?
@lys
Information
Tasks
examples
folder (such as GLUE/SQuAD, ...)Reproduction
Stack trace:
Expected behavior