huggingface / setfit

Efficient few-shot learning with Sentence Transformers
https://hf.co/docs/setfit
Apache License 2.0

setfit cannot get good results for Chinese? #539

Open guangdongliang opened 4 months ago

guangdongliang commented 4 months ago

The dataset has 4 labels. The eval loss increases while the train loss decreases, so I think the model is overfitting. The code is below:

```python

import ujson as json
from datasets import Dataset, DatasetDict
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitModelCardData, SetFitTrainer, sample_dataset

def convert_files_to_dataset(train_path, val_path):
    # Helper that reads a single JSON-lines file
    def read_file(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
            data = [json.loads(line.strip()) for line in lines]
        return data

    # Read the training and validation sets
    train_data = read_file(train_path)
    val_data = read_file(val_path)

    # Convert the data into Dataset objects
    train_dataset = Dataset.from_list(train_data)
    val_dataset = Dataset.from_list(val_data)

    # Create the DatasetDict
    dataset_dict = DatasetDict({'train': train_dataset, 'validation': val_dataset})

    return dataset_dict

# Build the DatasetDict from the two files
train_path = 'train_cn.txt'
val_path = 'val_cn.txt'
dataset = convert_files_to_dataset(train_path, val_path)

train_dataset = sample_dataset(dataset["train"], num_samples=50)  # 50 per label * 4 labels = 200 rows
print(train_dataset)

eval_dataset = dataset["validation"]
print(eval_dataset)

model = SetFitModel.from_pretrained(
    'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
    # model_card_data only affects the generated model card, not training
    model_card_data=SetFitModelCardData(language=['en', 'de', 'nl']),
)

trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss_class=CosineSimilarityLoss,
    num_iterations=20,  # 20 iterations * 200 samples * 2 = 8000 contrastive pairs (see log)
    num_epochs=5,
)
trainer.train()
metrics = trainer.evaluate()
print(metrics)
```
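For reference, `read_file` expects one JSON object per line in `train_cn.txt` / `val_cn.txt`, with `text` and `label` keys (matching the `['text', 'label']` features in the log below). These lines are made-up examples of the format, with the 4 labels encoded as 0-3:

```
{"text": "这款手机的电池续航非常好", "label": 0}
{"text": "物流太慢了，等了一个多星期才到", "label": 1}
{"text": "客服态度一般，但问题最后解决了", "label": 2}
{"text": "包装破损，里面的东西也坏了", "label": 3}
```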

The training log is below:

```
Dataset({ features: ['text', 'label'], num_rows: 200 })
Dataset({ features: ['text', 'label'], num_rows: 40 })
/usr/local/matrix/conda3/envs/peft/lib/python3.8/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: resume_download is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use force_download=True.
  warnings.warn(
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
b.py:51: DeprecationWarning: SetFitTrainer has been deprecated and will be removed in v2.0.0 of SetFit. Please use Trainer instead.
  trainer = SetFitTrainer(
Using evaluation_strategy="steps" as eval_steps is defined.
Map: 100%|██████████| 200/200 [00:00<00:00, 12981.44 examples/s]
Running training
  Num unique pairs = 8000
  Batch size = 16
  Num epochs = 5
  Total optimization steps = 2500
  0%|          | 0/2500 [00:00<?, ?it/s]
{'embedding_loss': 0.3173, 'learning_rate': 8e-08, 'epoch': 0.0}
{'embedding_loss': 0.2875, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.1}
{'eval_embedding_loss': 0.236, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.1}
{'embedding_loss': 0.2662, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.2}
{'eval_embedding_loss': 0.2351, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.2}
{'embedding_loss': 0.2741, 'learning_rate': 1.2e-05, 'epoch': 0.3}
{'eval_embedding_loss': 0.2319, 'learning_rate': 1.2e-05, 'epoch': 0.3}
{'embedding_loss': 0.2747, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.4}
{'eval_embedding_loss': 0.2331, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.4}
{'embedding_loss': 0.196, 'learning_rate': 2e-05, 'epoch': 0.5}
{'eval_embedding_loss': 0.2297, 'learning_rate': 2e-05, 'epoch': 0.5}
{'embedding_loss': 0.1512, 'learning_rate': 1.9555555555555557e-05, 'epoch': 0.6}
{'eval_embedding_loss': 0.2387, 'learning_rate': 1.9555555555555557e-05, 'epoch': 0.6}
{'embedding_loss': 0.0866, 'learning_rate': 1.9111111111111113e-05, 'epoch': 0.7}
{'eval_embedding_loss': 0.248, 'learning_rate': 1.9111111111111113e-05, 'epoch': 0.7}
{'embedding_loss': 0.0437, 'learning_rate': 1.866666666666667e-05, 'epoch': 0.8}
{'eval_embedding_loss': 0.2427, 'learning_rate': 1.866666666666667e-05, 'epoch': 0.8}
{'embedding_loss': 0.07, 'learning_rate': 1.8222222222222224e-05, 'epoch': 0.9}
{'eval_embedding_loss': 0.2474, 'learning_rate': 1.8222222222222224e-05, 'epoch': 0.9}
{'embedding_loss': 0.0332, 'learning_rate': 1.7777777777777777e-05, 'epoch': 1.0}
{'eval_embedding_loss': 0.2587, 'learning_rate': 1.7777777777777777e-05, 'epoch': 1.0}
{'embedding_loss': 0.0125, 'learning_rate': 1.7333333333333336e-05, 'epoch': 1.1}
{'eval_embedding_loss': 0.2573, 'learning_rate': 1.7333333333333336e-05, 'epoch': 1.1}
{'embedding_loss': 0.0023, 'learning_rate': 1.688888888888889e-05, 'epoch': 1.2}
{'eval_embedding_loss': 0.2648, 'learning_rate': 1.688888888888889e-05, 'epoch': 1.2}
{'embedding_loss': 0.0033, 'learning_rate': 1.6444444444444444e-05, 'epoch': 1.3}
{'eval_embedding_loss': 0.2659, 'learning_rate': 1.6444444444444444e-05, 'epoch': 1.3}
{'embedding_loss': 0.0011, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.4}
{'eval_embedding_loss': 0.2692, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.4}
{'embedding_loss': 0.0007, 'learning_rate': 1.555555555555556e-05, 'epoch': 1.5}
{'eval_embedding_loss': 0.2687, 'learning_rate': 1.555555555555556e-05, 'epoch': 1.5}
{'embedding_loss': 0.001, 'learning_rate': 1.5111111111111112e-05, 'epoch': 1.6}
{'eval_embedding_loss': 0.2739, 'learning_rate': 1.5111111111111112e-05, 'epoch': 1.6}
{'embedding_loss': 0.0012, 'learning_rate': 1.4666666666666666e-05, 'epoch': 1.7}
{'eval_embedding_loss': 0.2707, 'learning_rate': 1.4666666666666666e-05, 'epoch': 1.7}
{'embedding_loss': 0.0005, 'learning_rate': 1.4222222222222224e-05, 'epoch': 1.8}
{'eval_embedding_loss': 0.2684, 'learning_rate': 1.4222222222222224e-05, 'epoch': 1.8}
{'embedding_loss': 0.0006, 'learning_rate': 1.377777777777778e-05, 'epoch': 1.9}
{'eval_embedding_loss': 0.2756, 'learning_rate': 1.377777777777778e-05, 'epoch': 1.9}
{'embedding_loss': 0.0003, 'learning_rate': 1.3333333333333333e-05, 'epoch': 2.0}
{'eval_embedding_loss': 0.2698, 'learning_rate': 1.3333333333333333e-05, 'epoch': 2.0}
{'embedding_loss': 0.0007, 'learning_rate': 1.288888888888889e-05, 'epoch': 2.1}
{'eval_embedding_loss': 0.2745, 'learning_rate': 1.288888888888889e-05, 'epoch': 2.1}
{'embedding_loss': 0.0004, 'learning_rate': 1.2444444444444446e-05, 'epoch': 2.2}
{'eval_embedding_loss': 0.2771, 'learning_rate': 1.2444444444444446e-05, 'epoch': 2.2}
{'embedding_loss': 0.0005, 'learning_rate': 1.2e-05, 'epoch': 2.3}
{'eval_embedding_loss': 0.2742, 'learning_rate': 1.2e-05, 'epoch': 2.3}
{'embedding_loss': 0.0007, 'learning_rate': 1.1555555555555556e-05, 'epoch': 2.4}
{'eval_embedding_loss': 0.2719, 'learning_rate': 1.1555555555555556e-05, 'epoch': 2.4}
{'embedding_loss': 0.0002, 'learning_rate': 1.1111111111111113e-05, 'epoch': 2.5}
{'eval_embedding_loss': 0.2782, 'learning_rate': 1.1111111111111113e-05, 'epoch': 2.5}
{'embedding_loss': 0.0002, 'learning_rate': 1.0666666666666667e-05, 'epoch': 2.6}
{'eval_embedding_loss': 0.2721, 'learning_rate': 1.0666666666666667e-05, 'epoch': 2.6}
{'embedding_loss': 0.0002, 'learning_rate': 1.0222222222222223e-05, 'epoch': 2.7}
{'eval_embedding_loss': 0.2743, 'learning_rate': 1.0222222222222223e-05, 'epoch': 2.7}
{'embedding_loss': 0.0003, 'learning_rate': 9.777777777777779e-06, 'epoch': 2.8}
{'eval_embedding_loss': 0.2822, 'learning_rate': 9.777777777777779e-06, 'epoch': 2.8}
{'embedding_loss': 0.0003, 'learning_rate': 9.333333333333334e-06, 'epoch': 2.9}
{'eval_embedding_loss': 0.2758, 'learning_rate': 9.333333333333334e-06, 'epoch': 2.9}
{'embedding_loss': 0.0004, 'learning_rate': 8.888888888888888e-06, 'epoch': 3.0}
{'eval_embedding_loss': 0.2764, 'learning_rate': 8.888888888888888e-06, 'epoch': 3.0}
{'embedding_loss': 0.0004, 'learning_rate': 8.444444444444446e-06, 'epoch': 3.1}
{'eval_embedding_loss': 0.2798, 'learning_rate': 8.444444444444446e-06, 'epoch': 3.1}
{'embedding_loss': 0.0002, 'learning_rate': 8.000000000000001e-06, 'epoch': 3.2}
{'eval_embedding_loss': 0.2769, 'learning_rate': 8.000000000000001e-06, 'epoch': 3.2}
{'embedding_loss': 0.0004, 'learning_rate': 7.555555555555556e-06, 'epoch': 3.3}
{'eval_embedding_loss': 0.2766, 'learning_rate': 7.555555555555556e-06, 'epoch': 3.3}
{'embedding_loss': 0.0002, 'learning_rate': 7.111111111111112e-06, 'epoch': 3.4}
{'eval_embedding_loss': 0.2833, 'learning_rate': 7.111111111111112e-06, 'epoch': 3.4}
{'embedding_loss': 0.0002, 'learning_rate': 6.666666666666667e-06, 'epoch': 3.5}
{'eval_embedding_loss': 0.2755, 'learning_rate': 6.666666666666667e-06, 'epoch': 3.5}
```
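(Aside: the DeprecationWarning in the log points to the newer `Trainer` API. A minimal sketch of the same run ported over, assuming a setfit v1.x where `TrainingArguments` still accepts `num_iterations` alongside the `evaluation_strategy`/`eval_steps` pair shown in the log; `CosineSimilarityLoss` is already the default loss there, and `eval_steps=50` is just an illustrative value:)

```python
from setfit import Trainer, TrainingArguments

args = TrainingArguments(
    batch_size=16,
    num_epochs=1,                  # fewer embedding epochs; eval loss above starts rising within epoch 1
    num_iterations=20,             # kept from the original run (may be version-dependent): 20 * 200 * 2 = 8000 pairs
    evaluation_strategy="steps",   # evaluate during training, as in the log above
    eval_steps=50,                 # illustrative value
)

trainer = Trainer(
    model=model,                   # model, train_dataset, eval_dataset as defined in the snippet above
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer.train()
print(trainer.evaluate())
```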

kgourgou commented 4 months ago

I would decrease the num_iterations parameter to 5 and see what the behaviour is there.
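Concretely, that only changes the trainer construction from the snippet above; everything else stays the same:

```python
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss_class=CosineSimilarityLoss,
    num_iterations=5,   # down from 20: 5 * 200 * 2 = 2000 contrastive pairs instead of 8000
    num_epochs=5,
)
```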