Closed mbkrafft closed 3 years ago
Hi,
Where did you set the metric? Could you share the configs you used? I just tried a GLUE task with micro-f1, and it runs fine here.
Thanks for answering. It's for a custom dataset and I'm trying to use macro-f1, but I couldn't get it to work with either metric.
Dataset config:
{
"NoReC_fine": {
"train_data_path": "../MASTER/Code/data/datasets/no/NoReC_fine/conll/train.conll",
"validation_data_path": "../MASTER/Code/data/datasets/no/NoReC_fine/conll/dev.conll",
"word_idx": 0,
"tasks": {
"agg_label": {
"task_type": "classification",
"column_idx": -1
},
"target": {
"task_type": "seq_bio",
"column_idx": 2
},
"polar_expression": {
"task_type": "seq_bio",
"column_idx": 3
}
}
},
"NoReC_neg": {
"train_data_path": "../MASTER/Code/data/datasets/no/NoReC_neg/conll/train.conll",
"validation_data_path": "../MASTER/Code/data/datasets/no/NoReC_neg/conll/dev.conll",
"word_idx": 0,
"tasks": {
"scope": {
"task_type": "seq_bio",
"column_idx": 1
},
"cue": {
"task_type": "seq_bio",
"column_idx": 2
}
}
}
}
Params config:
// Model / training parameters for the MaChAmp run discussed in this thread.
// NOTE(review): max_len, transformer_model and transformer_dim are used below as
// bare identifiers but are not defined anywhere in this snippet — presumably they
// are declared as Jsonnet locals above the pasted part; confirm before reuse.
{
"dataset_reader": {
"type": "machamp_universal_reader",
"target_max_tokens": max_len,
"source_max_tokens": max_len,
"do_lowercase": false,
"token_indexers": {
"tokens": {
"type": "pretrained_transformer_mixmatched",
"max_length": max_len,
"model_name": transformer_model
}
},
"tokenizer": {
"type": "pretrained_transformer",
"add_special_tokens": false,
"model_name": transformer_model
},
"target_token_indexers": {
"tokens": {
"namespace": "target_words"
}
}
},
"vocabulary": {
"max_vocab_size": {"target_words": 32000},
"min_count": {
"source_words": 1,
"target_words": 1
}
},
"model": {
"type": "machamp_model",
"dataset_embeds_dim": 0,
// One entry per task_type name (the dataset config in this thread uses
// "classification" and "seq_bio"). The "default" entry has no "type" of its own —
// presumably its values are shared by the other decoders; confirm.
"decoders": {
"classification": {
"type": "machamp_sentence_decoder",
// F1 metric for classification tasks; this is the setting the issue is about.
"metric": "micro-f1"
},
"default": {
"input_dim": transformer_dim,
"loss_weight": 1,
"order": 1
},
"dependency": {
"type": "machamp_dependency_decoder",
"arc_representation_dim": transformer_dim,
"tag_representation_dim": 256,
"use_mst_decoding_for_validation": true
},
"multiseq": {
"type": "machamp_multiseq_decoder",
"metric": "multi_span_f1"
},
"seq": {
"type": "machamp_tag_decoder"
},
"seq2seq": {
"type": "machamp_seq2seq_decoder",
"attention": "dot_product",
"beam_size": 6,
"max_decoding_steps": 128,
"target_decoder_layers": 2,
"target_embedding_dim": 512,
"use_bleu": true
},
// CRF decoder scored with span_f1; selected by tasks whose task_type is "seq_bio".
"seq_bio": {
"type": "machamp_crf_decoder",
"metric": "span_f1"
},
"string2string": {
"type": "machamp_tag_decoder"
},
"mlm": {
"type": "machamp_mlm_decoder",
"pretrained_model": transformer_model
}
},
"default_max_sents": 0,
"dropout": 0.3,
"encoder": {
"type": "cls_pooler",
"cls_is_last_token": false,
"embedding_dim": transformer_dim
},
"text_field_embedder": {
"type": "basic",
"token_embedders": {
"tokens": {
"type": "machamp_pretrained_transformer_mismatched",
"last_layer_only": true,
"max_length": max_len,
"model_name": transformer_model,
"train_parameters": true
}
}
}
},
"data_loader": {
"batch_sampler": {
"type": "dataset_buckets",
"max_tokens": 1024,
"batch_size": 32,
"sampling_smoothing": 1.0, //1.0 == original size
"sorting_keys": [
"tokens"
]
}
},
"trainer": {
"checkpointer": {
"num_serialized_models_to_keep": 1
},
//"use_amp": true, // could save some memory on gpu
"grad_norm": 1,
"learning_rate_scheduler": {
"type": "slanted_triangular",
"cut_frac": 0.2,
"decay_factor": 0.38,
"discriminative_fine_tuning": true,
"gradual_unfreezing": true
},
"num_epochs": 3,
"optimizer": {
"type": "huggingface_adamw",
"betas": [0.9, 0.99],
"correct_bias": false,
"lr": 0.0001,
// Two parameter groups, each a [list of name regexes, overrides] pair; both
// override dicts are empty here.
"parameter_groups": [
[
[
"^_text_field_embedder.*.transformer_model.*"
],
{}
],
[
[
"^decoders.*",
"dataset_embedder.*"
],
{}
]
],
"weight_decay": 0.01
},
//"patience": 5, // disabled, because slanted_triangular changes the lr dynamically
// NOTE(review): the leading "+" presumably marks the metric as higher-is-better and
// ".run/.sum" presumably aggregates the per-task ".run/<task>/<metric>" values — confirm.
"validation_metric": "+.run/.sum"
},
"datasets_for_vocab_creation": [
"train",
"validation"//TODO can this be removed now that we add padding/unknown?
]
}
Dataset 1 example:
# sent_id: 104987-17-02
# text: Den er jammen like briljant underholdende også .
# polarity: Positive
# agg_label: Strong Positive
Den O B-Target O
er O O O
jammen O O O
like O O O
briljant O O B-Polar_expression
underholdende O O I-Polar_expression
også O O O
. O O O
Dataset 2 example:
# sent_id: 002051-02-02
# text: Bare synd at musikken de spiller er så ribbet for personlighet .
Bare O O
synd O O
at O O
musikken B-Scope O
de I-Scope O
spiller I-Scope O
er I-Scope O
så I-Scope O
ribbet O B-Cue
for O I-Cue
personlighet B-Scope O
. O O
Thanks for the details! I have managed to reproduce the error.
This happens when multiple datasets are used in combination with an F1 score. I have implemented a workaround now; if you pull again it should work (if it doesn't, please let me know).
PS: We have found in multiple projects (and I've read it in other papers) that the normal seq setup outperforms seq-bio (CRF), even for BIO-labeled data (https://robvanderg.github.io/doc/naacl2021.pdf, https://www.aclweb.org/anthology/2020.coling-main.583.pdf). In other words, a softmax layer is enough for BERT-based models in these setups, and a CRF on top does not lead to a further increase in performance.
PS2: You can also define the metric in the dataset configuration, so you can set it per task. I've just updated the README on this: https://github.com/machamp-nlp/machamp/blob/master/docs/metrics.md
Thanks for the recommendation, I'll try using the softmax layer instead!
The metrics work with the seq setup now, but not with the seq-bio setup.
I'm getting this error:
File "/Users/-/Desktop/machamp/machamp/models/crf_decoder.py", line 244, in get_metrics
if metric._true_positive_sum == None:
AttributeError: 'SpanBasedF1Measure' object has no attribute '_true_positive_sum'
I did a quick fix in the crf_decoder to make it work, though:
@overrides
def get_metrics(self, reset: bool = False) -> Dict[str, float]:
    """Report the current value of every metric registered on this decoder.

    Each entry of ``self.metrics`` is queried with ``get_metric(reset)`` and the
    result is stored under the key ``.run/<task>/<metric name>``.

    Args:
        reset: Forwarded unchanged to each metric's ``get_metric`` call.

    Returns:
        A dict mapping ``.run/{self.task}/{metric_name}`` to whatever the metric
        returned. If a metric raises ``RuntimeError``, the error is printed and a
        precision/recall/fscore dict of zeros is used for that metric instead.
    """
    def _value_or_zeros(metric):
        # Best-effort: a metric that cannot be computed must not abort the run,
        # so the error is only printed and zeros are reported in its place.
        try:
            return metric.get_metric(reset)
        except RuntimeError as err:
            print(err)
            return {'precision':0.0, 'recall': 0.0, 'fscore': 0.0}

    return {
        f".run/{self.task}/{metric_name}": _value_or_zeros(metric)
        for metric_name, metric in self.metrics.items()
    }
Thanks for the update!
Hey, when I use either macro-f1 or micro-f1 as the metric for classification tasks, this error occurs. Accuracy works fine, but the F1 scores do not. I'll paste the stack trace below. Any ideas on how to fix this?