Open BudBudding opened 2 months ago
嗨,您好,感谢您对本项目的关注。
这是我的一个例子,我是按照DuEEData的格式修改的(DuEEData是DuEE-Fin),示例如下:{ "id": "VOA_EN_NW_2016.04.19.3291980_14", "text": "Laws seen as undermining LGBT rights in several Southern states have sparked protests and boycott threats from businesses and entertainers, as well as counterdemonstrations by conservatives", "event_list": [ { "event_type": "Conflict:Demonstrate", "trigger": "protests", "trigger_start_index": 12, "arguments": [ { "argument_start_index": 8, "role": "Place", "argument": "Southern states" } ], "class": "Conflict:Demonstrate" } ], "img_url": [ "VOA_EN_NW_2016.04.19.3291980_1.jpg", "VOA_EN_NW_2016.04.19.3291980_3.jpg", "VOA_EN_NW_2016.04.19.3291980_5.jpg", "VOA_EN_NW_2016.04.19.3291980_0.jpg", "VOA_EN_NW_2016.04.19.3291980_4.jpg", "VOA_EN_NW_2016.04.19.3291980_2.jpg" ] }
了解了。您需要根据Doc2EDAG的格式重新整理数据格式,对于英文数据来说,可以参考下面的例子(这里是一条数据)。注意里面的span都是按空格tokenize之后的indices。此外,如果是自有数据,则需要根据事件类型模板自行调整template,实现方案可参考这个文件夹:https://github.com/Spico197/DocEE/tree/main/dee/event_types
[
"scenario_en_kairos_14",
{
"sentences": [
"As of early Tuesday there was no claim of responsibility . Prayuth Chan - ocha , the head of Thailand \u2019 s military government , said that the authorities were searching for a person seen on closed - circuit footage but that it was not clear who the person was , news agencies reported .",
"A spokesman for the police , Lt . Gen . Prawut Thavornsiri , told a Thai television interviewer that \u201c we haven \u2019 t concluded anything . \u201d The authorities said they were reviewing footage from 15 security cameras in the area but that the rush - hour crowds made deciphering the video difficult .",
"\u201c The shrine was very crowded , \u201d General Prawut said . \u201c It \u2019 s not clear even looking at the CCTV footage . \u201d",
"The bomb , General Prawut said , was placed under a bench on the outer rim of the shrine \u2019 s grounds . Initially , the police said they had discovered at least two additional devices that they suspected were unexploded bombs inside the shrine and said other bombs may have been placed in the area , yelling at bystanders : \u201c Get out ! Get out ! \u201d"
],
"ann_valid_mspans": [
"Prayuth Chan - ocha",
"head",
"Thailand",
"military",
"government",
"authorities",
"person",
"who",
"spokesman",
"police",
"Gen",
"Prawut Thavornsiri",
"Thai",
"television",
"interviewer",
"we",
"they",
"area",
"crowds",
"shrine",
"General",
"Prawut",
"bomb",
"devices",
"that",
"bombs",
"bystanders",
"news agencies",
"outer rim of the shrine \u2019 s grounds",
"discovered",
"reviewing",
"searching"
],
"ann_valid_dranges": [
[
0,
11,
15
],
[
0,
17,
18
],
[
0,
19,
20
],
[
0,
22,
23
],
[
0,
23,
24
],
[
0,
28,
29
],
[
1,
29,
30
],
[
0,
33,
34
],
[
0,
48,
49
],
[
0,
46,
47
],
[
1,
1,
2
],
[
1,
4,
5
],
[
3,
26,
27
],
[
1,
8,
9
],
[
1,
10,
12
],
[
1,
15,
16
],
[
1,
16,
17
],
[
1,
17,
18
],
[
1,
20,
21
],
[
1,
31,
32
],
[
3,
28,
29
],
[
3,
37,
38
],
[
1,
41,
42
],
[
3,
55,
56
],
[
1,
48,
49
],
[
2,
2,
3
],
[
3,
44,
45
],
[
2,
8,
9
],
[
3,
3,
4
],
[
2,
9,
10
],
[
3,
4,
5
],
[
3,
1,
2
],
[
3,
35,
36
],
[
3,
36,
37
],
[
3,
41,
42
],
[
3,
48,
49
],
[
3,
59,
60
],
[
0,
51,
53
],
[
3,
14,
22
],
[
3,
30,
31
],
[
1,
33,
34
],
[
0,
30,
31
]
],
"ann_mspan2dranges": {
"Prayuth Chan - ocha": [
[
0,
11,
15
]
],
"head": [
[
0,
17,
18
]
],
"Thailand": [
[
0,
19,
20
]
],
"military": [
[
0,
22,
23
]
],
"government": [
[
0,
23,
24
]
],
"authorities": [
[
0,
28,
29
],
[
1,
29,
30
]
],
"person": [
[
0,
33,
34
],
[
0,
48,
49
]
],
"who": [
[
0,
46,
47
]
],
"spokesman": [
[
1,
1,
2
]
],
"police": [
[
1,
4,
5
],
[
3,
26,
27
]
],
"Gen": [
[
1,
8,
9
]
],
"Prawut Thavornsiri": [
[
1,
10,
12
]
],
"Thai": [
[
1,
15,
16
]
],
"television": [
[
1,
16,
17
]
],
"interviewer": [
[
1,
17,
18
]
],
"we": [
[
1,
20,
21
]
],
"they": [
[
1,
31,
32
],
[
3,
28,
29
],
[
3,
37,
38
]
],
"area": [
[
1,
41,
42
],
[
3,
55,
56
]
],
"crowds": [
[
1,
48,
49
]
],
"shrine": [
[
2,
2,
3
],
[
3,
44,
45
]
],
"General": [
[
2,
8,
9
],
[
3,
3,
4
]
],
"Prawut": [
[
2,
9,
10
],
[
3,
4,
5
]
],
"bomb": [
[
3,
1,
2
]
],
"devices": [
[
3,
35,
36
]
],
"that": [
[
3,
36,
37
]
],
"bombs": [
[
3,
41,
42
],
[
3,
48,
49
]
],
"bystanders": [
[
3,
59,
60
]
],
"news agencies": [
[
0,
51,
53
]
],
"outer rim of the shrine \u2019 s grounds": [
[
3,
14,
22
]
],
"discovered": [
[
3,
30,
31
]
],
"reviewing": [
[
1,
33,
34
]
],
"searching": [
[
0,
30,
31
]
]
},
"ann_mspan2guess_field": {
"Prayuth Chan - ocha": "PER",
"head": "PER",
"Thailand": "GPE",
"military": "ORG",
"government": "ORG",
"authorities": "PER",
"person": "PER",
"who": "PER",
"spokesman": "PER",
"police": "PER",
"Gen": "PER",
"Prawut Thavornsiri": "PER",
"Thai": "GPE",
"television": "ORG",
"interviewer": "PER",
"we": "PER",
"they": "PER",
"area": "LOC",
"crowds": "PER",
"shrine": "FAC",
"General": "PER",
"Prawut": "PER",
"bomb": "WEA",
"devices": "WEA",
"that": "WEA",
"bombs": "WEA",
"bystanders": "PER",
"news agencies": "ORG",
"outer rim of the shrine \u2019 s grounds": "LOC",
"discovered": "Trigger",
"reviewing": "Trigger",
"searching": "Trigger"
},
"recguid_eventname_eventdict_list": [
[
0,
"Cognitive.IdentifyCategorize.Unspecified",
{
"IdentifiedRole": null,
"Identifier": null,
"IdentifiedObject": null,
"Place": null,
"Trigger": "discovered"
}
],
[
1,
"Cognitive.Inspection.SensoryObserve",
{
"ObservedEntity": null,
"Place": null,
"Observer": null,
"Instrument": null,
"Trigger": "reviewing"
}
],
[
2,
"Cognitive.IdentifyCategorize.Unspecified",
{
"IdentifiedRole": null,
"Identifier": null,
"IdentifiedObject": null,
"Place": null,
"Trigger": "searching"
}
]
],
"doc_type": "m2m"
}
]
我尝试下载了wiki数据集,运行 `bash run_ptpcg_wikievents_wTgg.sh` 后,100轮的结果为:(此处原有的结果未保留)
嗨,抱歉回复晚了。我不记得具体数值了,但印象中模型在英文上的效果确实很差,可能的原因有:
- 直接调用了 `convert_tokens_to_ids`,而没有切成subword,会导致输入序列中存在很多 `[UNK]`,对模型理解影响很大。
您好!我想请教一个问题:我将英文数据集改为了DuEEData的数据格式,并且将启动参数设置为 `--bert_model='/home/pretrained_model/bert_en'`,同时在dee_task.py里面将下面两个分支中的 `self.doc_lang = "zh"` 都修改为了英文 `"en"`:

```python
elif self.run_mode == "dueefin_wo_tgg":
    self.train_file_name = "dueefin_train_wo_tgg.json"
    self.dev_file_name = "dueefin_dev_wo_tgg.json"
    self.test_file_name = "dueefin_submit_wo_tgg.json"
    self.inference_file_name = "dueefin_submit_wo_tgg.json"
    self.doc_lang = "en"
elif self.run_mode == "dueefin_w_tgg":
    self.train_file_name = "dueefin_train_w_tgg.json"
    self.dev_file_name = "dueefin_dev_w_tgg.json"
    self.test_file_name = "dueefin_submit_w_tgg.json"
    self.inference_file_name = "dueefin_submit_w_tgg.json"
    self.doc_lang = "en"
```

修改后运行出现了报错。我想请问一下,将英文数据集改为DuEEData格式来跑这个项目是可行的吗?