import requests
API_URL = "https://rel.cs.ru.nl/api"
text_doc = "BRAMSHAW. OM mile from Brook and Cetilitre, four from Lyndhurst, and six from Romsey."
# Example EL.
el_result = requests.post(API_URL, json={
    "text": text_doc,
    "spans": []
}).json()
Results:
>>> print(el_result)
[
[0, 8, 'BRAMSHAW', 'Bramshaw', 0.38727825954758405, 0.8461521863937378, 'ORG'],
[10, 2, 'OM', 'Order_of_Merit', 0.669421205764566, 0.4000522196292877, 'ORG'],
[23, 5, 'Brook', 'Brook_Island', 0.3527966262762801, 0.6422774791717529, 'PER'],
[54, 9, 'Lyndhurst', 'Electoral_district_of_Lyndhurst', 0.5974524509684982, 0.8372584581375122, 'LOC'],
[78, 6, 'Romsey', 'Romsey', 0.9106794983449681, 0.9339858293533325, 'LOC']
]
Where each result is of the form: [start_char_pos, mention_length, mention, entity_on_wikipedia, confidence_linking, confidence_ner, ner_type]
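For instance, the fields of each result can be unpacked as follows (a minimal sketch; field names follow the description above):
# Unpack each API result into named fields (order as described above).
for start, length, mention, entity, conf_ed, conf_md, tag in el_result:
    print(f"{mention!r} [{start}:{start + length}] -> {entity} "
          f"(linking: {conf_ed:.2f}, NER: {conf_md:.2f}, type: {tag})")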
We provide the identified mentions
as a list of tuples, where the first element is the start character position of the entity, and the second element is the length of the entity.
import requests
API_URL = "https://rel.cs.ru.nl/api"
text_doc = "BRAMSHAW. OM mile from Brook and Cetilitre, four from Lyndhurst, and six from Romsey."
mentions = [(0, 8), (23, 5), (33, 9), (54, 9), (78, 6)]
# Example ED.
ed_result = requests.post(API_URL, json={
    "text": text_doc,
    "spans": mentions
}).json()
Results:
print(ed_result)
[
[0, 8, 'BRAMSHAW', 'Bramshaw', 0.38727825954758405, 0.0, 'NULL'],
[23, 5, 'Brook', 'Brook_Island', 0.41647935676531866, 0.0, 'NULL'],
[54, 9, 'Lyndhurst', 'Lyndhurst,_New_Jersey', 0.7174297794879929, 0.0, 'NULL'],
[78, 6, 'Romsey', 'Romsey', 0.908602895218785, 0.0, 'NULL']
]
Note that there is no link for the mention "Cetilitre": it has been ignored.
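The skipped spans can be recovered by comparing the input spans against the returned ones (a minimal sketch based on the output format above):
# Find which of the provided spans were ignored by the API.
linked_spans = {(r[0], r[1]) for r in ed_result}
ignored = [m for m in mentions if m not in linked_spans]
print(ignored)  # [(33, 9)], i.e. "Cetilitre"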
Using the wiki_2019 data:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from REL.mention_detection import MentionDetection
from REL.utils import process_results
from REL.entity_disambiguation import EntityDisambiguation
from REL.ner import Cmns, load_flair_ner
base_url = "."
wiki_version = "wiki_2019"
def example_preprocessing():
    # User does some stuff, which results in the format below.
    text = "BRAMSHAW. OM mile from Brook and Cetilitre, four from Lyndhurst, and six from Romsey."
    processed = {"example_lwm": [text, []]}
    return processed
input_text = example_preprocessing()
mention_detection = MentionDetection(base_url, wiki_version)
tagger_ner = load_flair_ner("ner-fast")
mentions_dataset, n_mentions = mention_detection.find_mentions(input_text, tagger_ner)
config = {
    "mode": "eval",
    "model_path": base_url + "/" + wiki_version + "/generated/model",
}
model = EntityDisambiguation(base_url, wiki_version, config)
predictions, timing = model.predict(mentions_dataset)
result = process_results(mentions_dataset, predictions, input_text)
Results:
>>> print(input_text)
{'example_lwm': ['BRAMSHAW. OM mile from Brook and Cetilitre, four from Lyndhurst, and six from Romsey.', []]}
>>> print(predictions)
{'example_lwm': [
{'mention': 'Bramshaw', 'prediction': 'Bramshaw', 'candidates': ['Bramshaw', '#UNK#', '#UNK#', '#UNK#', '#UNK#', '#UNK#', '#UNK#'], 'conf_ed': 0.0, 'scores': ['0.4238759', '-0.22848074', '-0.22848074', '-0.22848074', '-0.22848074', '-0.22848074', '-0.22848074']},
{'mention': 'Brook', 'prediction': 'Brook_Island', 'candidates': ['Brook_Island', 'Brook,_Isle_of_Wight', 'Brook,_Kent', 'Brook,_New_Forest', 'Brook_trout', 'Stream', 'Bowling_Brook'], 'conf_ed': 0.0, 'scores': ['0.4285338', '0.41217393', '0.42006987', '0.41040137', '0.4021421', '0.39939427', '0.40175927']},
{'mention': 'Lyndhurst', 'prediction': 'Lyndhurst,_New_Jersey', 'candidates': ['Lyndhurst,_Hampshire', 'Lyndhurst,_Ohio', 'Lyndhurst,_New_Jersey', 'Electoral_district_of_Lyndhurst_(New_South_Wales)', 'Electoral_district_of_Lyndhurst', 'Lyndhurst,_South_Australia', 'Leeds_and_the_Thousand_Islands'], 'conf_ed': 0.0, 'scores': ['0.41701293', '0.41096336', '0.4363626', '0.41699076', '0.4330288', '0.42380357', '0.33699596']},
{'mention': 'Romsey', 'prediction': 'Romsey', 'candidates': ['Romsey', 'Shire_of_Romsey', 'Romsey,_Victoria', 'Romsey_(UK_Parliament_constituency)', 'Romsey_railway_station', 'Romsey_Football_Club', 'Mill_Road,_Cambridge'], 'conf_ed': 0.0, 'scores': ['0.45610768', '0.4045033', '0.41416538', '0.4050362', '0.4061722', '0.4024356', '0.33643818']}
]}
>>> print(result)
{'example_lwm': [
(0, 8, 'BRAMSHAW', 'Bramshaw', 0.0, 0.9271187782287598, 'ORG'),
(23, 5, 'Brook', 'Brook_Island', 0.0, 0.969516396522522, 'LOC'),
(54, 9, 'Lyndhurst', 'Lyndhurst,_New_Jersey', 0.0, 0.9971579313278198, 'LOC'),
(78, 6, 'Romsey', 'Romsey', 0.0, 0.9967823028564453, 'LOC')
]}
Note: the candidates considered by the candidate selection module can be inspected through the predictions (at most seven candidates per mention).
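For example, the ranked candidates can be read directly from the predictions dictionary (a minimal sketch):
# List each mention's candidates ranked by model score.
for doc, doc_preds in predictions.items():
    for p in doc_preds:
        ranked = sorted(
            zip(p["candidates"], map(float, p["scores"])),
            key=lambda cs: cs[1],
            reverse=True,
        )
        print(p["mention"], "->", ranked)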
Two points on that:
Using the same script as in the previous comment, with the following changes to REL/entity_disambiguation.py:
diff --git a/REL/entity_disambiguation.py b/REL/entity_disambiguation.py
index 70a43ee..5e87937 100644
--- a/REL/entity_disambiguation.py
+++ b/REL/entity_disambiguation.py
@@ -8,6 +8,8 @@ from pathlib import Path
 from random import shuffle
 from typing import Any, Dict
 from urllib.parse import urlparse
+import urllib
+import pandas as pd
 import numpy as np
 import pkg_resources
@@ -30,11 +32,20 @@ for the ED step.
 wiki_prefix = "en.wikipedia.org/wiki/"
+path = '/resources/wikipedia/extractedResources/'
+with open(path+'wikipedia2wikidata.json', 'r') as f:
+    wikipedia2wikidata = json.load(f)
+
+df = pd.read_csv('/resources/wikidata/wikidata_gazetteer.csv', low_memory=False, usecols=['wikidata_id'])
+wkdt_allcands = set(df["wikidata_id"].tolist())
+
+wikipedia_locs = set(dict(filter(lambda x: x[1] in wkdt_allcands, wikipedia2wikidata.items())).keys())
 class EntityDisambiguation:
     def __init__(self, base_url, wiki_version, user_config, reset_embeddings=False):
         self.base_url = base_url
         self.wiki_version = wiki_version
+        self.wikipedia_locs = wikipedia_locs
         self.embeddings = {}
         self.config = self.__get_config(user_config)
@@ -871,6 +882,7 @@ class EntityDisambiguation:
         conll_doc = content[0].get("conll_doc", None)
         for m in content:
             named_cands = [c[0] for c in m["candidates"]]
+            named_cands = [c for c in named_cands if urllib.parse.quote(c.replace("_", " ")) in wikipedia_locs]
             p_e_m = [min(1.0, max(1e-3, c[1])) for c in m["candidates"]]
             try:
Results now contain only location entities (see, for example, that "Brook_trout" is no longer a possible candidate for the mention "Brook"):
>>> print(predictions)
{'example_lwm': [
{'mention': 'Bramshaw', 'prediction': 'Bramshaw', 'candidates': ['Bramshaw', '#UNK#', '#UNK#', '#UNK#', '#UNK#', '#UNK#', '#UNK#'], 'conf_ed': 0.0, 'scores': ['0.4238759', '-0.22848074', '-0.22848074', '-0.22848074', '-0.22848074', '-0.22848074', '-0.22848074']},
{'mention': 'Brook', 'prediction': 'Brook_Island', 'candidates': ['Brook_Island', 'Brook,_Isle_of_Wight', 'Brook,_Kent', 'Brook,_New_Forest', 'Brook,_Surrey', 'Brook,_Indiana', 'Fork_Factory_Brook'], 'conf_ed': 0.0, 'scores': ['0.43120536', '0.41348878', '0.42142522', '0.4111408', '0.4155652', '0.41369182', '0.4096837']},
{'mention': 'Lyndhurst', 'prediction': 'Lyndhurst,_New_Jersey', 'candidates': ['Lyndhurst,_Hampshire', 'Lyndhurst,_Ohio', 'Lyndhurst,_New_Jersey', 'Electoral_district_of_Lyndhurst_(New_South_Wales)', 'Lyndhurst_(mansion)', 'Lyndhurst,_South_Australia', 'Leeds_and_the_Thousand_Islands'], 'conf_ed': 0.0, 'scores': ['0.42008537', '0.41830868', '0.43580437', '0.41616532', '0.41997015', '0.4254934', '0.4104377']},
{'mention': 'Romsey', 'prediction': 'Romsey', 'candidates': ['Romsey', 'Shire_of_Romsey', 'Romsey,_Victoria', 'Romsey_(UK_Parliament_constituency)', 'Romsey_railway_station', 'Romsey_Abbey', 'Mill_Road,_Cambridge'], 'conf_ed': 0.0, 'scores': ['0.45595652', '0.40478042', '0.41509593', '0.4053589', '0.4064365', '0.40317565', '0.41849172']}
]}
>>> print(result)
{'example_lwm': [
(0, 8, 'BRAMSHAW', 'Bramshaw', 0.0, 0.9271187782287598, 'ORG'),
(23, 5, 'Brook', 'Brook_Island', 0.0, 0.969516396522522, 'LOC'),
(54, 9, 'Lyndhurst', 'Lyndhurst,_New_Jersey', 0.0, 0.9971579313278198, 'LOC'),
(78, 6, 'Romsey', 'Romsey', 0.0, 0.9967823028564453, 'LOC')]}
Note that NER may still tag some mentions as non-LOC, but all possible candidates will be locations anyway. In the experiments, we may simply want to filter out named entities that have not been identified as LOC.
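For example, keeping only the mentions tagged as LOC could be done as follows (a minimal sketch over the result tuples shown above):
# Keep only the entities whose NER type is LOC.
loc_results = {
    doc: [r for r in tuples if r[6] == "LOC"]
    for doc, tuples in result.items()
}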
To install it in poetry:
poetry add git+https://github.com/informagi/REL.git#main
Following these instructions.
Step 1: Filtering by Wikipedia title in our gazetteer, in generate_train_test.py:
# Wikipedia to Wikidata to gazetteer filtering
# (requires json, pandas and urllib.parse to be imported in generate_train_test.py):
path = "/resources/wikipedia/extractedResources/"
# Load the Wikipedia-to-Wikidata converter:
with open(path + "wikipedia2wikidata.json", "r") as f:
    wikipedia2wikidata = json.load(f)
with open(path + "wikidata2wikipedia.json", "r") as f:
    wikidata2wikipedia = json.load(f)
# Load the gazetteer:
gaz_df = pd.read_csv(
    "/resources/wikidata/wikidata_gazetteer.csv",
    low_memory=False,
    usecols=["wikidata_id"],
)
# Keep only the Wikipedia entities whose Wikidata id is in the gazetteer:
wkdt_allcands = set(gaz_df["wikidata_id"].tolist())
wikipedia_locs = set(
    dict(filter(lambda x: x[1] in wkdt_allcands, wikipedia2wikidata.items())).keys()
)
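Note that candidate titles come with underscores, while the membership test quotes them with spaces, which suggests the keys of wikipedia2wikidata.json are percent-encoded titles. A minimal illustration of the normalisation:
import urllib.parse

# Candidate titles use underscores; the converter keys appear to be
# percent-encoded titles with spaces, so normalise before the membership test:
title = "Lyndhurst,_Hampshire"
key = urllib.parse.quote(title.replace("_", " "))  # 'Lyndhurst%2C%20Hampshire'
print(key in wikipedia_locs)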
Step 2: Generating the LwM dataset for training and evaluating the ED system, in generate_train_test.py:
def process_lwm(self, dataset_split):
    """
    Preprocesses the LwM dataset in the format that is necessary for training
    and evaluating the local ED model.
    """
    dataset = pd.read_csv(
        "/resources/develop/mcollardanuy/toponym-resolution/experiments/outputs/data/lwm/linking_df_split.tsv",
        sep="\t",
    )
    if dataset_split == "train":
        dataset = dataset[dataset["originalsplit"] == "train"]
    elif dataset_split == "dev":
        dataset = dataset[dataset["originalsplit"] == "dev"]
    elif dataset_split == "test":
        dataset = dataset[dataset["originalsplit"] == "test"]
    dataset["annotations"] = dataset["annotations"].apply(
        lambda x: literal_eval(str(x))
    )
    dataset["sentences"] = dataset["sentences"].apply(
        lambda x: literal_eval(str(x))
    )
    dict_articles = dict()
    for i, row in dataset.iterrows():
        article_id = row["article_id"]
        dict_articles[article_id] = []
        dict_sentences = dict()
        for sentence in row["sentences"]:
            dict_sentences[int(sentence["sentence_pos"])] = sentence[
                "sentence_text"
            ]
        for annotation in row["annotations"]:
            dict_mention = dict()
            dict_mention["mention"] = annotation["mention"]
            sent_idx = int(annotation["sent_pos"])
            dict_mention["sent_idx"] = sent_idx
            dict_mention["sentence"] = dict_sentences[sent_idx]
            # Convert the gold standard Wikidata id to a Wikipedia id (because
            # of redirections, sometimes more than one Wikipedia title is
            # assigned to each Wikidata id; we choose the most frequent one):
            dict_mention["gold"] = "NIL"
            gold_ids = wikidata2wikipedia.get(annotation["wkdt_qid"])
            max_freq = 0
            if gold_ids:
                for k in gold_ids:
                    if k["freq"] > max_freq:
                        max_freq = k["freq"]
                        dict_mention["gold"] = k["title"]
            dict_mention["gold"] = [
                urllib.parse.unquote(dict_mention["gold"]).replace(" ", "_")
            ]
            dict_mention["ngram"] = annotation["mention"]
            dict_mention["context"] = ["", ""]
            if sent_idx - 1 in dict_sentences:
                dict_mention["context"][0] = dict_sentences[sent_idx - 1]
            if sent_idx + 1 in dict_sentences:
                dict_mention["context"][1] = dict_sentences[sent_idx + 1]
            dict_mention["pos"] = annotation["mention_start"]
            dict_mention["end_pos"] = annotation["mention_end"]
            cands = self.get_candidates(annotation["mention"])
            # Filter to candidates that are in the gazetteer:
            cands = [
                c
                for c in cands
                if urllib.parse.quote(c[0].replace("_", " ")) in wikipedia_locs
            ]
            dict_mention["candidates"] = cands
            dict_articles[article_id].append(dict_mention)
    if dataset_split == "train":
        self.__save(dict_articles, "lwm_train")
    if dataset_split == "dev":
        self.__save(dict_articles, "lwm_dev")
    if dataset_split == "test":
        self.__save(dict_articles, "lwm_test")
Step 3: Generate datasets from a new script:
from REL.wikipedia import Wikipedia
from REL.generate_train_test import GenTrainingTest
base_url = "./"
wiki_version = "wiki_2019/"
wikipedia = Wikipedia(base_url, wiki_version)
data_handler = GenTrainingTest(base_url, wiki_version, wikipedia)
for ds in ["train", "dev", "test"]:
    data_handler.process_lwm(ds)
This will produce the following new files in ./REL/wiki_2019/generated/test_train_data:
lwm_dev.pkl
lwm_test.pkl
lwm_train.pkl
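A quick sanity check of the generated files (assuming standard pickles, as the .pkl extension suggests):
import pickle

with open("./REL/wiki_2019/generated/test_train_data/lwm_train.pkl", "rb") as f:
    lwm_train = pickle.load(f)

print(len(lwm_train))  # number of articles in the training split
print(next(iter(lwm_train.values()))[0].keys())  # fields of the first mention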
Following these instructions.
Step 1: Changing the list of datasets to load, in training_datasets.py:
def load(self):
    """
    Loads respective datasets and processes coreferences.
    :return: Returns training/evaluation datasets.
    """
    datasets = {}
    for ds in [
        "lwm_train",
        "lwm_dev",
    ]:
Step 2: Loading the datasets, from a new script:
from REL.training_datasets import TrainingEvaluationDatasets

base_url = "."  # as in the scripts above
wiki_version = "wiki_2019"
datasets = TrainingEvaluationDatasets(base_url, wiki_version).load()
...where:
>>> print(datasets.keys())
dict_keys(['lwm_train', 'lwm_dev'])
>>> print(datasets["lwm_train"].keys())
dict_keys([4428937, 3516947, 10722895, ...]) # the article ids in the training set
>>> print(datasets["lwm_train"][3691281])
[{
'mention': 'Wragby',
'sent_idx': 5,
'sentence': 'On Friday the first stone of a new church at Wragby was laid, with all due formalities. £3000 has been subscribed byvountary contributions for the purpose of building and endowing this edifice, all given by the parishioners of Wragby—Lincoln Gaxette. ',
'gold': [{'title': 'Wragby', 'freq': 33}],
'ngram': 'Wragby',
'context': [
'A pious and benevolent lady, connected with the establishment, lately deceased, has, by her will, left £100,000 as a national legacy, for the express purposes of building churches; and Northampton, it appears, is one of the favoured towns destined to share her munificence. ',
'Wolverhampton Collegiate Church— It is designed to rescue this beautiful and venerable structure, second to none in the county but the cathedral at Lichfield, from the dilapidated state into which it hasfallen; and, with this view, the churchwardens have published an address to the nobility, clergy, and other inhabitants of the town and neighbourhood, soliciting their aid to enable them to accomplish their praiseworthy and desirable object. '
],
'pos': 45,
'end_pos': 51,
'candidates': [
['Wragby', 1.0],
['Wragby_railway_station', 0.952]
]
}, ...]
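As a quick check of candidate coverage, one can compute the fraction of mentions whose gold entity appears among the filtered candidates (a minimal sketch over the structure shown above):
# Gold-candidate recall over the training set.
total, found = 0, 0
for article in datasets["lwm_train"].values():
    for mention in article:
        golds = mention["gold"]
        if golds in ("NIL", ["NIL"]):
            continue
        total += 1
        # Gold entries may be dicts with 'title'/'freq' (as above) or plain titles;
        # underscore/space normalisation may be needed depending on the version.
        gold_titles = {g["title"] if isinstance(g, dict) else g for g in golds}
        cand_titles = {c[0] for c in mention["candidates"]}
        if gold_titles & cand_titles:
            found += 1
print(f"Candidate recall: {found / total:.2%}")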
Step 1: Defining the config
from REL.entity_disambiguation import EntityDisambiguation
config = {
    "mode": "train",
    "model_path": "{}/{}/generated/model".format(base_url, wiki_version),
}
model = EntityDisambiguation(base_url, wiki_version, config)
Step 2: Train the model
# Train the model using lwm_train:
model.train(
    datasets["lwm_train"], {k: v for k, v in datasets.items() if k != "lwm_train"}
)
Step 3: Train and predict using LR (to obtain confidence scores)
model_path_lr = "{}/{}/generated/".format(base_url, wiki_version)
model.train_LR(datasets, model_path_lr)
Step 1: Add the following in entity_disambiguation.py, right after named_cands is instantiated:
named_cands = [
    c
    for c in named_cands
    if urllib.parse.quote(c.replace("_", " ")) in wikipedia_locs
]
Step 2: Then run end-to-end with the new model:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from REL.mention_detection import MentionDetection
from REL.utils import process_results
from REL.entity_disambiguation import EntityDisambiguation
from REL.ner import Cmns, load_flair_ner
base_url = "."
wiki_version = "wiki_2019"
sentence = "Ashton-under-Lyne, Halifax, Sowerby, Sheffield."
def example_preprocessing(sentence):
    # User does some stuff, which results in the format below.
    text = sentence
    processed = {"example_lwm": [text, []]}
    return processed
input_text = example_preprocessing(sentence)
mention_detection = MentionDetection(base_url, wiki_version)
tagger_ner = load_flair_ner("ner-fast")
mentions_dataset, n_mentions = mention_detection.find_mentions(input_text, tagger_ner)
print("mentions_dataset", mentions_dataset)
print("n_mentions", n_mentions)
config = {
    "mode": "eval",
    "model_path": "{}/{}/generated/model".format(base_url, wiki_version),
}
model = EntityDisambiguation(base_url, wiki_version, config)
predictions, timing = model.predict(mentions_dataset)
result = process_results(mentions_dataset, predictions, input_text)
print()
print(input_text)
print(predictions)
print(result)
This is the type of structure you need to set up, by creating the following folders and running specific scripts in specific folders. The base folder for us is rel_db, and all the following folders are inside it:
├── generic
└── lwm_rel_filtered
    ├── basic_data
    │   └── anchor_files
    └── generated
To do so, first of all:
- copy the generic folder from here inside rel_db;
- create the lwm_rel_filtered folder inside rel_db;
- inside lwm_rel_filtered, create the basic_data folder;
- inside basic_data, run the adapted version of WikiExtractor on a Wikipedia XML dump as below:
python {path/to/script/}WikiExtractor.py /resources/wikipedia/enwiki-20211001-pages-articles-multistream27.xml-p68475910p68864378.bz2 --links --filter_disambig_pages --processes 1 --bytes 1G
- create the anchor_files folder inside basic_data;
- move the contents of the text/AA/ folder to anchor_files.
Then run:
from REL.wikipedia import Wikipedia
from REL.wikipedia_yago_freq import WikipediaYagoFreq
wiki_version = "lwm_rel_filtered"
base_url = "/resources/wikipedia/rel_db/"
wikipedia = Wikipedia(base_url, wiki_version)

wiki_yago_freq = WikipediaYagoFreq(base_url, wiki_version, wikipedia)
wiki_yago_freq.compute_wiki()
wiki_yago_freq.compute_custom()
wiki_yago_freq.store()
Thanks @fedenanni! I've moved it one folder up, directly under /resources/, because I'll put all the data needed by REL there.
Changes to REL, assuming it's cloned under experiments/: rel_changes.txt
[step by step overview] Training your own embeddings
The preprocess.sh script does not seem to work directly, so I am executing it step by step.
To install wikipedia2vec, both pip install wikipedia2vec and poetry add wikipedia2vec give the same error, possibly due to incompatibilities with the Python version (see here). I have managed to install it following this guide:
git clone https://github.com/studio-ousia/wikipedia2vec.git
cd wikipedia2vec
pip install Cython
./cythonize.sh
pip install .
Actually no, this only leads to a new error right after: ModuleNotFoundError: No module named 'wikipedia2vec.dictionary'
Apparently wikipedia2vec has serious incompatibility issues with newer versions of Python; trying to install it now with Python 3.5 in a conda environment.
OK, only Python 3.8 seems to be compatible. Created a pyenv where I am trying to set it up.
Done in https://github.com/Living-with-machines/toponym-resolution/tree/adapt-rel; closing this as we're reviewing the whole pipeline in #141.
Settings:
Integration with our pipeline is in issue #72.
At the moment, I'm running the different options from /resources/develop/mcollardanuy/REL in the toponymVM.