Open nicolauduran45 opened 1 week ago
This is a function I was using:
import numpy as np
def combine_entities(entities):
    """Merge adjacent entities that look like two fragments of one word.

    Two consecutive entities are merged when the first one's ``'word'`` ends
    with a lowercase character and the second one's ``'word'`` starts with a
    lowercase character. The merged entity takes ``'start'`` from the first
    fragment, ``'end'`` and ``'entity_group'`` from the second, the mean of
    both ``'score'`` values, and the concatenation of both ``'word'`` strings.

    Merging is pairwise only: once two entities are merged, the result is not
    compared against the entity that follows it.

    Args:
        entities: Sequence of dicts providing the keys ``'word'``,
            ``'start'``, ``'end'``, ``'entity_group'`` and ``'score'``.

    Returns:
        A 1-D ``numpy`` object array of entity dicts on success. If the input
        cannot be processed (for example a missing key or a non-sequence
        argument), ``entities`` is returned unchanged as a best-effort
        fallback.
    """
    try:
        combined_entities = []
        total = len(entities)
        i = 0
        while i < total:
            current_entity = entities[i]
            # Only look ahead when there is a following entity to merge with.
            if i < total - 1:
                next_entity = entities[i + 1]
                # Slices instead of [-1] / [0]: an empty word yields '' (for
                # which islower() is False) rather than raising IndexError,
                # which used to abort merging for the whole input.
                if (current_entity['word'][-1:].islower() and
                        next_entity['word'][:1].islower()):
                    combined_entities.append({
                        'start': current_entity['start'],
                        'end': next_entity['end'],
                        'entity_group': next_entity['entity_group'],
                        'score': (current_entity['score'] + next_entity['score']) / 2,
                        'word': current_entity['word'] + next_entity['word'],
                    })
                    # Skip the next entity since it has been consumed.
                    i += 2
                    continue
            # No merge occurred: keep the current entity as-is.
            combined_entities.append(current_entity)
            i += 1
        return np.array(combined_entities, dtype=object)
    except Exception:
        # Deliberate best-effort: malformed input is handed back untouched.
        # Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # SystemExit propagate.
        return entities
to obtain something like this, combining ORG and POSTALCODE:
array([{'end': 44, 'entity_group': 'ORG', 'score': 0.9665974378585815, 'start': 0, 'word': 'IREC—Catalonia Institute for Energy Research'},
{'end': 80, 'entity_group': 'ADDRESS', 'score': 0.9999188184738159, 'start': 46, 'word': 'C. Jardins de les Dones de Negre 1'},
{'end': 87, 'entity_group': 'POSTALCODE', 'score': 0.9996729493141174, 'start': 82, 'word': '08930'},
{'end': 108, 'entity_group': 'CITY', 'score': 0.9998738765716553, 'start': 88, 'word': 'Sant Adrià del Besòs'},
{'start': 110, 'end': 119, 'entity_group': 'REGION', 'score': 0.998821884393692, 'word': 'Barcelona'},
{'end': 127, 'entity_group': 'COUNTRY', 'score': 0.9999210834503174, 'start': 122, 'word': 'Spain'}],
dtype=object)
To process NER output, for example this one,
to merge wrongly parsed entities (here POSTALCODE) and to remove entities that are only punctuation (here "(" in REGION)