Closed jasonslepicka closed 6 years ago
Thanks, will test it tomorrow in the ETL engine
@jasonslepicka because etl engine uses the sandpaper from pypi package directly (from digsandpaper.elasticsearch_indexing.index_knowledge_graph import index_knowledge_graph_fields
), 0.1.4-r070
is not synchronized to the latest commit (which fixed the kg.iteritems()
in index_knowledge_graph
), can you please release a new dev version?
Alternatively, I can see whether I can mount the sandpaper volume into the ETL engine's Python site-packages directory and have it use the local version
@jasonslepicka never mind, I mounted local sandpaper to the site-packages.
@szeke the provenance in etk2 breaks sandpaper. sandpaper uses provenance
in each field previously (https://github.com/usc-isi-i2/dig-sandpaper/blob/migrate-to-python3/digsandpaper/elasticsearch_indexing/index_knowledge_graph.py#L62), how should it do now, get from global provenance structure? Moreover, do we need to update DIGUI since it needs to show provenance also.
my test etk module:
import os, sys
from etk.etk import ETK
from etk.knowledge_graph import KGSchema
from etk.extractors.glossary_extractor import GlossaryExtractor
from etk.etk_module import ETKModule
import json
class ExampleETKModule(ETKModule):
    """
    Example extraction module: tags known developer names found in
    project descriptions and records them in the knowledge graph.
    """

    def __init__(self, etk):
        super().__init__(etk)
        # Known developer names to look for in free text.
        known_names = [
            'runqi',
            'sylvia',
            'dongyu',
            'mayank',
            'pedro',
            'amandeep',
            'yixiang',
        ]
        self.name_extractor = GlossaryExtractor(
            known_names,
            "name_extractor",
            self.etk.default_tokenizer,
            case_sensitive=False,
            ngrams=1,
        )

    def process_document(self, doc):
        """
        Run the glossary extractor over every project description,
        store the matches on each project, and promote them into the
        document-level knowledge graph under the "developer" field.
        """
        description_segments = doc.select_segments("projects[*].description")
        project_segments = doc.select_segments("projects[*]")
        for description, project in zip(description_segments, project_segments):
            extracted_names = doc.extract(self.name_extractor, description)
            project.store(extracted_names, "members")
        doc.kg.add_doc_value("developer", "projects[*].members[*]")
output in es (removed provenance snippet in order to make sandpaper work, search in field developer
doesn't work):
{
"raw_content": "",
"projects": [
{
"name": "etk",
"description": "version 2 of etk, implemented by Runqi12 Shao, Dongyu Li, Sylvia lin, Amandeep and others.",
"members": [
"Dongyu",
"Sylvia",
"Amandeep"
]
},
{
"name": "rltk",
"description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.",
"members": [
"Pedro",
"Mayank",
"Yixiang"
]
}
],
"indexed": {
"developer": {
"provenance_count": 0,
"high_confidence_keys": [],
"key_count": 6
}
},
"timestamp_crawl": "2018-04-26T20:20:33.851707",
"knowledge_graph": {
"developer": [
{
"value": "Dongyu",
"key": "dongyu"
},
{
"value": "Sylvia",
"key": "sylvia"
},
{
"value": "Amandeep",
"key": "amandeep"
},
{
"value": "Pedro",
"key": "pedro"
},
{
"value": "Mayank",
"key": "mayank"
},
{
"value": "Yixiang",
"key": "yixiang"
}
]
},
"type": "etk2_dev",
"tld": "test.org",
"doc_id": "test001",
"url": "www.dig_test_jl.org/test001",
"@timestamp": "2018-04-26T23:38:47.365Z",
"@version": "1",
"provenances": [
{
"@id": 0,
"method": "name_extractor",
"origin_record": [
{
"start_char": 47,
"end_char": 53,
"path": "projects.[0].description"
}
],
"@type": "extraction_provenance_record",
"confidence": 1
},
{
"@id": 1,
"method": "name_extractor",
"origin_record": [
{
"start_char": 58,
"end_char": 64,
"path": "projects.[0].description"
}
],
"@type": "extraction_provenance_record",
"confidence": 1
},
{
"@id": 2,
"method": "name_extractor",
"origin_record": [
{
"start_char": 70,
"end_char": 78,
"path": "projects.[0].description"
}
],
"@type": "extraction_provenance_record",
"confidence": 1
},
{
"destination": "projects.[0].members",
"field": null,
"extraction_provenance_record_id": [
0,
1,
2
],
"doc_id": null,
"@type": "storage_provenance_record"
},
{
"@id": 3,
"method": "name_extractor",
"origin_record": [
{
"start_char": 39,
"end_char": 44,
"path": "projects.[1].description"
}
],
"@type": "extraction_provenance_record",
"confidence": 1
},
{
"@id": 4,
"method": "name_extractor",
"origin_record": [
{
"start_char": 46,
"end_char": 52,
"path": "projects.[1].description"
}
],
"@type": "extraction_provenance_record",
"confidence": 1
},
{
"@id": 5,
"method": "name_extractor",
"origin_record": [
{
"start_char": 54,
"end_char": 61,
"path": "projects.[1].description"
}
],
"@type": "extraction_provenance_record",
"confidence": 1
},
{
"destination": "projects.[1].members",
"field": null,
"extraction_provenance_record_id": [
3,
4,
5
],
"doc_id": null,
"@type": "storage_provenance_record"
}
],
"@execution_profile": {
"@run_core_time": 0.06945514678955078,
"@worker_id": 0,
"@doc_sent_time": "2018-04-26T23:38:47.351163",
"@doc_length": 521,
"@doc_processed_time": 0.0702354907989502,
"@doc_arrived_time": "2018-04-26T23:38:47.280927",
"@doc_wait_time": 0
}
}
You can test this locally by building the image and changing the tag in the docker etl engine compose file
docker build -f Dockerfile.local -t sandpaperlocal:latest .