usc-isi-i2 / dig-sandpaper

MIT License
4 stars 3 forks source link

Migrate to python3 #7

Closed jasonslepicka closed 6 years ago

jasonslepicka commented 6 years ago

You can test this locally by building the image and changing the tag in the docker etl engine compose file: `docker build -f Dockerfile.local -t sandpaperlocal:latest .`

GreatYYX commented 6 years ago

Thanks, I will test it tomorrow in the etl engine.

GreatYYX commented 6 years ago

@jasonslepicka The etl engine uses sandpaper directly from the PyPI package (`from digsandpaper.elasticsearch_indexing.index_knowledge_graph import index_knowledge_graph_fields`), and 0.1.4-r070 is not synchronized with the latest commit (which fixed the `kg.iteritems()` call in index_knowledge_graph). Can you please release a new dev version?

GreatYYX commented 6 years ago

Alternatively, I can check whether I can mount the sandpaper volume into the etl engine's Python site-packages directory and let it use the local version.

GreatYYX commented 6 years ago

@jasonslepicka never mind, I mounted local sandpaper to the site-packages.

@szeke The provenance in etk2 breaks sandpaper. Previously, sandpaper used the provenance stored in each field (https://github.com/usc-isi-i2/dig-sandpaper/blob/migrate-to-python3/digsandpaper/elasticsearch_indexing/index_knowledge_graph.py#L62). What should it do now: get it from the global provenance structure? Moreover, do we need to update DIGUI, since it also needs to show provenance?

GreatYYX commented 6 years ago

my test etk module:

import os, sys
from etk.etk import ETK
from etk.knowledge_graph import KGSchema
from etk.extractors.glossary_extractor import GlossaryExtractor
from etk.etk_module import ETKModule
import json

class ExampleETKModule(ETKModule):
    """
    Example extraction module: finds glossary names in each project's
    description, stores them on the project as "members", and exposes
    them in the knowledge graph under the "developer" field.
    """
    def __init__(self, etk):
        ETKModule.__init__(self, etk)
        # Names the glossary extractor should recognise.
        team_names = [
            'runqi',
            'sylvia',
            'dongyu',
            'mayank',
            'pedro',
            'amandeep',
            'yixiang',
        ]
        # Single-token (ngrams=1) glossary lookup with case_sensitive=False.
        self.name_extractor = GlossaryExtractor(
            team_names,
            "name_extractor",
            self.etk.default_tokenizer,
            case_sensitive=False,
            ngrams=1,
        )

    def process_document(self, doc):
        """
        Extract member names from every project description in ``doc``
        and add them to the knowledge graph.
        """
        description_segments = doc.select_segments("projects[*].description")
        project_segments = doc.select_segments("projects[*]")

        # Pair each description with a project positionally; this assumes
        # both selections come back in the same order -- TODO confirm.
        for description, project in zip(description_segments, project_segments):
            extracted_names = doc.extract(self.name_extractor, description)
            project.store(extracted_names, "members")

        # Map every stored member value into the "developer" KG field.
        doc.kg.add_doc_value("developer", "projects[*].members[*]")

Output in ES (the provenance snippet was removed in order to make sandpaper work; searching in the field `developer` doesn't work):

{
  "raw_content": "",
  "projects": [
    {
      "name": "etk",
      "description": "version 2 of etk, implemented by Runqi12 Shao, Dongyu Li, Sylvia lin, Amandeep and others.",
      "members": [
        "Dongyu",
        "Sylvia",
        "Amandeep"
      ]
    },
    {
      "name": "rltk",
      "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.",
      "members": [
        "Pedro",
        "Mayank",
        "Yixiang"
      ]
    }
  ],
  "indexed": {
    "developer": {
      "provenance_count": 0,
      "high_confidence_keys": [],
      "key_count": 6
    }
  },
  "timestamp_crawl": "2018-04-26T20:20:33.851707",
  "knowledge_graph": {
    "developer": [
      {
        "value": "Dongyu",
        "key": "dongyu"
      },
      {
        "value": "Sylvia",
        "key": "sylvia"
      },
      {
        "value": "Amandeep",
        "key": "amandeep"
      },
      {
        "value": "Pedro",
        "key": "pedro"
      },
      {
        "value": "Mayank",
        "key": "mayank"
      },
      {
        "value": "Yixiang",
        "key": "yixiang"
      }
    ]
  },
  "type": "etk2_dev",
  "tld": "test.org",
  "doc_id": "test001",
  "url": "www.dig_test_jl.org/test001",
  "@timestamp": "2018-04-26T23:38:47.365Z",
  "@version": "1",
  "provenances": [
    {
      "@id": 0,
      "method": "name_extractor",
      "origin_record": [
        {
          "start_char": 47,
          "end_char": 53,
          "path": "projects.[0].description"
        }
      ],
      "@type": "extraction_provenance_record",
      "confidence": 1
    },
    {
      "@id": 1,
      "method": "name_extractor",
      "origin_record": [
        {
          "start_char": 58,
          "end_char": 64,
          "path": "projects.[0].description"
        }
      ],
      "@type": "extraction_provenance_record",
      "confidence": 1
    },
    {
      "@id": 2,
      "method": "name_extractor",
      "origin_record": [
        {
          "start_char": 70,
          "end_char": 78,
          "path": "projects.[0].description"
        }
      ],
      "@type": "extraction_provenance_record",
      "confidence": 1
    },
    {
      "destination": "projects.[0].members",
      "field": null,
      "extraction_provenance_record_id": [
        0,
        1,
        2
      ],
      "doc_id": null,
      "@type": "storage_provenance_record"
    },
    {
      "@id": 3,
      "method": "name_extractor",
      "origin_record": [
        {
          "start_char": 39,
          "end_char": 44,
          "path": "projects.[1].description"
        }
      ],
      "@type": "extraction_provenance_record",
      "confidence": 1
    },
    {
      "@id": 4,
      "method": "name_extractor",
      "origin_record": [
        {
          "start_char": 46,
          "end_char": 52,
          "path": "projects.[1].description"
        }
      ],
      "@type": "extraction_provenance_record",
      "confidence": 1
    },
    {
      "@id": 5,
      "method": "name_extractor",
      "origin_record": [
        {
          "start_char": 54,
          "end_char": 61,
          "path": "projects.[1].description"
        }
      ],
      "@type": "extraction_provenance_record",
      "confidence": 1
    },
    {
      "destination": "projects.[1].members",
      "field": null,
      "extraction_provenance_record_id": [
        3,
        4,
        5
      ],
      "doc_id": null,
      "@type": "storage_provenance_record"
    }
  ],
  "@execution_profile": {
    "@run_core_time": 0.06945514678955078,
    "@worker_id": 0,
    "@doc_sent_time": "2018-04-26T23:38:47.351163",
    "@doc_length": 521,
    "@doc_processed_time": 0.0702354907989502,
    "@doc_arrived_time": "2018-04-26T23:38:47.280927",
    "@doc_wait_time": 0
  }
}