usc-isi-i2 / dig-etl-engine

Download DIG to run on your laptop or server.
http://usc-isi-i2.github.io/dig/
MIT License
101 stars 39 forks source link

website should be populated from `tld` if present #190

Closed szeke closed 6 years ago

szeke commented 6 years ago

In sage we have examples where the `tld` is in one domain and the URL is in a different domain. One example is oefresearch.org, which publishes its files in Google spreadsheets. The website field gets populated as google.com, but it should be oefresearch.org.

szeke commented 6 years ago

Example CDR doc:

{"_index":"sage_kg","_type":"ads","_id":"F389CF6A9C03C14DD4371DEF3FCF8245BAEE5FA0B64D7B85E7F834A2777592AB","_version":1,"found":true,"_source":{"raw_content":"<html><pre>{\n  \"attempt\": 1, \n  \"ccode\": 581, \n  \"country\": \"Comoros\", \n  \"date\": \"1987/11/30\", \n  \"day\": 30, \n  \"month\": 11, \n  \"success\": 0, \n  \"year\": 1987\n}</pre></html>","indexed":{"country":{"provenance_count":1,"high_confidence_keys":["comoros"],"key_count":1,"other_method":{"other_segment":[{"value":"comoros","key":"comoros"}]}},"description":{"provenance_count":1,"high_confidence_keys":["description"],"key_count":1,"other_method":{"content_strict":[{"value":"{   \n \"attempt\": 1,    \n \"ccode\": 581,    \n \"country\": \"Comoros\",    \n \"date\": \"1987/11/30\",    \n \"day\": 30,    \n \"month\": 11,    \n \"success\": 0,    \n \"year\": 1987   \n }","key":"description"}]}},"website":{"provenance_count":1,"high_confidence_keys":["google.com"],"key_count":1,"other_method":{"other_segment":[{"value":"google.com","key":"google.com"}]}},"outcome":{"provenance_count":1,"high_confidence_keys":["0"],"key_count":1,"other_method":{"other_segment":[{"value":"0","key":"0"}]}}},"reign_coups":{"date":"1987/11/30","ccode":581,"country":"Comoros","month":11,"year":1987,"success":0,"title":"Regime change in Comoros in 1987","type":["Event","Regime Change"],"attempt":1,"day":30},"timestamp_crawl":"2018-01-17T07:21:48.825064","knowledge_graph":{"country":[{"provenance":[{"source":{"context":{"start":0,"input":"tokens","end":1,"text":" <etk 'attribute' = 'country'>comoros</etk>  
"},"document_id":"F389CF6A9C03C14DD4371DEF3FCF8245BAEE5FA0B64D7B85E7F834A2777592AB","segment":"other_segment"},"extracted_value":"comoros","method":"extract_using_dictionary","confidence":{"extraction":1.0}}],"value":"comoros","key":"comoros","confidence":1}],"description":[{"provenance":[{"method":"rearrange_description","source":{"document_id":"F389CF6A9C03C14DD4371DEF3FCF8245BAEE5FA0B64D7B85E7F834A2777592AB","segment":"content_strict"}}],"value":"{   \n \"attempt\": 1,    \n \"ccode\": 581,    \n \"country\": \"Comoros\",    \n \"date\": \"1987/11/30\",    \n \"day\": 30,    \n \"month\": 11,    \n \"success\": 0,    \n \"year\": 1987   \n }","key":"description","confidence":1}],"website":[{"provenance":[{"source":{"document_id":"F389CF6A9C03C14DD4371DEF3FCF8245BAEE5FA0B64D7B85E7F834A2777592AB","segment":"url"},"extracted_value":"google.com","method":"extract_website_domain","confidence":{"extraction":1.0}}],"value":"google.com","key":"google.com","confidence":1}],"outcome":[{"provenance":[{"source":{"document_id":"F389CF6A9C03C14DD4371DEF3FCF8245BAEE5FA0B64D7B85E7F834A2777592AB","segment":"content_extraction.reign_coups__success.[0]"},"extracted_value":"0","method":"extract_as_is","confidence":{"extraction":1.0}}],"value":"0","key":"0","confidence":1}]},"document_id":"F389CF6A9C03C14DD4371DEF3FCF8245BAEE5FA0B64D7B85E7F834A2777592AB","content_extraction":{"reign_coups__date":[{"simple_tokens_original_case":["1987","/","11","/","30"],"text":"1987/11/30","simple_tokens":["1987","/","11","/","30"]}],"reign_coups__country":[{"simple_tokens_original_case":["Comoros"],"text":"Comoros","simple_tokens":["comoros"]}],"reign_coups__success":[{"simple_tokens_original_case":["0"],"text":"0","simple_tokens":["0"]}],"content_strict":{"simple_tokens_original_case":["{","\n","\"","attempt","\"",":","1",",","\n","\"","ccode","\"",":","581",",","\n","\"","country","\"",":","\"","Comoros","\"",",","\n","\"","date","\"",":","\"","1987","/","11","/","30","\"",",","\n","\"","day","\""
,":","30",",","\n","\"","month","\"",":","11",",","\n","\"","success","\"",":","0",",","\n","\"","year","\"",":","1987","\n","}"],"text":"{   \n \"attempt\": 1,    \n \"ccode\": 581,    \n \"country\": \"Comoros\",    \n \"date\": \"1987/11/30\",    \n \"day\": 30,    \n \"month\": 11,    \n \"success\": 0,    \n \"year\": 1987   \n }","simple_tokens":["{","\n","\"","attempt","\"",":","1",",","\n","\"","ccode","\"",":","581",",","\n","\"","country","\"",":","\"","comoros","\"",",","\n","\"","date","\"",":","\"","1987","/","11","/","30","\"",",","\n","\"","day","\"",":","30",",","\n","\"","month","\"",":","11",",","\n","\"","success","\"",":","0",",","\n","\"","year","\"",":","1987","\n","}"]},"title":{"simple_tokens_original_case":[],"text":"","simple_tokens":[]},"url":{"simple_tokens_original_case":["https",":","/","/","docs",".","google",".","com","/","spreadsheets","/","d","/","1mrtORyhXw9TJMBYLAGPrikA4VDpla","_","Eq7L","-","NsEQ5VXg","/","edit","#","gid","=","1823216053","#","387"],"text":"https://docs.google.com/spreadsheets/d/1mrtORyhXw9TJMBYLAGPrikA4VDpla_Eq7L-NsEQ5VXg/edit#gid=1823216053#387","simple_tokens":["https",":","/","/","docs",".","google",".","com","/","spreadsheets","/","d","/","1mrtoryhxw9tjmbylagprika4vdpla","_","eq7l","-","nseq5vxg","/","edit","#","gid","=","1823216053","#","387"]}},"type":"sage_kg","tld":"oefresearch.org","doc_id":"F389CF6A9C03C14DD4371DEF3FCF8245BAEE5FA0B64D7B85E7F834A2777592AB","url":"https://docs.google.com/spreadsheets/d/1mrtORyhXw9TJMBYLAGPrikA4VDpla_Eq7L-NsEQ5VXg/edit#gid=1823216053#387","@timestamp":"2018-01-17T07:22:16.101Z","prefilter_filter_outcome":"no_action","@version":"1","@execution_profile":{"@run_core_time":0.6089091300964355,"@worker_id":0,"@doc_sent_time":"2018-01-17T07:22:16.093924","@etk_process_time":0.6088428497314453,"@doc_processed_time":0.6110811233520508,"@doc_length":846,"@etk_start_time":"2018-01-17T07:22:15.485021","@doc_arrived_time":"2018-01-17T07:22:15.482843","@doc_wait_time":0.008261919021606
445,"@etk_end_time":"2018-01-17T07:22:16.093864"}}}
saggu commented 6 years ago

Not sure what should happen in this case; keeping it open for now.

saggu commented 6 years ago

This was fixed a while ago.