sul-dlss-deprecated / rialto-etl

ETL tools for RIALTO, Stanford Libraries' research intelligence project
https://library.stanford.edu/projects/rialto
Apache License 2.0
3 stars 0 forks source link

Transform DOIs with <> #338

Closed justinlittman closed 5 years ago

justinlittman commented 5 years ago

Transforming a publication with a DOI containing < or > results in an error.

Here is an example publication that produces this error:

{
    "UID": "WOS:A1982NQ88900015",
    "static_data": {
        "summary": {
            "pub_info": {
                "coverdate": 1982,
                "vol": 49,
                "pubyear": 1982,
                "issue": 11,
                "sortdate": "1982-01-01",
                "has_abstract": "N",
                "pubtype": "Journal",
                "page": {
                    "end": 2299,
                    "begin": 2295,
                    "page_count": 5,
                    "content": "2295-2299"
                }
            },
            "names": {
                "count": 4,
                "name": [{
                    "seq_no": 1,
                    "role": "author",
                    "full_name": "CLARKE, D",
                    "last_name": "CLARKE",
                    "display_name": "CLARKE, D",
                    "wos_standard": "CLARKE, D",
                    "daisng_id": 4971073,
                    "first_name": "D"
                }, {
                    "seq_no": 2,
                    "role": "author",
                    "full_name": "MARTINEZ, A",
                    "last_name": "MARTINEZ",
                    "display_name": "MARTINEZ, A",
                    "wos_standard": "MARTINEZ, A",
                    "daisng_id": 13458368,
                    "first_name": "A"
                }, {
                    "seq_no": 3,
                    "role": "author",
                    "full_name": "COX, RS",
                    "last_name": "COX",
                    "display_name": "COX, RS",
                    "wos_standard": "COX, RS",
                    "daisng_id": 291086,
                    "first_name": "RS"
                }, {
                    "seq_no": 4,
                    "role": "author",
                    "full_name": "GOFFINET, DR",
                    "last_name": "GOFFINET",
                    "display_name": "GOFFINET, DR",
                    "wos_standard": "GOFFINET, DR",
                    "daisng_id": 196091,
                    "first_name": "DR"
                }]
            },
            "doctypes": {
                "doctype": "Article",
                "count": 1
            },
            "publishers": {
                "publisher": {
                    "names": {
                        "count": 1,
                        "name": {
                            "seq_no": 1,
                            "role": "publisher",
                            "full_name": "WILEY-LISS",
                            "addr_no": 1,
                            "display_name": "WILEY-LISS"
                        }
                    },
                    "address_spec": {
                        "city": "NEW YORK",
                        "addr_no": 1,
                        "full_address": "DIV JOHN WILEY & SONS INC, 605 THIRD AVE, NEW YORK, NY 10158-0012"
                    }
                }
            },
            "EWUID": {
                "WUID": {
                    "coll_id": "WOS"
                },
                "edition": {
                    "value": "WOS.SCI"
                }
            },
            "titles": {
                "count": 6,
                "title": [{
                    "type": "source",
                    "content": "CANCER"
                }, {
                    "type": "source_abbrev",
                    "content": "CANCER"
                }, {
                    "type": "abbrev_iso",
                    "content": "Cancer"
                }, {
                    "type": "abbrev_11",
                    "content": "CANCER"
                }, {
                    "type": "abbrev_29",
                    "content": "CANCER"
                }, {
                    "type": "item",
                    "content": "BREAST EDEMA FOLLOWING STAGING AXILLARY NODE DISSECTION IN PATIENTS WITH BREAST-CARCINOMA TREATED BY RADICAL RADIOTHERAPY"
                }]
            }
        },
        "item": {
            "xsi:type": "itemType_wos",
            "coll_id": "WOS",
            "ids": {
                "avail": "Y",
                "content": "NQ889"
            },
            "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
            "bib_id": "49 (11): 2295-2299 1982"
        },
        "fullrecord_metadata": {
            "addresses": {
                "count": 1,
                "address_name": {
                    "address_spec": {
                        "zip": {
                            "location": "AP",
                            "content": 94305
                        },
                        "country": "USA",
                        "city": "STANFORD",
                        "street": "PAUL A BISSINGER MEM CTR RADIAT THERAPY, SCH MED, DEPT RADIOL",
                        "addr_no": 1,
                        "organizations": {
                            "organization": ["STANFORD UNIV", {
                                "pref": "Y",
                                "content": "Stanford University"
                            }],
                            "count": 2
                        },
                        "full_address": "STANFORD UNIV,PAUL A BISSINGER MEM CTR RADIAT THERAPY,SCH MED,DEPT RADIOL,STANFORD,CA 94305",
                        "state": "CA"
                    }
                }
            },
            "category_info": {
                "subheadings": {
                    "count": 1,
                    "subheading": "Life Sciences & Biomedicine"
                },
                "subjects": {
                    "subject": [{
                        "ascatype": "traditional",
                        "code": "DM",
                        "content": "Oncology"
                    }, {
                        "ascatype": "extended",
                        "content": "Oncology"
                    }],
                    "count": 2
                },
                "headings": {
                    "heading": "Science & Technology",
                    "count": 1
                }
            },
            "normalized_languages": {
                "count": 1,
                "language": {
                    "type": "primary",
                    "content": "English"
                }
            },
            "languages": {
                "count": 1,
                "language": {
                    "type": "primary",
                    "content": "English"
                }
            },
            "refs": {
                "count": 19
            },
            "fund_ack": {
                "grants": {
                    "count": 1,
                    "grant": {
                        "grant_ids": {
                            "grant_id": "CA-05838-19",
                            "count": 1
                        },
                        "grant_source": "Medline",
                        "grant_agency": "NCI NIH HHS"
                    }
                }
            },
            "normalized_doctypes": {
                "doctype": "Article",
                "count": 1
            }
        }
    },
    "r_id_disclaimer": "ResearcherID data provided by Clarivate Analytics",
    "dynamic_data": {
        "citation_related": {
            "tc_list": {
                "silo_tc": {
                    "coll_id": "WOS",
                    "local_count": 79
                }
            }
        },
        "cluster_related": {
            "identifiers": {
                "identifier": [{
                    "type": "issn",
                    "value": "0008-543X"
                }, {
                    "type": "xref_doi",
                    "value": "10.1002/1097-0142(19820601)49:11<2295::AID-CNCR2820491116>3.0.CO;2-G"
                }, {
                    "type": "pmid",
                    "value": "MEDLINE:7074546"
                }]
            }
        }
    }
}

This results in:

ERROR Statement #<RDF::Statement:0x44bafc4(<http://sul.stanford.edu/rialto/publications/88a14d7e3a5c02f94fa86cc4a58a55cc> <http://purl.org/ontology/bibo/doi> <https://doi.org/10.1002/1097-0142(19820601)49:11\u003C2295::AID-CNCR2820491116\u003E3.0.CO;2-G> .)> is invalid
Traceback (most recent call last):
        41: from exe/transform:8:in `<main>'
        40: from /opt/app/rialto/rialto/shared/bundle/ruby/2.5.0/gems/thor-0.20.3/lib/thor/base.rb:466:in `start'
        39: from /opt/app/rialto/rialto/shared/bundle/ruby/2.5.0/gems/thor-0.20.3/lib/thor.rb:387:in `dispatch'
        38: from /opt/app/rialto/rialto/shared/bundle/ruby/2.5.0/gems/honeybadger-4.1.0/lib/honeybadger/plugins/thor.rb:17:in `invoke_command_with_honeybadger'
        37: from /opt/app/rialto/rialto/shared/bundle/ruby/2.5.0/gems/thor-0.20.3/lib/thor/invocation.rb:126:in `invoke_command'
        36: from /opt/app/rialto/rialto/shared/bundle/ruby/2.5.0/gems/thor-0.20.3/lib/thor/command.rb:27:in `run'
...
         6: from /opt/app/rialto/rialto/shared/bundle/ruby/2.5.0/gems/rdf-3.0.6/lib/rdf/writer.rb:197:in `buffer'
         5: from /opt/app/rialto/rialto/shared/bundle/ruby/2.5.0/gems/rdf-3.0.6/lib/rdf/writer.rb:197:in `open'
         4: from /opt/app/rialto/rialto/shared/bundle/ruby/2.5.0/gems/rdf-3.0.6/lib/rdf/writer.rb:198:in `block in buffer'
         3: from /opt/app/rialto/rialto/shared/bundle/ruby/2.5.0/gems/rdf-3.0.6/lib/rdf/writer.rb:198:in `new'
         2: from /opt/app/rialto/rialto/shared/bundle/ruby/2.5.0/gems/rdf-3.0.6/lib/rdf/ntriples/writer.rb:193:in `initialize'
         1: from /opt/app/rialto/rialto/shared/bundle/ruby/2.5.0/gems/rdf-3.0.6/lib/rdf/writer.rb:273:in `initialize'
/opt/app/rialto/rialto/shared/bundle/ruby/2.5.0/gems/rdf-3.0.6/lib/rdf/writer.rb:403:in `write_epilogue': Errors found during processing (RDF::WriterError)

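Prior to writing the SPARQL statement, the writer checks whether the statement is valid. Here the check fails because `<` and `>` are not permitted characters in a URI, so the DOI-based URI object is rejected.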

justinlittman commented 5 years ago

I think it is an error to treat a DOI as a URI, instead of a literal, as we do here:

to_field RDF::Vocab::BIBO.doi.to_s, lambda { |json, accumulator|
  doi = JsonPath.on(json, '$.dynamic_data.cluster_related.identifiers.identifier[?(@.type=="doi")].value').first ||
        JsonPath.on(json, '$.dynamic_data.cluster_related.identifiers.identifier[?(@.type=="xref_doi")].value').first
  accumulator << RDF::URI("https://doi.org/#{doi}") if doi
}, single: true

Based on BIBO, it seems this should be a literal:

    <!-- http://purl.org/ontology/bibo/doi -->

    <owl:DatatypeProperty rdf:about="&bibo;doi">
        <rdfs:subPropertyOf rdf:resource="&bibo;identifier"/>
        <rdfs:range rdf:resource="&rdfs;Literal"/>
        <rdfs:domain>
            <owl:Class>
                <owl:unionOf rdf:parseType="Collection">
                    <rdf:Description rdf:about="&bibo;Collection"/>
                    <rdf:Description rdf:about="&bibo;Document"/>
                </owl:unionOf>
            </owl:Class>
        </rdfs:domain>
    </owl:DatatypeProperty>

I would propose that we change it to a literal in the transform and then make any necessary changes in webapp to create a link from it.

mjgiarlo commented 5 years ago

@justinlittman :+1: That makes complete sense.