shepmaster / sxd-xpath

An XPath library in Rust
Apache License 2.0
121 stars 34 forks source link

This might be a bug or, more likely, something I don't understand :) #133

Open JacobSandin opened 4 years ago

JacobSandin commented 4 years ago

OK, so I'm trying to parse MARC 21 XML records. I should mention that I'm doing this to learn Rust and MARC, which are both quite complicated.

However, I've got two examples: one that works, and one where I change this:

    <record
            xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
            xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd"
            xmlns="http://www.loc.gov/MARC21/slim">    

to simply `<record>`

The latter works and the first one just returns empty:

Example1:

/// Reproduction case 1: a MARC 21 record whose root `<record>` element declares
/// the default namespace `http://www.loc.gov/MARC21/slim`.
///
/// Parses the embedded XML, evaluates the XPath `/record/leader` against the
/// document, and prints the result's string value followed by its `Debug` form.
///
/// With this input the reported output is an empty nodeset: the element names
/// in the XPath carry no prefix, while every element in the document inherits
/// the default namespace from `<record>`, so the unprefixed name tests do not
/// match them.
///
/// # Panics
///
/// Panics if the XML fails to parse or if XPath evaluation returns an error.
fn do_something_with_metadata() {

    let package = parser::parse(r#"<?xml version="1.0" encoding="UTF-8"?>
    <record
            xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
            xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd"
            xmlns="http://www.loc.gov/MARC21/slim">    

      <leader>00446nim a22001813  45  </leader>
      <controlfield tag="001">11469144</controlfield>
      <controlfield tag="003">SE-LIBR</controlfield>
      <controlfield tag="005">20171005164357.0</controlfield>
      <controlfield tag="007">sd|||||||||||||||||||||</controlfield>
      <controlfield tag="008">091027s2001    xx |||_j_|||||_|| _|swe||</controlfield>
      <datafield tag="035" ind1=" " ind2=" ">
        <subfield code="a">(LibraSE)24008</subfield>
      </datafield>
      <datafield tag="084" ind1=" " ind2=" ">
        <subfield code="a">Yq</subfield>
      </datafield>
      <datafield tag="245" ind1="1" ind2="0">
        <subfield code="a">Astrid Lindgrens favoriter: CD 1 Sånger</subfield>
      </datafield>
      <datafield tag="260" ind1=" " ind2=" ">
        <subfield code="a">Stockholm :</subfield>
        <subfield code="b">Bonnier Music,</subfield>
        <subfield code="c">2001</subfield>
      </datafield>
      <datafield tag="300" ind1=" " ind2=" ">
        <subfield code="a">1 CD</subfield>
      </datafield>
      <datafield tag="653" ind1=" " ind2=" ">
        <subfield code="a">CD-bok</subfield>
      </datafield>
      <datafield tag="653" ind1=" " ind2=" ">
        <subfield code="a">Barn och ungdom</subfield>
      </datafield>
      <datafield tag="852" ind1=" " ind2=" ">
        <subfield code="h">Hcf/LC</subfield>
        <subfield code="l">AST</subfield>
      </datafield>
      <datafield tag="942" ind1=" " ind2=" ">
        <subfield code="c">BARN LJUD</subfield>
      </datafield>
      <datafield tag="999" ind1=" " ind2=" ">
        <subfield code="c">9782</subfield>
        <subfield code="d">9782</subfield>
      </datafield>
    </record>"#).expect("failed to parse XML");
    let document = package.as_document();
    // The path uses unprefixed names and no namespace bindings are supplied,
    // so it cannot select the namespaced `<record>` / `<leader>` elements above.
    let value = evaluate_xpath(&document, "/record/leader").expect("XPath evaluation failed");

    // Prints the string value first, then the `Debug` representation in parentheses.
    println!("Found: {}({:?})", value.string(),value);

}

Output:

Found: (Nodeset(Nodeset { nodes: {} }))
Found: (Nodeset(Nodeset { nodes: {} }))
Found: (Nodeset(Nodeset { nodes: {} }))
Found: (Nodeset(Nodeset { nodes: {} }))
Found: (Nodeset(Nodeset { nodes: {} }))

Example2:

/// Reproduction case 2: the same MARC 21 record, but with a bare `<record>`
/// root element that declares no namespaces.
///
/// Parses the embedded XML, evaluates the XPath `/record/leader` against the
/// document, and prints the result's string value followed by its `Debug` form.
///
/// With this input the reported output contains the `<leader>` element, whose
/// qualified name has `namespace_uri: None`; the unprefixed names in the XPath
/// therefore match.
///
/// # Panics
///
/// Panics if the XML fails to parse or if XPath evaluation returns an error.
fn do_something_with_metadata() {

    let package = parser::parse(r#"<?xml version="1.0" encoding="UTF-8"?>
    <record>

      <leader>00446nim a22001813  45  </leader>
      <controlfield tag="001">11469144</controlfield>
      <controlfield tag="003">SE-LIBR</controlfield>
      <controlfield tag="005">20171005164357.0</controlfield>
      <controlfield tag="007">sd|||||||||||||||||||||</controlfield>
      <controlfield tag="008">091027s2001    xx |||_j_|||||_|| _|swe||</controlfield>
      <datafield tag="035" ind1=" " ind2=" ">
        <subfield code="a">(LibraSE)24008</subfield>
      </datafield>
      <datafield tag="084" ind1=" " ind2=" ">
        <subfield code="a">Yq</subfield>
      </datafield>
      <datafield tag="245" ind1="1" ind2="0">
        <subfield code="a">Astrid Lindgrens favoriter: CD 1 Sånger</subfield>
      </datafield>
      <datafield tag="260" ind1=" " ind2=" ">
        <subfield code="a">Stockholm :</subfield>
        <subfield code="b">Bonnier Music,</subfield>
        <subfield code="c">2001</subfield>
      </datafield>
      <datafield tag="300" ind1=" " ind2=" ">
        <subfield code="a">1 CD</subfield>
      </datafield>
      <datafield tag="653" ind1=" " ind2=" ">
        <subfield code="a">CD-bok</subfield>
      </datafield>
      <datafield tag="653" ind1=" " ind2=" ">
        <subfield code="a">Barn och ungdom</subfield>
      </datafield>
      <datafield tag="852" ind1=" " ind2=" ">
        <subfield code="h">Hcf/LC</subfield>
        <subfield code="l">AST</subfield>
      </datafield>
      <datafield tag="942" ind1=" " ind2=" ">
        <subfield code="c">BARN LJUD</subfield>
      </datafield>
      <datafield tag="999" ind1=" " ind2=" ">
        <subfield code="c">9782</subfield>
        <subfield code="d">9782</subfield>
      </datafield>
    </record>"#).expect("failed to parse XML");
    let document = package.as_document();
    // No namespace is declared on the root here, so the unprefixed path
    // selects `<record>` and then its `<leader>` child.
    let value = evaluate_xpath(&document, "/record/leader").expect("XPath evaluation failed");

    // Prints the string value first, then the `Debug` representation in parentheses.
    println!("Found: {}({:?})", value.string(),value);

}

Returns:

Found: 00446nim a22001813  45  (Nodeset(Nodeset { nodes: {Element(Element { name: QName { namespace_uri: None, local_part: "leader" } })} }))
Found: 00446nim a22001813  45  (Nodeset(Nodeset { nodes: {Element(Element { name: QName { namespace_uri: None, local_part: "leader" } })} }))
Found: 00446nim a22001813  45  (Nodeset(Nodeset { nodes: {Element(Element { name: QName { namespace_uri: None, local_part: "leader" } })} }))
Found: 00446nim a22001813  45  (Nodeset(Nodeset { nodes: {Element(Element { name: QName { namespace_uri: None, local_part: "leader" } })} }))
Found: 00446nim a22001813  45  (Nodeset(Nodeset { nodes: {Element(Element { name: QName { namespace_uri: None, local_part: "leader" } })} }))

I'm not sure why it doesn't like that part of the record element; it just doesn't :)

Kind regards /Jacob

tim-weis commented 4 years ago

Glad I'm not the only one who tripped over XML namespaces. You'll have to set up a Context and assign the appropriate namespace(s) to it. Something like the following should work:

use sxd_document::parser;
use sxd_xpath::{Context, Factory};

/// Shows how to query a document whose elements live in a default XML
/// namespace.
///
/// The root `<record>` declares `xmlns="http://www.loc.gov/MARC21/slim"`, so
/// both it and its `<leader>` child belong to that namespace. To select them,
/// the XPath uses a prefix (`ns`) and the evaluation [`Context`] binds that
/// prefix to the same namespace URI. The result's string value and `Debug`
/// form are printed to stdout.
///
/// # Panics
///
/// Panics with a descriptive message if the XML fails to parse, if the XPath
/// fails to compile or yields no expression, or if evaluation fails.
fn main() {
    let package = parser::parse(r#"<?xml version="1.0" encoding="UTF-8"?>
    <record
            xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
            xsi:schemaLocation="http://www.loc.gov/MARC21/slim
                                http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd"
            xmlns="http://www.loc.gov/MARC21/slim">    

      <leader>00446nim a22001813  45  </leader>
    </record>"#).expect("failed to parse XML");

    let document = package.as_document();

    // `build` yields a `Result` wrapping an `Option`, hence the two `expect`s.
    // NOTE(review): the `None` case presumably means no expression was
    // produced (e.g. an empty XPath string) — confirm against the crate docs.
    let factory = Factory::new();
    let xpath = factory
        .build("/ns:record/ns:leader")
        .expect("failed to compile XPath")
        .expect("no XPath was compiled");

    // Bind the prefix used in the XPath to the document's default namespace.
    // What has to match the document is the URI; `ns` only needs to agree
    // with the prefix written in the expression above.
    let mut context = Context::new();
    context.set_namespace("ns", "http://www.loc.gov/MARC21/slim");

    let value = xpath
        .evaluate(&context, document.root())
        .expect("XPath evaluation failed");

    // Prints the string value first, then the `Debug` representation in parentheses.
    println!("Found: {}({:?})", value.string(), value);
}

That returns

Found: 00446nim a22001813  45  (Nodeset(Nodeset { nodes: {Element(Element { name: QName { namespace_uri: Some("http://www.loc.gov/MARC21/slim"), local_part: "leader" } })} }))