openpreserve / jhove

File validation and characterisation.
http://jhove.openpreservation.org
Other
171 stars 79 forks source link

XmlModuleHandler does not correctly resolve relative entities #707

Open pwinckles opened 2 years ago

pwinckles commented 2 years ago

XmlModuleHandler does not correctly resolve relative entities in resolveEntity(). This problem can be easily fixed by setting the systemId on the returned InputSource. The following is a test case that demonstrates the problem (this test will fail):

package edu.harvard.hul.ois.jhove.module;

import edu.harvard.hul.ois.jhove.JhoveBase;
import edu.harvard.hul.ois.jhove.JhoveException;
import edu.harvard.hul.ois.jhove.RepInfo;
import org.junit.Before;
import org.junit.Test;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.nio.file.Paths;

import static org.junit.Assert.assertEquals;

/**
 * Reproduction test for the relative-entity resolution problem reported
 * against {@code XmlModuleHandler.resolveEntity()}.
 *
 * <p>An OAI Dublin Core document is parsed twice by {@link XmlModule}, with
 * {@code http://www.w3.org/2001/03/xml.xsd} mapped to a local copy through
 * the module's {@code schema=} parameter. The test then expects the document
 * to be reported as well-formed and valid.
 */
public class XmlModuleTest {

    /** Minimal OAI Dublin Core record whose schemaLocation points at the remote oai_dc.xsd. */
    private static final String DC_XML = "<oai_dc:dc xmlns:oai_dc=\"http://www.openarchives.org/OAI/2.0/oai_dc/\"" +
            " xmlns:dc=\"http://purl.org/dc/elements/1.1/\"" +
            " xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"" +
            " xsi:schemaLocation=\"http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd\">\n" +
            "  <dc:identifier>1234:example</dc:identifier>\n" +
            "</oai_dc:dc>";

    /** Module under test; rebuilt before every test method. */
    private XmlModule module;

    /**
     * Creates a fresh {@link XmlModule} attached to a new {@link JhoveBase}.
     *
     * @throws JhoveException declared because the JHOVE setup calls may throw it
     */
    @Before
    public void setup() throws JhoveException {
        JhoveBase jhoveBase = new JhoveBase();
        module = new XmlModule();
        module.setBase(jhoveBase);
    }

    /**
     * Parses {@link #DC_XML} in two passes and asserts that the document ends
     * up well-formed and valid once the xml.xsd schema is mapped to a local file.
     */
    @Test
    public void validateXmlSucceedsWhenSchemaReferencesRelativeEntities() {
        RepInfo repInfo = new RepInfo("uri:test");

        // Map the remote xml.xsd to the copy kept under the test resources.
        URI remoteXmlSchema = URI.create("http://www.w3.org/2001/03/xml.xsd");
        Path localXmlSchema =
                Paths.get("src/test/resources/edu/harvard/hul/ois/jhove/module/xml.xsd").toAbsolutePath();
        addSchema(remoteXmlSchema, localXmlSchema);

        // First pass: the test expects the module to ask for one more pass.
        int firstPassIndex = module.parse(stream(DC_XML), repInfo, 0);

        assertEquals(1, firstPassIndex);
        assertEquals(RepInfo.TRUE, repInfo.getWellFormed());

        // Second pass: fed the index returned by the first one.
        int secondPassIndex = module.parse(stream(DC_XML), repInfo, firstPassIndex);

        // Dump every reported message so a failing run shows what the module complained about.
        repInfo.getMessage().forEach(reported ->
                System.out.println(String.format("%s: %s", reported.getMessage(), reported.getSubMessage())));

        assertEquals(0, secondPassIndex);
        assertEquals(RepInfo.TRUE, repInfo.getWellFormed());
        assertEquals(RepInfo.TRUE, repInfo.getValid());
    }

    /**
     * Wraps the given XML text in an in-memory stream of its UTF-8 bytes.
     *
     * @param xml document text to expose as a stream
     * @return a new stream over the UTF-8 encoding of {@code xml}
     */
    private InputStream stream(String xml) {
        byte[] utf8Bytes = xml.getBytes(StandardCharsets.UTF_8);
        return new ByteArrayInputStream(utf8Bytes);
    }

    /**
     * Passes a {@code schema=<source>;<destination>} parameter to the module.
     *
     * @param source      URI of the schema as referenced by documents
     * @param destination local path supplied as the schema's replacement
     */
    private void addSchema(URI source, Path destination) {
        String schemaParam = String.format("schema=%s;%s", source, destination.toString());
        module.param(schemaParam);
    }

}

This test assumes that the schema at http://www.w3.org/2001/03/xml.xsd is saved locally at src/test/resources/edu/harvard/hul/ois/jhove/module/xml.xsd.

Example solution to the problem: https://github.com/UW-Madison-Library/jhove/blob/uwdcc/jhove-modules/xml-hul/src/main/java/edu/harvard/hul/ois/jhove/module/xml/XmlModuleHandler.java#L382