FasterXML / woodstox

The gold standard Stax XML API implementation. Now at Github.
Apache License 2.0
220 stars 81 forks source link

Woodstox should provide an option to leave predefined entities, i.e. "&lt;", "&gt;", "&apos;", "&quot;" and "&amp;", as they are #177

Closed himanshuatgit closed 5 months ago

himanshuatgit commented 11 months ago
  1. It is observed that while parsing, Woodstox always converts the predefined entities, i.e. "&lt;", "&gt;", "&apos;", "&quot;" and "&amp;", to their respective character counterparts, i.e. <, >, ', " and &.
  2. Although this conversion seems compulsory, there should be an option to leave these 5 predefined entities as they are.
  3. User should have liberty to utilize this feature in a customized way.
  4. If there is any other way this can be done, request you to share the relevant document.
package com.example.woodstox;

import org.codehaus.stax2.XMLInputFactory2;
import org.codehaus.stax2.XMLStreamReader2;

import javax.xml.stream.XMLStreamException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

/**
 * Small Stax2-based example that reads a {@code <schools>} document into
 * {@link School} objects and prints the first one.
 *
 * <p>The cursor is driven by {@link #traverseUntilTrue}, which keeps advancing
 * the reader until a caller-supplied {@link Condition} reports success.
 */
public class Main {

    /**
     * Parses a hard-coded sample document and prints the first school found.
     *
     * @param args unused
     * @throws XMLStreamException if the sample document cannot be parsed
     */
    public static void main(String[] args) throws XMLStreamException {
        String text = "<schools>\n" +
                "\t<school>\n" +
                "\t\t<name>Test School</name>\n" +
                "\t\t<comment>&lt;!--TEST COMMENT--&gt;</comment>\n" +
                "\t</school>\n" +
                "</schools>";

        Reader reader = new StringReader(text);
        System.out.println(parse(reader).get(0).toString());
    }

    /**
     * Reads every {@code <school>} element found inside the {@code <schools>}
     * element of the given document.
     *
     * @param reader source of the XML document
     * @return the schools in document order; empty if {@code <schools>} has no
     *         {@code <school>} children
     * @throws XMLStreamException    if the underlying parser reports an error
     * @throws IllegalStateException if the document ends before the expected
     *                               start or end tags are seen
     */
    public static List<? extends School> parse(Reader reader) throws XMLStreamException {

        XMLInputFactory2 xmlInputFactory = (XMLInputFactory2) XMLInputFactory2.newFactory();
        // Created before the try block: if creation itself fails there is nothing to close.
        final XMLStreamReader2 xmlStreamReader =
                (XMLStreamReader2) xmlInputFactory.createXMLStreamReader(reader);

        try {
            // Skip everything up to the opening <schools> tag.
            traverseUntilTrue(xmlStreamReader, sr -> isStartOfTag(sr, "schools"));

            final List<School> schools = new ArrayList<>();
            // Scratch buffer shared by all fields; processTag resets it before each use.
            final StringBuilder sb = new StringBuilder();

            traverseUntilTrue(xmlStreamReader, sr -> {
                if (isEndOfTag(sr, "schools")) {
                    return true;
                }
                if (isStartOfTag(sr, "school")) {
                    School school = new School();
                    // Step past <school> itself so that the inner traversal does not
                    // treat it as one of its own field elements.
                    sr.next();
                    traverseUntilTrue(sr, inner -> {
                        if (isEndOfTag(inner, "school")) {
                            return true;
                        }
                        if (isStartOfTag(inner)) {
                            processTag(inner, sb, school);
                        }
                        return false;
                    });
                    schools.add(school);
                }
                return false;
            });
            return schools;

        } finally {
            xmlStreamReader.close();
        }
    }

    /**
     * Collects the text content of the element the reader is currently positioned
     * on and hands it to {@code builder} together with the element's local name.
     *
     * <p>Text is gathered until the end tag that matches the current element, so
     * text inside nested child elements, and text following them, is included.
     * Both {@code CHARACTERS} and {@code CDATA} events contribute text. On return
     * the reader is positioned on the end tag at which collection stopped.
     *
     * @param streamReader reader positioned on the start tag of the element to read
     * @param sb           scratch buffer; cleared first, then filled with the text
     * @param builder      receiver of the element name and collected text
     * @throws XMLStreamException    if the underlying parser reports an error
     * @throws IllegalStateException if the document ends before the element is closed
     */
    public static void processTag(XMLStreamReader2 streamReader, StringBuilder sb, Builder builder) throws XMLStreamException {

        final String tagName = streamReader.getLocalName();
        sb.setLength(0);

        // Count of currently open elements, starting with the element being processed.
        // Held in an array so the lambda can update it.
        final int[] depth = {0};

        traverseUntilTrue(streamReader, sr -> {
            switch (sr.getEventType()) {
                case XMLStreamReader2.START_ELEMENT:
                    // The first event seen is the start tag of the element itself.
                    depth[0]++;
                    break;
                case XMLStreamReader2.END_ELEMENT:
                    // Finish only on the end tag that balances the first start tag.
                    // "<=" also covers being invoked directly on an end tag.
                    if (--depth[0] <= 0) {
                        builder.schoolBuilder(tagName, sb);
                        return true;
                    }
                    break;
                case XMLStreamReader2.CHARACTERS:
                case XMLStreamReader2.CDATA:
                    // Depending on parser configuration, CDATA sections may arrive as
                    // separate CDATA events; their text belongs to the element as well.
                    sb.append(sr.getText());
                    break;
                default:
                    // Comments, processing instructions etc. carry no field text.
                    break;
            }
            return false;
        });
    }

    /**
     * Tells whether the reader is currently positioned on any start tag.
     *
     * @param sr reader to inspect
     * @return {@code true} if the current event is {@code START_ELEMENT}
     */
    public static boolean isStartOfTag(XMLStreamReader2 sr) {
        return XMLStreamReader2.START_ELEMENT == sr.getEventType();
    }

    /**
     * Tells whether the reader is currently positioned on a start tag whose local
     * name equals {@code tagName}, ignoring case.
     *
     * @param sr      reader to inspect
     * @param tagName expected local name
     * @return {@code true} on a matching start tag
     */
    public static boolean isStartOfTag(XMLStreamReader2 sr, String tagName) {
        return XMLStreamReader2.START_ELEMENT == sr.getEventType() && tagName.equalsIgnoreCase(sr.getLocalName());
    }

    /**
     * Tells whether the reader is currently positioned on an end tag whose local
     * name equals {@code tagName}, ignoring case.
     *
     * @param sr      reader to inspect
     * @param tagName expected local name
     * @return {@code true} on a matching end tag
     */
    public static boolean isEndOfTag(XMLStreamReader2 sr, String tagName) {
        return XMLStreamReader2.END_ELEMENT == sr.getEventType() && tagName.equalsIgnoreCase(sr.getLocalName());
    }

    /**
     * Advances the reader until {@code condition} holds.
     *
     * <p>The condition is evaluated on the current event first, so the reader is
     * not moved at all when the condition is already satisfied.
     *
     * @param sr        reader to advance
     * @param condition check evaluated on every event; may itself move the reader
     * @throws XMLStreamException    if the reader or the condition reports an error
     * @throws IllegalStateException if the reader runs out of events before the
     *                               condition holds
     */
    public static void traverseUntilTrue(XMLStreamReader2 sr, Condition condition) throws XMLStreamException {
        while (!condition.verifyCondition(sr)) {
            if (!sr.hasNext()) {
                throw new IllegalStateException("Xml document ended without satisfying given condition");
            }
            sr.next();
        }
    }

    /** Predicate over the current reader state that is allowed to throw {@link XMLStreamException}. */
    @FunctionalInterface
    interface Condition {
        /**
         * @param sr reader positioned on the event to examine
         * @return {@code true} when traversal should stop
         * @throws XMLStreamException if inspecting or advancing the reader fails
         */
        boolean verifyCondition(XMLStreamReader2 sr) throws XMLStreamException;
    }

    /** Receiver for one parsed field: an element name plus its text content. */
    interface Builder {
        /**
         * @param tagName local name of the element that was read
         * @param sb      buffer holding the element's text; callers reuse it, so
         *                implementations should copy the content they need
         */
        void schoolBuilder(String tagName, StringBuilder sb);
    }

    /** Value holder for a single {@code <school>} element. */
    public static class School implements Builder {

        public String name;
        public String comment;

        @Override
        public String toString() {
            return "name:" + name + ":comment:" + comment;
        }

        /**
         * Stores the text of a {@code name} or {@code comment} element; any other
         * element name is ignored.
         */
        @Override
        public void schoolBuilder(String tagName, StringBuilder sb) {
            switch (tagName) {
                case "name":
                    this.name = sb.toString();
                    break;
                case "comment":
                    this.comment = sb.toString();
                    break;
                default:
                    // Unknown fields are skipped.
                    break;
            }
        }
    }
}
cowtowncoder commented 11 months ago

I am not interested in working on such a feature (it is pretty much against XML specification mandates), but if you want to try implementing it, feel free to. I will always consider contributions for adding configurable options, even for non-standards-compliant handling like this one.

You will have to dig in code for that tho, I don't have time to go dig through the code. I think StreamScanner.java would be a starting point.

cowtowncoder commented 5 months ago

No current plans; if anyone wants to work on this via PR we can re-open.