rbren / rss-parser

A lightweight RSS parser, for Node and the browser
MIT License
1.37k stars 210 forks source link

XHTML content is parsed out of order? #275

Open talonx opened 2 months ago

talonx commented 2 months ago

hetzner.txt

After parsing when I attempt to access the content using

feedItem.contentSnippet

the content is mixed up like this

'Start: 2024-08-06T08:45:00+00:00 Estimated end: 2024-08-08T13:00:00+00:00 We\n' +
    'will be performing routine maintenance work on cloud load balancers in Helsinki.\n' +
    'During this maintenance work, there may be a short connection loss from the\n' +
    'active connections (TCP and HTTP) to the load balancers, or from the load\n' +
    'balancers to their targets. Unfortunately, the maintenance work is taking longer\n' +
    'than we planned. Thank you for your understanding. We have now started the\n' +
    'maintenance work.\n' +
    'In_progressIn_progress2024-08-07T15:33:29+00:002024-08-06T08:45:27+00:00',

Notice the last two "in_progress"es squashed together, and the dates also.

The actual content looks like this (see attached file)

    <strong>In_progress</strong>
    <small>2024-08-07T15:33:29+00:00</small>
    <p>Unfortunately, the maintenance work is taking longer than we planned. Thank you for your understanding.</p>
    <strong>In_progress</strong>
    <small>2024-08-06T08:45:27+00:00</small>
    <p>We have now started the maintenance work. </p>

My parser is defined with a custom field as

type HetznerItem = { category: { term: string } };
const hetznerParser: RSSParser = new RSSParser(new Parser<{}, HetznerItem>(
    {
        customFields: {
            item: ['category'],
        }
    }
));
dosubot[bot] commented 2 months ago

Hey @talonx! I'm here to help you with bug fixes, questions, and contributions. Let's tackle this issue together while we wait for a human maintainer.

To resolve the issue of your custom RSS parser outputting jumbled and out-of-order XHTML content, you should ensure that the parser processes the text nodes correctly and maintains the order of elements. Here are a few steps you can take:

  1. Ensure Text Nodes are Closed Properly: The closeText function should be called appropriately to handle text nodes correctly.

  2. Maintain Order of Elements: Ensure that the parser maintains the order of elements as they appear in the RSS feed.

  3. Handle Asynchronous Processing: If the parser processes chunks of data asynchronously, ensure that the chunks are processed in the correct order.

Here is a code snippet that highlights how to handle text nodes and maintain the order of elements:

function closeText(parser) {
    parser.textNode = textopts(parser.opt, parser.textNode);
    if (parser.textNode) emit(parser, 'ontext', parser.textNode);
    parser.textNode = '';
}

function emitNode(parser, nodeType, data) {
    if (parser.textNode) closeText(parser);
    emit(parser, nodeType, data);
}

function processAsync() {
    var chunk, err;
    try {
        if (this.remaining.length <= this.options.chunkSize) {
            chunk = this.remaining;
            this.remaining = '';
            this.saxParser = this.saxParser.write(chunk);
            return this.saxParser.close();
        } else {
            chunk = this.remaining.substr(0, this.options.chunkSize);
            this.remaining = this.remaining.substr(this.options.chunkSize, this.remaining.length);
            this.saxParser = this.saxParser.write(chunk);
            return setImmediate(this.processAsync.bind(this));
        }
    } catch (error1) {
        err = error1;
        if (!this.saxParser.errThrown) {
            this.saxParser.errThrown = true;
            return this.emit(err);
        }
    }
}

Parser.prototype.reset = function () {
    var attrkey, charkey, ontext, stack;
    this.removeAllListeners();
    this.saxParser = sax.parser(this.options.strict, {
        trim: false,
        normalize: false,
        xmlns: this.options.xmlns
    });
    this.saxParser.errThrown = false;
    this.saxParser.onerror = function (_this) {
        return function (error) {
            _this.saxParser.resume();
            if (!_this.saxParser.errThrown) {
                _this.saxParser.errThrown = true;
                return _this.emit("error", error);
            }
        };
    }(this);
    this.saxParser.onend = function (_this) {
        return function () {
            if (!_this.saxParser.ended) {
                _this.saxParser.ended = true;
                return _this.emit("end", _this.resultObject);
            }
        };
    }(this);
    this.saxParser.ended = false;
    this.EXPLICIT_CHARKEY = this.options.explicitCharkey;
    this.resultObject = null;
    stack = [];
    attrkey = this.options.attrkey;
    charkey = this.options.charkey;
    this.saxParser.onopentag = function (_this) {
        return function (node) {
            var key, newValue, obj, processedKey, ref;
            obj = Object.create(null);
            obj[charkey] = "";
            if (!_this.options.ignoreAttrs) {
                ref = node.attributes;
                for (key in ref) {
                    if (!hasProp.call(ref, key)) continue;
                    if (!(attrkey in obj) && !_this.options.mergeAttrs) {
                        obj[attrkey] = Object.create(null);
                    }
                    newValue = _this.options.attrValueProcessors ? processItem(_this.options.attrValueProcessors, node.attributes[key], key) : node.attributes[key];
                    processedKey = _this.options.attrNameProcessors ? processItem(_this.options.attrNameProcessors, key) : key;
                    if (_this.options.mergeAttrs) {
                        _this.assignOrPush(obj, processedKey, newValue);
                    } else {
                        obj[attrkey][processedKey] = newValue;
                    }
                }
            }
            obj["#name"] = _this.options.tagNameProcessors ? processItem(_this.options.tagNameProcessors, node.name) : node.name;
            if (_this.options.xmlns) {
                obj[_this.options.xmlnskey] = {
                    uri: node.uri,
                    local: node.local
                };
            }
            return stack.push(obj);
        };
    }(this);
    this.saxParser.onclosetag = function (_this) {
        return function () {
            var cdata, emptyStr, key, node, nodeName, obj, objClone, old, s, xpath;
            obj = stack.pop();
            nodeName = obj["#name"];
            if (!_this.options.explicitChildren || !_this.options.preserveChildrenOrder) {
                delete obj["#name"];
            }
            if (obj.cdata === true) {
                cdata = obj.cdata;
                delete obj.cdata;
            }
            s = stack[stack.length - 1];
            if (obj[charkey].match(/^\s*$/) && !cdata) {
                emptyStr = obj[charkey];
                delete obj[charkey];
            } else {
                if (_this.options.trim) {
                    obj[charkey] = obj[charkey].trim();
                }
                if (_this.options.normalize) {
                    obj[charkey] = obj[charkey].replace(/\s{2,}/g, " ").trim();
                }
                obj[charkey] = _this.options.valueProcessors ? processItem(_this.options.valueProcessors, obj[charkey], nodeName) : obj[charkey];
                if (Object.keys(obj).length === 1 && charkey in obj && !_this.EXPLICIT_CHARKEY) {
                    obj = obj[charkey];
                }
            }
            if (isEmpty(obj)) {
                if (typeof _this.options.emptyTag === 'function') {
                    obj = _this.options.emptyTag();
                } else {
                    obj = _this.options.emptyTag !== '' ? _this.options.emptyTag : emptyStr;
                }
            }
            if (_this.options.validator != null) {
                xpath = "/" + function () {
                    var i, len, results;
                    results = [];
                    for (i = 0, len = stack.length; i < len; i++) {
                        results.push(stack[i]["#name"]);
                    }
                    return results.join("/");
                }();
                obj = _this.options.validator(xpath, s, obj);
            }
            if (s) {
                _this.assignOrPush(s, nodeName, obj);
            } else {
                _this.resultObject = obj;
                _this.saxParser.ended = true;
                return _this.emit("end", _this.resultObject);
            }
        };
    }(this);
    ontext = function (_this) {
        return function (text) {
            var charChild, s;
            s = stack[stack.length - 1];
            if (s) {
                s[charkey] += text;
                if (_this.options.explicitChildren && _this.options.preserveChildrenOrder && _this.options.charsAsChildren && (_this.options.includeWhiteChars || text.replace(/\n/g, '').trim() !== '')) {
                    s[_this.options.childkey] = s[_this.options.childkey] || [];
                    charChild = {
                        '#name': '__text__'
                    };
                    charChild[charkey] = text;
                    if (_this.options.normalize) {
                        charChild[charkey] = charChild[charkey].replace(/\s{2,}/g, " ").trim();
                    }
                    s[_this.options.childkey].push(charChild);
                }
                return s;
            }
        };
    }(this);
    this.saxParser.ontext = ontext;
    return this.saxParser.oncdata = function (_this) {
        return function (text) {
            var s;
            s = ontext(text);
            if (s) {
                return s.cdata = true;
            }
        };
    }(this);
};

This code ensures that text nodes are closed properly and that the order of elements is maintained during parsing. Adjust the processAsync function to handle asynchronous processing correctly, ensuring that chunks are processed in the correct order [1].

To continue talking to Dosu, mention @dosu.