Open talonx opened 3 months ago
Hey @talonx! I'm here to help you with bug fixes, questions, and contributions. Let's tackle this issue together while we wait for a human maintainer.
To resolve the issue of your custom RSS parser outputting jumbled and out-of-order XHTML content, you should ensure that the parser processes the text nodes correctly and maintains the order of elements. Here are a few steps you can take:
Ensure Text Nodes are Closed Properly: The `closeText` function should be called appropriately to handle text nodes correctly.
Maintain Order of Elements: Ensure that the parser maintains the order of elements as they appear in the RSS feed.
Handle Asynchronous Processing: If the parser processes chunks of data asynchronously, ensure that the chunks are processed in the correct order.
Here is a code snippet that highlights how to handle text nodes and maintain the order of elements:
/**
 * Flushes the text buffered on the parser.
 *
 * The buffered text is run through `textopts` with the parser's options and
 * stored back on `parser.textNode`; when the result is truthy it is emitted as
 * an 'ontext' event. The buffer is always left as the empty string afterwards.
 *
 * @param {object} parser - parser state holding `opt` and `textNode`.
 */
function closeText(parser) {
  var processed = textopts(parser.opt, parser.textNode);
  // Store the processed text before emitting so handlers see the same value.
  parser.textNode = processed;
  if (processed) {
    emit(parser, 'ontext', processed);
  }
  // Reset the buffer whether or not anything was emitted.
  parser.textNode = '';
}
/**
 * Emits a node event, flushing any buffered text first.
 *
 * If `parser.textNode` is truthy, `closeText` is called before the event is
 * emitted, so the text event comes out ahead of the node event.
 *
 * @param {object} parser - parser state; `textNode` is checked for pending text.
 * @param {string} nodeType - event name handed to `emit`.
 * @param {*} data - payload handed to `emit`.
 */
function emitNode(parser, nodeType, data) {
  var pendingText = parser.textNode;
  if (pendingText) {
    closeText(parser);
  }
  emit(parser, nodeType, data);
}
/**
 * Feeds the next piece of `this.remaining` to the sax parser.
 *
 * When the remaining input fits in one chunk (`options.chunkSize`), it is
 * written in full and the parser is closed. Otherwise one chunk is written,
 * the rest is kept in `this.remaining`, and the next step is scheduled with
 * `setImmediate`, so chunks are always consumed in input order.
 *
 * Any exception thrown while writing is reported once, as an "error" event,
 * guarded by `saxParser.errThrown`.
 *
 * Must be invoked with `this` bound to the parser instance.
 *
 * @returns {*} the result of `saxParser.close()` for the final chunk, the
 *   handle returned by `setImmediate` for intermediate chunks, or the result
 *   of `this.emit` when an error is reported (undefined if one was already
 *   reported).
 */
function processAsync() {
  var chunk, chunkSize;
  try {
    chunkSize = this.options.chunkSize;
    if (this.remaining.length <= chunkSize) {
      // Final piece: hand over everything that is left, then close the parser.
      chunk = this.remaining;
      this.remaining = '';
      this.saxParser = this.saxParser.write(chunk);
      return this.saxParser.close();
    }
    // Intermediate piece: take one chunk off the front and keep the tail.
    chunk = this.remaining.slice(0, chunkSize);
    this.remaining = this.remaining.slice(chunkSize);
    this.saxParser = this.saxParser.write(chunk);
    return setImmediate(this.processAsync.bind(this));
  } catch (err) {
    if (!this.saxParser.errThrown) {
      this.saxParser.errThrown = true;
      // Emit under the "error" event name; passing the Error object as the
      // event name would mean no "error" listener is ever notified.
      return this.emit("error", err);
    }
  }
}
/**
 * Re-initialises the parser.
 *
 * Removes all listeners from this instance, creates a fresh sax parser via
 * `sax.parser`, clears `resultObject`, starts a new empty element stack and
 * installs the onerror / onend / onopentag / onclosetag / ontext / oncdata
 * handlers that build the result object.
 *
 * @returns {Function} the oncdata handler (the value of the final assignment).
 */
Parser.prototype.reset = function () {
var attrkey, charkey, ontext, stack;
this.removeAllListeners();
// Trimming and normalisation are switched off at the sax level; the
// onclosetag handler below applies options.trim / options.normalize itself.
this.saxParser = sax.parser(this.options.strict, {
trim: false,
normalize: false,
xmlns: this.options.xmlns
});
this.saxParser.errThrown = false;
// Report only the first sax error; errThrown guards against repeats.
this.saxParser.onerror = function (_this) {
return function (error) {
// NOTE(review): presumably resume() clears sax's error state so parsing can
// continue — confirm against the sax documentation.
_this.saxParser.resume();
if (!_this.saxParser.errThrown) {
_this.saxParser.errThrown = true;
return _this.emit("error", error);
}
};
}(this);
// Emit "end" at most once; onclosetag may already have set `ended`.
this.saxParser.onend = function (_this) {
return function () {
if (!_this.saxParser.ended) {
_this.saxParser.ended = true;
return _this.emit("end", _this.resultObject);
}
};
}(this);
this.saxParser.ended = false;
this.EXPLICIT_CHARKEY = this.options.explicitCharkey;
this.resultObject = null;
// Elements that are currently open, innermost last.
stack = [];
attrkey = this.options.attrkey;
charkey = this.options.charkey;
// Opening tag: build the object for the element and push it on the stack.
this.saxParser.onopentag = function (_this) {
return function (node) {
var key, newValue, obj, processedKey, ref;
obj = Object.create(null);
// Text content accumulates under charkey (see ontext).
obj[charkey] = "";
if (!_this.options.ignoreAttrs) {
ref = node.attributes;
for (key in ref) {
// hasProp is defined outside this block; presumably
// Object.prototype.hasOwnProperty — verify.
if (!hasProp.call(ref, key)) continue;
// Create the attribute container lazily, and only when attributes are not
// merged directly into the element object.
if (!(attrkey in obj) && !_this.options.mergeAttrs) {
obj[attrkey] = Object.create(null);
}
newValue = _this.options.attrValueProcessors ? processItem(_this.options.attrValueProcessors, node.attributes[key], key) : node.attributes[key];
processedKey = _this.options.attrNameProcessors ? processItem(_this.options.attrNameProcessors, key) : key;
if (_this.options.mergeAttrs) {
_this.assignOrPush(obj, processedKey, newValue);
} else {
obj[attrkey][processedKey] = newValue;
}
}
}
// The (optionally processed) tag name is kept under "#name" while the element
// is open; onclosetag reads it back.
obj["#name"] = _this.options.tagNameProcessors ? processItem(_this.options.tagNameProcessors, node.name) : node.name;
if (_this.options.xmlns) {
obj[_this.options.xmlnskey] = {
uri: node.uri,
local: node.local
};
}
return stack.push(obj);
};
}(this);
// Closing tag: finalise the innermost element and attach it to its parent, or
// publish it as the result when it has no parent.
this.saxParser.onclosetag = function (_this) {
return function () {
var cdata, emptyStr, key, node, nodeName, obj, objClone, old, s, xpath;
obj = stack.pop();
nodeName = obj["#name"];
// "#name" is kept on the object only when both explicitChildren and
// preserveChildrenOrder are set.
if (!_this.options.explicitChildren || !_this.options.preserveChildrenOrder) {
delete obj["#name"];
}
// The cdata marker set by oncdata is internal; remember it and remove it.
if (obj.cdata === true) {
cdata = obj.cdata;
delete obj.cdata;
}
// Parent element, or undefined when the closed element was the outermost one.
s = stack[stack.length - 1];
if (obj[charkey].match(/^\s*$/) && !cdata) {
// Whitespace-only text that did not come from CDATA is dropped; the raw
// string is kept in emptyStr for the emptyTag fallback below.
emptyStr = obj[charkey];
delete obj[charkey];
} else {
if (_this.options.trim) {
obj[charkey] = obj[charkey].trim();
}
if (_this.options.normalize) {
obj[charkey] = obj[charkey].replace(/\s{2,}/g, " ").trim();
}
obj[charkey] = _this.options.valueProcessors ? processItem(_this.options.valueProcessors, obj[charkey], nodeName) : obj[charkey];
// An element whose only key is its text collapses to the bare string unless
// an explicit charkey was requested.
if (Object.keys(obj).length === 1 && charkey in obj && !_this.EXPLICIT_CHARKEY) {
obj = obj[charkey];
}
}
// isEmpty is defined outside this block. For an empty element the value comes
// from options.emptyTag (called if it is a function); when emptyTag is ''
// the original whitespace string is used instead.
if (isEmpty(obj)) {
if (typeof _this.options.emptyTag === 'function') {
obj = _this.options.emptyTag();
} else {
obj = _this.options.emptyTag !== '' ? _this.options.emptyTag : emptyStr;
}
}
if (_this.options.validator != null) {
// The path is built from the "#name" of the elements still on the stack; the
// element being closed has already been popped and is not part of it.
xpath = "/" + function () {
var i, len, results;
results = [];
for (i = 0, len = stack.length; i < len; i++) {
results.push(stack[i]["#name"]);
}
return results.join("/");
}();
// The validator's return value replaces the element value.
obj = _this.options.validator(xpath, s, obj);
}
if (s) {
_this.assignOrPush(s, nodeName, obj);
} else {
// No parent: this is the outermost element, so publish the result. Setting
// `ended` keeps the onend handler from emitting "end" again.
_this.resultObject = obj;
_this.saxParser.ended = true;
return _this.emit("end", _this.resultObject);
}
};
}(this);
// Text handler, shared by ontext and oncdata. Returns the element the text was
// added to, or undefined when no element is open.
ontext = function (_this) {
return function (text) {
var charChild, s;
s = stack[stack.length - 1];
if (s) {
s[charkey] += text;
// With explicitChildren + preserveChildrenOrder + charsAsChildren the text is
// also recorded as a '__text__' child; whitespace-only text is skipped unless
// includeWhiteChars is set.
if (_this.options.explicitChildren && _this.options.preserveChildrenOrder && _this.options.charsAsChildren && (_this.options.includeWhiteChars || text.replace(/\n/g, '').trim() !== '')) {
s[_this.options.childkey] = s[_this.options.childkey] || [];
charChild = {
'#name': '__text__'
};
charChild[charkey] = text;
if (_this.options.normalize) {
charChild[charkey] = charChild[charkey].replace(/\s{2,}/g, " ").trim();
}
s[_this.options.childkey].push(charChild);
}
return s;
}
};
}(this);
this.saxParser.ontext = ontext;
// CDATA goes through the same text handler and additionally flags the element
// so onclosetag keeps whitespace-only content.
return this.saxParser.oncdata = function (_this) {
return function (text) {
var s;
s = ontext(text);
if (s) {
return s.cdata = true;
}
};
}(this);
};
This code ensures that text nodes are closed properly and that the order of elements is maintained during parsing. Adjust the `processAsync` function to handle asynchronous processing correctly, ensuring that chunks are processed in the correct order [1].
To continue talking to Dosu, mention @dosu.
hetzner.txt
After parsing when I attempt to access the content using
the content is mixed up like this
Notice the last two "in_progress"es squashed together, and the dates also.
The actual content looks like this (see attached file)
My parser is defined with a custom field as