taoqf / node-html-parser

A very fast HTML parser, generating a simplified DOM, with basic element query support.
MIT License
1.11k stars 107 forks source link

backslash-escaped doublequotes in attributes after removeAttribute #218

Closed milahu closed 1 year ago

milahu commented 2 years ago
// Minimal reproduction: parse a small document, modify attributes on two of
// its elements, and print the re-serialized HTML to compare attribute quoting.
const { parse } = require('node-html-parser')

// Three <div>s whose single-quoted attribute values contain double quotes:
// "_" is never touched, "e" is the expected-output case, "a" is the
// actual (buggy) case.
var insrc = `\
<html>
  <div id="_" title='"world"' onClick='alert("hello")' color="red">nochange</div>
  <div id="e" title='"world"' color="red">expected</div>
  <div id="a" title='"world"' onClick='alert("hello")' color="red">actual</div>
</html>
`;

var root = parse(insrc);

// Control case: after setAttribute on #e, the printed title uses &quot;
// entities, which is valid HTML.
root.querySelectorAll('#e').forEach(node => {
  node.setAttribute('onClick', "alert('hello')");
})

// Failing case: after removeAttribute on #a, the remaining attributes are
// printed double-quoted with backslash-escaped inner quotes (invalid HTML).
root.querySelectorAll('#a').forEach(node => {
  //node.setAttribute('title', '"replaced"');
  node.removeAttribute('color'); // FIXME
})

// Serialize the whole document so the three <div>s can be compared.
console.log(root.toString());

result

<html>
  <div id="_" title='"world"' onClick='alert("hello")' color="red">nochange</div>
  <div id="e" title="&quot;world&quot;" color="red" onClick="alert('hello')">expected</div>
  <div id="a" title="\"world\"" onClick="alert(\"hello\")">actual</div>
</html>

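title="\"world\"" is invalid HTML: the backslash is not an escape character in HTML attribute values, so the value ends at the first inner double quote.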

The output should be either title='"world"' (pretty, minimal diff) or title="&quot;world&quot;" (ugly, invasive).

milahu commented 1 year ago

A similar problem: newlines in attribute values are escaped as \n, but they should be serialized as the character reference &#10;.

failing test

diff --git a/test/tests/html.js b/test/tests/html.js
index 3bcf636..8d85725 100644
--- a/test/tests/html.js
+++ b/test/tests/html.js
@@ -391,6 +391,13 @@ describe('HTML Parser', function () {
                                p.getAttribute('c').should.eql('undefined');
                                p.toString().should.eql('<p a="12" b="null" c="undefined"></p>');
                        });
+                       it('should escape newlines to html entities', function () {
+                               const root = parseHTML('<p></p>');
+                               const p = root.firstChild;
+                               p.setAttribute('a', '1\n2');
+                               p.getAttribute('a').should.eql('1\n2');
+                               p.toString().should.eql('<p a="1&#10;2"></p>');
+                       });
                        it('should throw type Error', function () {
                                const root = parseHTML('<p a=12 b=13 c=14></p>');
                                const p = root.firstChild;
kasvith commented 1 year ago

This also happens when using setAttributes.