Closed zufuliu closed 2 months ago
Minor update to the previous patch: html-comment-0401-2.zip
- i -= (ch == '>') ? 2 : 1;
+ if (chNext == '-') {
+ i += 1;
+ }
diff --git a/lexers/LexHTML.cxx b/lexers/LexHTML.cxx
index 0ed9f90b..3d3e08e4 100644
--- a/lexers/LexHTML.cxx
+++ b/lexers/LexHTML.cxx
@@ -1572,6 +1572,15 @@ void SCI_METHOD LexerHTML::Lex(Sci_PositionU startPos, Sci_Position length, int
state = SCE_H_COMMENT; // wait for a pending command
styler.ColourTo(i + 2, SCE_H_COMMENT);
i += 2; // follow styling after the --
+ chNext = SafeGetUnsignedCharAt(styler, i + 1);
+ if ((chNext == '>') || (chNext == '-' && SafeGetUnsignedCharAt(styler, i + 2) == '>')) {
+ // https://html.spec.whatwg.org/multipage/parsing.html#parse-error-abrupt-closing-of-empty-comment
+ if (chNext == '-') {
+ i += 1;
+ }
+ chPrev = '-';
+ ch = '-';
+ }
} else if (isWordCdata(i + 1, i + 7, styler)) {
state = SCE_H_CDATA;
} else {
@@ -1843,7 +1852,11 @@ void SCI_METHOD LexerHTML::Lex(Sci_PositionU startPos, Sci_Position length, int
}
break;
case SCE_H_COMMENT:
- if ((scriptLanguage != eScriptComment) && (chPrev2 == '-') && (chPrev == '-') && (ch == '>')) {
+ if ((scriptLanguage != eScriptComment) && (chPrev2 == '-') && (chPrev == '-') && (ch == '>' || (ch == '!' && chNext == '>'))) {
+ // https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment
+ if (ch == '!') {
+ i += 1;
+ }
styler.ColourTo(i, StateToPrint);
state = SCE_H_DEFAULT;
levelCurrent--;
The patch is still not ready as XML comment parsing may not follow the same rules.
Patch also needs an example file to ensure it stays fixed.
diff --git a/lexers/LexHTML.cxx b/lexers/LexHTML.cxx
index 0ed9f90b..c39ed0e4 100644
--- a/lexers/LexHTML.cxx
+++ b/lexers/LexHTML.cxx
@@ -1572,6 +1572,18 @@ void SCI_METHOD LexerHTML::Lex(Sci_PositionU startPos, Sci_Position length, int
state = SCE_H_COMMENT; // wait for a pending command
styler.ColourTo(i + 2, SCE_H_COMMENT);
i += 2; // follow styling after the --
+ if (!isXml) {
+ // handle empty comment: <!-->, <!--->
+ // https://html.spec.whatwg.org/multipage/parsing.html#parse-error-abrupt-closing-of-empty-comment
+ chNext = SafeGetUnsignedCharAt(styler, i + 1);
+ if ((chNext == '>') || (chNext == '-' && SafeGetUnsignedCharAt(styler, i + 2) == '>')) {
+ if (chNext == '-') {
+ i += 1;
+ }
+ chPrev = '-';
+ ch = '-';
+ }
+ }
} else if (isWordCdata(i + 1, i + 7, styler)) {
state = SCE_H_CDATA;
} else {
@@ -1843,7 +1855,12 @@ void SCI_METHOD LexerHTML::Lex(Sci_PositionU startPos, Sci_Position length, int
}
break;
case SCE_H_COMMENT:
- if ((scriptLanguage != eScriptComment) && (chPrev2 == '-') && (chPrev == '-') && (ch == '>')) {
+ if ((scriptLanguage != eScriptComment) && (chPrev2 == '-') && (chPrev == '-') && (ch == '>' || (!isXml && ch == '!' && chNext == '>'))) {
+ // close HTML comment with --!>
+ // https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment
+ if (ch == '!') {
+ i += 1;
+ }
styler.ColourTo(i, StateToPrint);
state = SCE_H_DEFAULT;
levelCurrent--;
Test case (a <p> is added after each comment to test for potential off-by-one errors and to make the result readable in a web browser):
<!----><p>1 normal comment</p>
<!-- > and <!--><p>2 valid comment</p>
<!--><p>3 abrupt-closing-of-empty-comment</p>
<!---><p>4 abrupt-closing-of-empty-comment</p>
<!----!><p>5 incorrectly-closed-comment</p>
<!--!> <h1 value="--!><p>6 incorrectly-closed-comment</p>
<!--<!---><p>7 nested-comment</p>
<!--<!---!><p>8 nested-comment</p>
Updated the patch to only handle these special comments in HTML, as XML has strict comment parsing, described at https://www.w3.org/TR/xml/#sec-comments
For compatibility, the string "--" (double-hyphen) must not occur within comments. Note that the grammar does not allow a comment ending in --->.
Firefox reports "unclosed token" for the following:
<?xml version="1.0" encoding="utf-8"?>
<root>
<!-->
</root>
<?xml version="1.0" encoding="utf-8"?>
<root>
<!--->
</root>
and "not well-formed" for the following:
<?xml version="1.0" encoding="utf-8"?>
<root>
<!--<!--->
</root>
<?xml version="1.0" encoding="utf-8"?>
<root>
<!----!>
</root>
<?xml version="1.0" encoding="utf-8"?>
<root>
<!--<!---!>
</root>
The XML comment double-hyphen problem could be kept as is or handled in another issue.
See https://github.com/python/cpython/issues/102555. For the following snippet, all text before the number is treated as a comment by the browser:
Currently only lines 1 and 2 are handled correctly.
https://html.spec.whatwg.org/multipage/parsing.html#parse-error-nested-comment
https://html.spec.whatwg.org/multipage/parsing.html#parse-error-abrupt-closing-of-empty-comment
https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment
Patch html-comment-0401.zip
More tests may be needed, as there are restrictions listed at https://html.spec.whatwg.org/multipage/syntax.html#comments