jgm / skylighting

A Haskell syntax highlighting library with tokenizers derived from KDE syntax highlighting descriptions
189 stars 61 forks source link

Bug with Chinese character in HTML #110

Closed jgm closed 3 years ago

jgm commented 3 years ago
% skylighting -s html --trace -f native
试：<a>
Trying rule Rule {rMatcher = IncludeRules ("HTML","FindHTML"), rAttribute = NormalTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = True, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = []}
Trying rule Rule {rMatcher = DetectSpaces, rAttribute = NormalTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = True, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = []}
Trying rule Rule {rMatcher = DetectIdentifier, rAttribute = NormalTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = True, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = []}
DetectIdentifier MATCHED Just (NormalTok,"\35797")
IncludeRules MATCHED Just (NormalTok,"\35797")
Trying rule Rule {rMatcher = IncludeRules ("HTML","FindHTML"), rAttribute = NormalTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = True, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = []}
Trying rule Rule {rMatcher = DetectSpaces, rAttribute = NormalTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = True, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = []}
Trying rule Rule {rMatcher = DetectIdentifier, rAttribute = NormalTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = True, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = []}
DetectIdentifier MATCHED Just (NormalTok,"\65306<")
IncludeRules MATCHED Just (NormalTok,"\65306<")
Trying rule Rule {rMatcher = IncludeRules ("HTML","FindHTML"), rAttribute = NormalTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = True, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = []}
Trying rule Rule {rMatcher = DetectSpaces, rAttribute = NormalTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = True, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = []}
Trying rule Rule {rMatcher = DetectIdentifier, rAttribute = NormalTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = True, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = []}
DetectIdentifier MATCHED Just (NormalTok,"a")
IncludeRules MATCHED Just (NormalTok,"a")
Trying rule Rule {rMatcher = IncludeRules ("HTML","FindHTML"), rAttribute = NormalTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = True, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = []}
Trying rule Rule {rMatcher = DetectSpaces, rAttribute = NormalTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = True, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = []}
Trying rule Rule {rMatcher = DetectIdentifier, rAttribute = NormalTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = True, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = []}
Trying rule Rule {rMatcher = StringDetect "<!--", rAttribute = CommentTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = True, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","Comment")]}
Trying rule Rule {rMatcher = StringDetect "<![CDATA[", rAttribute = BaseNTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = True, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","CDATA")]}
Trying rule Rule {rMatcher = RegExpr (RE {reString = "<!DOCTYPE\\s+", reCaseSensitive = False}), rAttribute = DataTypeTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","Doctype")]}
Trying rule Rule {rMatcher = RegExpr (RE {reString = "<\\?[\\w:-]*", reCaseSensitive = True}), rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = True, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","PI")]}
Trying rule Rule {rMatcher = IncludeRules ("HTML","FindSpecialHTMLTags"), rAttribute = NormalTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = True, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = []}
Trying rule Rule {rMatcher = RegExpr (RE {reString = "<style\\b", reCaseSensitive = False}), rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","CSS")]}
Trying rule Rule {rMatcher = RegExpr (RE {reString = "<script\\b", reCaseSensitive = False}), rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","JS")]}
Trying rule Rule {rMatcher = IncludeRules ("HTML","FindHTMLTags"), rAttribute = NormalTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = True, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = []}
Trying rule Rule {rMatcher = WordDetect "<pre", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Open")]}
Trying rule Rule {rMatcher = WordDetect "<div", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Open")]}
Trying rule Rule {rMatcher = WordDetect "<table", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Open")]}
Trying rule Rule {rMatcher = WordDetect "<ul", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Open")]}
Trying rule Rule {rMatcher = WordDetect "<ol", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Open")]}
Trying rule Rule {rMatcher = WordDetect "<dl", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Open")]}
Trying rule Rule {rMatcher = WordDetect "<article", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Open")]}
Trying rule Rule {rMatcher = WordDetect "<aside", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Open")]}
Trying rule Rule {rMatcher = WordDetect "<details", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Open")]}
Trying rule Rule {rMatcher = WordDetect "<figure", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Open")]}
Trying rule Rule {rMatcher = WordDetect "<footer", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Open")]}
Trying rule Rule {rMatcher = WordDetect "<header", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Open")]}
Trying rule Rule {rMatcher = WordDetect "<main", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Open")]}
Trying rule Rule {rMatcher = WordDetect "<nav", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Open")]}
Trying rule Rule {rMatcher = WordDetect "<section", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Open")]}
Trying rule Rule {rMatcher = RegExpr (RE {reString = "<[A-Za-z_:][\\w.:_-]*", reCaseSensitive = True}), rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = True, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Open")]}
Trying rule Rule {rMatcher = WordDetect "</pre", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Close")]}
Trying rule Rule {rMatcher = WordDetect "</div", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Close")]}
Trying rule Rule {rMatcher = WordDetect "</table", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Close")]}
Trying rule Rule {rMatcher = WordDetect "</ul", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Close")]}
Trying rule Rule {rMatcher = WordDetect "</ol", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Close")]}
Trying rule Rule {rMatcher = WordDetect "</dl", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Close")]}
Trying rule Rule {rMatcher = WordDetect "</article", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Close")]}
Trying rule Rule {rMatcher = WordDetect "</aside", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Close")]}
Trying rule Rule {rMatcher = WordDetect "</details", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Close")]}
Trying rule Rule {rMatcher = WordDetect "</figure", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Close")]}
Trying rule Rule {rMatcher = WordDetect "</footer", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Close")]}
Trying rule Rule {rMatcher = WordDetect "</header", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Close")]}
Trying rule Rule {rMatcher = WordDetect "</main", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Close")]}
Trying rule Rule {rMatcher = WordDetect "</nav", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Close")]}
Trying rule Rule {rMatcher = WordDetect "</section", rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = False, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Close")]}
Trying rule Rule {rMatcher = RegExpr (RE {reString = "</[A-Za-z_:][\\w.:_-]*", reCaseSensitive = True}), rAttribute = KeywordTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = True, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","El Close")]}
Trying rule Rule {rMatcher = IncludeRules ("HTML","FindDTDRules"), rAttribute = NormalTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = True, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = []}
Trying rule Rule {rMatcher = RegExpr (RE {reString = "<!(?:ELEMENT|ENTITY|ATTLIST|NOTATION)\\b", reCaseSensitive = True}), rAttribute = DataTypeTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = True, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = [Push ("HTML","Doctype Markupdecl")]}
Trying rule Rule {rMatcher = IncludeRules ("HTML","FindEntityRefs"), rAttribute = NormalTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = True, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = []}
Trying rule Rule {rMatcher = RegExpr (RE {reString = "&(?:#[0-9]+|#[xX][0-9A-Fa-f]+|[A-Za-z_:][\\w.:_-]*);", reCaseSensitive = True}), rAttribute = DecValTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = True, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = []}
Trying rule Rule {rMatcher = AnyChar (fromList "&<"), rAttribute = ErrorTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = True, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = []}
FALLTHROUGH Just (NormalTok,">")
[ [ ( NormalTok , "\35797\65306<a>" ) ] ]
jgm commented 3 years ago

The problem is evidently here:

Trying rule Rule {rMatcher = DetectIdentifier, rAttribute = NormalTok, rIncludeAttribute = False, rDynamic = False, rCaseSensitive = True, rChildren = [], rLookahead = False, rFirstNonspace = False, rColumn = Nothing, rContextSwitch = []}
DetectIdentifier MATCHED Just (NormalTok,"\65306<")

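The `<` should not be part of the identifier. Because DetectIdentifier consumes it together with `\65306`, the tag-opening rules are never tried at the `<`, so `<a>` is not recognized as a tag and the whole input ends up as a single NormalTok.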

jgm commented 3 years ago

In the source code for detectIdentifier, it says: "NOTE: limited to ASCII as per kate documentation." That is undoubtedly the issue.

Have you tried this in the Kate editor?

jgm commented 3 years ago

Fixed!