ScintillaOrg / lexilla

A library of language lexers for use with Scintilla
https://www.scintilla.org/Lexilla.html
Other
186 stars 67 forks source link

Backport TOML lexer from Notepad4 #261

Closed techee closed 3 months ago

techee commented 3 months ago

This patch adds the TOML lexer from the Notepad4 editor originally created by Zufu Liu, see https://github.com/zufuliu/notepad4/issues/806.

Some changes have been made to make it compile and work in Scintilla since Notepad4 contains a modified Scintilla version:

Full diff ```diff --- LexTOML_orig.cxx 2024-08-13 18:35:25.105202586 +0200 +++ LexTOML.cxx 2024-08-13 18:20:05.318285762 +0200 @@ -1,6 +1,10 @@ -// This file is part of Notepad4. -// See License.txt for details about distribution and modification. -//! Lexer for TOML. +// Scintilla source code edit control +/** @file LexTOML.cxx + ** Lexer for TOML language. + **/ +// Based on Zufu Liu's Notepad4 TOML lexer +// Modified for Scintilla by Jiri Techet, 2024 +// The License.txt file describes the conditions under which this software may be distributed. #include #include @@ -21,7 +25,33 @@ using namespace Lexilla; -namespace { +constexpr bool IsEOLChar(int ch) noexcept { + return ch == '\r' || ch == '\n'; +} + +constexpr bool IsHexDigit(int ch) noexcept { + return (ch >= '0' && ch <= '9') + || (ch >= 'A' && ch <= 'F') + || (ch >= 'a' && ch <= 'f'); +} + +constexpr bool IsIdentifierChar(int ch) noexcept { + return IsAlphaNumeric(ch) || ch == '_'; +} + +constexpr bool IsNumberContinue(int chPrev, int ch, int chNext) noexcept { + return ((ch == '+' || ch == '-') && (chPrev == 'e' || chPrev == 'E')) + || (ch == '.' 
&& chNext != '.'); +} + +constexpr bool IsDecimalNumber(int chPrev, int ch, int chNext) noexcept { + return IsIdentifierChar(ch) || IsNumberContinue(chPrev, ch, chNext); +} + +constexpr bool IsISODateTime(int ch, int chNext) noexcept { + return ((ch == '+' || ch == '-' || ch == ':' || ch == '.') && IsADigit(chNext)) + || (ch == ' ' && (chNext == '+' || chNext == '-' || IsADigit(chNext))); +} struct EscapeSequence { int outerState = SCE_TOML_DEFAULT; @@ -50,15 +80,11 @@ }; constexpr bool IsTripleString(int state) noexcept { - return state > SCE_TOML_STRING_DQ; + return state == SCE_TOML_TRIPLE_STRING_SQ || state == SCE_TOML_TRIPLE_STRING_DQ; } constexpr bool IsDoubleQuoted(int state) noexcept { - if constexpr (SCE_TOML_STRING_DQ & 1) { - return state & true; - } else { - return (state & 1) == 0; - } + return state == SCE_TOML_STRING_DQ || state == SCE_TOML_TRIPLE_STRING_DQ; } constexpr int GetStringQuote(int state) noexcept { @@ -73,9 +99,47 @@ return IsIdentifierChar(ch) || ch == '-'; } -bool IsTOMLKey(StyleContext& sc, int braceCount, const WordList *kwList) { +constexpr bool IsWhiteSpace(int ch) noexcept { + return (ch == ' ') || ((ch >= 0x09) && (ch <= 0x0d)); +} + +static int GetDocNextChar(StyleContext& sc) noexcept { + if (!IsWhiteSpace(sc.ch)) { + return sc.ch; + } + if (!IsWhiteSpace(sc.chNext)) { + return sc.chNext; + } + for (Sci_Position pos = 2; ; pos++) { + const unsigned char chPos = sc.GetRelative(pos); + if (!IsWhiteSpace(chPos)) { + return chPos; + } + } +} + +static int GetLineNextChar(StyleContext& sc) noexcept { + if (!IsWhiteSpace(sc.ch)) { + return sc.ch; + } + if (static_cast(sc.currentPos) + 1 == sc.lineStartNext) { + return '\0'; + } + if (!IsWhiteSpace(sc.chNext)) { + return sc.chNext; + } + for (Sci_Position pos = 2; pos < sc.lineStartNext; pos++) { + const unsigned char chPos = sc.GetRelative(pos); + if (!IsWhiteSpace(chPos)) { + return chPos; + } + } + return '\0'; +} + +static bool IsTOMLKey(StyleContext& sc, int braceCount, const 
WordList *kwList) { if (braceCount) { - const int chNext = sc.GetDocNextChar(); + const int chNext = GetDocNextChar(sc); if (chNext == '=' || chNext == '.' || chNext == '-') { sc.ChangeState(SCE_TOML_KEY); return true; @@ -103,12 +167,12 @@ enum class TOMLKeyState { Unquoted = 0, - Literal, - Quoted, + Literal, // single-quoted + Quoted, // double-quoted End, }; -void ColouriseTOMLDoc(Sci_PositionU startPos, Sci_Position lengthDoc, int initStyle, LexerWordList keywordLists, Accessor &styler) { +static void ColouriseTOMLDoc(Sci_PositionU startPos, Sci_Position lengthDoc, int initStyle, WordList *keywordLists[], Accessor &styler) { int visibleChars = 0; int tableLevel = 0; int braceCount = 0; @@ -155,7 +219,7 @@ case SCE_TOML_IDENTIFIER: if (!IsIdentifierChar(sc.ch)) { - if (IsTOMLKey(sc, braceCount, &keywordLists[0])) { + if (IsTOMLKey(sc, braceCount, keywordLists[0])) { keyState = TOMLKeyState::Unquoted; continue; } @@ -208,12 +272,12 @@ if (sc.ch == ']') { sc.Forward(); } - const int chNext = sc.GetLineNextChar(); + const int chNext = GetLineNextChar(sc); if (chNext == '#') { sc.SetState(SCE_TOML_DEFAULT); } } else if (sc.state == SCE_TOML_KEY && !IsTOMLUnquotedKey(sc.ch)) { - const int chNext = sc.GetLineNextChar(); + const int chNext = GetLineNextChar(sc); if (!AnyOf(chNext, '\'', '\"', '.', '=')) { sc.ChangeState(SCE_TOML_ERROR); continue; @@ -234,9 +298,10 @@ sc.SetState(SCE_TOML_ESCAPECHAR); sc.Forward(); } - } else if (sc.ch == GetStringQuote(sc.state) && (!IsTripleString(sc.state) || sc.MatchNext())) { + } else if (sc.ch == GetStringQuote(sc.state) && + (!IsTripleString(sc.state) || (sc.Match(IsDoubleQuoted(sc.state) ? 
R"(""")" : R"(''')")))) { if (IsTripleString(sc.state)) { - sc.Advance(2); + sc.Forward(2); } sc.Forward(); if (!IsTripleString(sc.state) && IsTOMLKey(sc, braceCount, nullptr)) { @@ -299,16 +364,16 @@ } } else { if (sc.ch == '\'') { - if (sc.MatchNext('\'', '\'')) { + if (sc.Match(R"(''')")) { sc.SetState(SCE_TOML_TRIPLE_STRING_SQ); - sc.Advance(2); + sc.Forward(2); } else { sc.SetState(SCE_TOML_STRING_SQ); } } else if (sc.ch == '"') { - if (sc.MatchNext('"', '"')) { + if (sc.Match(R"(""")")) { sc.SetState(SCE_TOML_TRIPLE_STRING_DQ); - sc.Advance(2); + sc.Forward(2); } else { sc.SetState(SCE_TOML_STRING_DQ); } @@ -359,11 +424,11 @@ } // code folding based on LexProps -void FoldTOMLDoc(Sci_PositionU startPos, Sci_Position lengthDoc, int /*initStyle*/, LexerWordList /*keywordLists*/, Accessor &styler) { - const Sci_Line endPos = startPos + lengthDoc; - const Sci_Line maxLines = styler.GetLine((endPos == styler.Length()) ? endPos : endPos - 1); +static void FoldTOMLDoc(Sci_PositionU startPos, Sci_Position lengthDoc, int /*initStyle*/, WordList *[] /*keywordLists*/, Accessor &styler) { + const Sci_Position endPos = startPos + lengthDoc; + const Sci_Position maxLines = styler.GetLine((endPos == styler.Length()) ? endPos : endPos - 1); - Sci_Line lineCurrent = styler.GetLine(startPos); + Sci_Position lineCurrent = styler.GetLine(startPos); int prevLevel = SC_FOLDLEVELBASE; TOMLLineType prevType = TOMLLineType::None; @@ -425,6 +490,9 @@ } } -} +static const char *const tomlWordListDesc[] = { + "Keywords", + 0 +}; -extern const LexerModule lmTOML(SCLEX_TOML, ColouriseTOMLDoc, "toml", FoldTOMLDoc); +extern const LexerModule lmTOML(SCLEX_TOML, ColouriseTOMLDoc, "toml", FoldTOMLDoc, tomlWordListDesc); ```

Some of the utility functions were taken from other Notepad4 Scintilla files and changes have been made to adapt the lexer to the Scintilla API, but nothing major.

I briefly checked the code of the lexer and things seem to look good in general (the IsTOMLKey() function is a bit confusing as it also sets state but I don't know how to name it better or how to rewrite it to avoid duplicated code). Specifically I checked whether inside ColouriseTOMLDoc() the lexer always advances forward to avoid hangups and all the branches seem to be alright.

For the test I used the various examples from https://toml.io/en/v1.0.0 - the only slightly incorrect output I noticed was at the end of

{6}str7 {8}={0} {12}""""This," she said, "is just a pointless statement."""{10}"

and

{6}str {8}={0} {11}''''That,' she said, 'is still pointless.'''{9}'

where the lexer takes the first triple-string delimiters to close the string even though in this case it should probably take the last 3 in the sequence. But I think it's not a big problem in practice and I didn't spend time fixing it.

Please let me know what is missing and what else should be modified to get this merged.

nyamatongwe commented 3 months ago

When I debug this in SciTE built with Visual C++ 2022 with the AllStyles.toml, there is a failure at the end of ColouriseTOMLDoc.

Run-Time Check Failure #2 - Stack around the variable 'escSeq' was corrupted.

This is commonly an array out-of-bounds write. It may be caused by the next issue.

To prevent identifier leakage and potentially bad links, `static` should be avoided and all of the code, except for the final LexerModule declaration, should be inside an unnamed namespace (`namespace {`), as is done for many other lexers including LexPython.cxx. After using an unnamed namespace the above problem stopped, although that could be due to other changes I have made.

AllStyles.toml should include examples of styles SCE_TOML_IDENTIFIER (2) and SCE_TOML_ERROR (7) if they are possible. Adding the line testlexers.list.styles=1 to SciTE.properties will show the styles that were produced 0-1 3-6 8-14.

CharacterSet.h includes an IsAHeXDigit which can be used instead of a local IsHexDigit.

If a background colour is added to SCE_TOML_KEY (6), like style.toml.6=fore:#000080,bold,back:#FFD0D0, it shows trailing whitespace as part of the key. It looks weird to me but I'll accept it if that's what you want.

nyamatongwe commented 3 months ago

From Visual C++ Code Analysis:

G:\u\hg\pull\lexilla\lexers\LexTOML.cxx(116): warning C26447: The function is declared 'noexcept' but calls function 'GetRelative()' which may throw exceptions (f.6).
G:\u\hg\pull\lexilla\lexers\LexTOML.cxx(134): warning C26447: The function is declared 'noexcept' but calls function 'GetRelative()' which may throw exceptions (f.6).
zufuliu commented 3 months ago

@techee trailing quotes inside triple-quoted string and trailing space after key were fixed in Notepad4, you can sync the changes.

nyamatongwe commented 3 months ago

Fairly sure that a link clash between EscapeSequence (or its methods) in LexJSON.cxx and LexTOML.cxx was the cause of the 'Run-Time Check Failure' since replacing EscapeSequence with XEscapeSequence in LexTOML.cxx fixed it. C++ doesn't allow specifying a struct is static so an unnamed namespace should be used.

The g++ crash was with a debug version of GetLineState which throws for out-of-bounds instead of returning 0.

zufuliu commented 3 months ago

> The g++ crash was with a debug version of GetLineState which throws for out-of-bounds instead of returning 0.

It's the difference between Lexilla's TestDocument::GetLineState() and Scintilla's LineState::GetLineState():

// Lexilla test-harness version: returns the stored line state for `line`.
// Uses at() for the lookup; per the discussion above this is the variant that
// throws for an out-of-bounds line instead of returning 0 (presumably because
// lineStates is a std::vector-like container with checked access — TODO confirm).
int SCI_METHOD TestDocument::GetLineState(Sci_Position line) const {
    return lineStates.at(line);
}

// Scintilla version: returns the stored line state for `line`.
int LineState::GetLineState(Sci::Line line) {
    // A negative line number yields state 0 without touching the storage.
    if (line < 0)
        return 0;
    // Called with line + 1 before indexing — presumably extends lineStates so
    // that index `line` is valid; verify against EnsureLength's definition.
    lineStates.EnsureLength(line + 1);
    return lineStates[line];
}
techee commented 3 months ago

@nyamatongwe I believe I (together with fixes from @zufuliu, thanks!) addressed all the issues. Let me know if anything is missing.

nyamatongwe commented 3 months ago

Committed in squashed form with a change log item.

zufuliu commented 3 months ago

@techee I added another commit that fixes the examples from https://toml.io/en/v1.0.0#keys

fruit.name = "banana"     # this is best practice
fruit. color = "yellow"    # same as fruit.color
fruit . flavor = "banana"   # same as fruit.flavor

though background color for spaces around dot is not fixed.

techee commented 3 months ago

> @techee I added another commit that fixes the examples from https://toml.io/en/v1.0.0#keys

Thanks, I noticed that problem too but didn't include it in the unit test since, based on the documentation, this style is discouraged.

I'll add your commit and update the unit test with it.

techee commented 3 months ago

See https://github.com/ScintillaOrg/lexilla/pull/262