ScintillaOrg / lexilla

A library of language lexers for use with Scintilla
https://www.scintilla.org/Lexilla.html
Other
186 stars 67 forks source link

Backport Dart lexer from Notepad4 #265

Closed techee closed 1 month ago

techee commented 3 months ago

This patch adds the Dart lexer from the Notepad4 editor originally created by Zufu Liu (@zufuliu).

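Some changes have been made to make it compile and work in Scintilla since Notepad4 contains a modified Scintilla version. Also, some features of the Notepad4 lexer have been removed/changed:

- folding has been modified to use simple folding like the rest of Scintilla lexers and not folding of the previous line based on brace presence on the next line like Notepad4
- "semi-syntactic" coloring of Notepad4 which colors words following a keyword (such as coloring Foo in "class Foo") has been removed as this is not performed in other Scintilla lexers
- highlighting of tasks such as TODOs in comments has been removed as it isn't present in other Scintilla lexers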

The full diff is here:

Full diff ```diff --- notepad4/scintilla/lexers/LexDart.cxx 2024-08-21 11:44:51.924318156 +0200 +++ lexilla/lexers/LexDart.cxx 2024-08-20 23:22:14.215713622 +0200 @@ -1,6 +1,10 @@ -// This file is part of Notepad4. -// See License.txt for details about distribution and modification. -//! Lexer for Dart. +// Scintilla source code edit control +/** @file LexDart.cxx + ** Lexer for Dart. + **/ +// Based on Zufu Liu's Notepad4 Dart lexer +// Modified for Scintilla by Jiri Techet, 2024 +// The License.txt file describes the conditions under which this software may be distributed. #include #include @@ -18,13 +22,79 @@ #include "Accessor.h" #include "StyleContext.h" #include "CharacterSet.h" -#include "StringUtils.h" #include "LexerModule.h" -#include "LexerUtils.h" using namespace Lexilla; namespace { +// Use an unnamed namespace to protect the functions and classes from name conflicts + +constexpr bool IsEOLChar(int ch) noexcept { + return ch == '\r' || ch == '\n'; +} + +constexpr bool IsAGraphic(int ch) noexcept { + // excludes C0 control characters and whitespace + return ch > 32 && ch < 127; +} + +constexpr bool IsIdentifierChar(int ch) noexcept { + return IsAlphaNumeric(ch) || ch == '_'; +} + +constexpr bool IsIdentifierStart(int ch) noexcept { + return IsUpperOrLowerCase(ch) || ch == '_'; +} + +constexpr bool IsNumberContinue(int chPrev, int ch, int chNext) noexcept { + return ((ch == '+' || ch == '-') && (chPrev == 'e' || chPrev == 'E')) + || (ch == '.' && chNext != '.'); +} + +constexpr bool IsNumberStart(int ch, int chNext) noexcept { + return IsADigit(ch) || (ch == '.' 
&& IsADigit(chNext)); +} + +constexpr bool IsDecimalNumber(int chPrev, int ch, int chNext) noexcept { + return IsIdentifierChar(ch) || IsNumberContinue(chPrev, ch, chNext); +} + +int PackLineState(const std::vector& states) noexcept { + // 2 bits for number of states + constexpr int countBits = 2; + // 6 bits for up to 3 stored states + constexpr int stateBits = 6; + size_t index = states.size(); + // storing at most 3 states + const int backCount = std::min(static_cast(index), 3); + int lineState = 0; + int count = backCount; + while (count != 0) { + --count; + --index; + int state = states[index]; + lineState = (lineState << stateBits) | state; + } + lineState = (lineState << countBits) | backCount; + return lineState; +} + +void UnpackLineState(int lineState, std::vector& states) { + // 2 bits for number of states + constexpr int countBits = 2; + // 6 bits for up to 3 stored states + constexpr int stateBits = 6; + constexpr int countMask = (1 << countBits) - 1; + constexpr int valueMask = (1 << stateBits) - 1; + int count = lineState & countMask; + lineState >>= countBits; + while (count != 0) { + int state = lineState & valueMask; + states.push_back(state); + lineState >>= stateBits; + --count; + } +} struct EscapeSequence { int outerState = SCE_DART_DEFAULT; @@ -43,7 +113,7 @@ } bool atEscapeEnd(int ch) noexcept { --digitsLeft; - return digitsLeft <= 0 || !IsHexDigit(ch); + return digitsLeft <= 0 || !IsAHeXDigit(ch); } }; @@ -52,29 +122,21 @@ DartLineStateMaskImport = (1 << 1), // import }; -//KeywordIndex++Autogenerated -- start of section automatically generated enum { - KeywordIndex_Keyword = 0, - KeywordIndex_Type = 1, - KeywordIndex_Class = 2, - KeywordIndex_Enumeration = 3, + KeywordIndex_Primary = 0, + KeywordIndex_Secondary = 1, + KeywordIndex_Tertiary = 2, + KeywordIndex_Type = 3, }; -//KeywordIndex--Autogenerated -- end of section automatically generated -enum class KeywordType { - None = SCE_DART_DEFAULT, - Label = SCE_DART_LABEL, - Class = 
SCE_DART_CLASS, - Enum = SCE_DART_ENUM, - Return = 0x40, - While = 0x41, +const char *const dartWordListDesc[] = { + "Primary keywords", + "Secondary keywords", + "Tertiary keywords", + "Global type definitions", + nullptr }; -static_assert(DefaultNestedStateBaseStyle + 1 == SCE_DART_STRING_SQ); -static_assert(DefaultNestedStateBaseStyle + 2 == SCE_DART_STRING_DQ); -static_assert(DefaultNestedStateBaseStyle + 3 == SCE_DART_TRIPLE_STRING_SQ); -static_assert(DefaultNestedStateBaseStyle + 4 == SCE_DART_TRIPLE_STRING_DQ); - constexpr bool IsDartIdentifierStart(int ch) noexcept { return IsIdentifierStart(ch) || ch == '$'; } @@ -90,33 +152,59 @@ } constexpr bool IsSpaceEquiv(int state) noexcept { - return state <= SCE_DART_TASKMARKER; + return state == SCE_DART_DEFAULT || + state == SCE_DART_COMMENTLINE || + state == SCE_DART_COMMENTLINEDOC || + state == SCE_DART_COMMENTBLOCK || + state == SCE_DART_COMMENTBLOCKDOC; } constexpr bool IsTripleString(int state) noexcept { - return ((state - SCE_DART_STRING_SQ) & 3) > 1; + return state == SCE_DART_TRIPLE_STRING_SQ || + state == SCE_DART_TRIPLE_STRING_DQ || + state == SCE_DART_TRIPLE_RAWSTRING_SQ || + state == SCE_DART_TRIPLE_RAWSTRING_DQ; +} + +constexpr bool IsDoubleQuoted(int state) noexcept { + return state == SCE_DART_STRING_DQ || + state == SCE_DART_RAWSTRING_DQ || + state == SCE_DART_TRIPLE_STRING_DQ || + state == SCE_DART_TRIPLE_RAWSTRING_DQ; +} + +constexpr bool IsRaw(int state) noexcept { + return state == SCE_DART_RAWSTRING_SQ || + state == SCE_DART_RAWSTRING_DQ || + state == SCE_DART_TRIPLE_RAWSTRING_SQ || + state == SCE_DART_TRIPLE_RAWSTRING_DQ; } constexpr int GetStringQuote(int state) noexcept { - if constexpr (SCE_DART_STRING_SQ & 1) { - return (state & 1) ? '\'' : '\"'; - } else { - return (state & 1) ? '\"' : '\''; - } + return IsDoubleQuoted(state) ? 
'\"' : '\''; +} + +Sci_PositionU LookbackNonWhite(LexAccessor &styler, Sci_PositionU startPos, int &chPrevNonWhite, int &stylePrevNonWhite) noexcept { + do { + --startPos; + const unsigned style = styler.StyleAt(startPos); + if (!IsSpaceEquiv(style)) { + stylePrevNonWhite = style; + chPrevNonWhite = static_cast(styler[startPos]); + break; + } + } while (startPos != 0); + return startPos; } -void ColouriseDartDoc(Sci_PositionU startPos, Sci_Position lengthDoc, int initStyle, LexerWordList keywordLists, Accessor &styler) { +void ColouriseDartDoc(Sci_PositionU startPos, Sci_Position lengthDoc, int initStyle, WordList *keywordLists[], Accessor &styler) { int lineStateLineType = 0; int commentLevel = 0; // nested block comment level - KeywordType kwType = KeywordType::None; - int chBeforeIdentifier = 0; - std::vector nestedState; // string interpolation "${}" int visibleChars = 0; int chBefore = 0; - int visibleCharsBefore = 0; int chPrevNonWhite = 0; EscapeSequence escSeq; @@ -124,10 +212,10 @@ if (sc.currentLine > 0) { int lineState = styler.GetLineState(sc.currentLine - 1); /* - 2: lineStateLineType + 2: lineStateLineType - used by folding 6: commentLevel - 3: nestedState count - 3*4: nestedState + 2: nestedState count + 6*3: nestedState */ commentLevel = (lineState >> 2) & 0x3f; lineState >>= 8; @@ -143,13 +231,14 @@ lineStateLineType = DartLineStateMaskLineComment; } } else if (IsSpaceEquiv(initStyle)) { - LookbackNonWhite(styler, startPos, SCE_DART_TASKMARKER, chPrevNonWhite, initStyle); + LookbackNonWhite(styler, startPos, chPrevNonWhite, initStyle); + chBefore = chPrevNonWhite; } while (sc.More()) { switch (sc.state) { case SCE_DART_OPERATOR: - case SCE_DART_OPERATOR2: + case SCE_DART_OPERATOR_STRING: sc.SetState(SCE_DART_DEFAULT); break; @@ -159,11 +248,11 @@ } break; - case SCE_DART_SIMPLE_IDENTIFIER: case SCE_DART_IDENTIFIER: + case SCE_DART_IDENTIFIER_STRING: case SCE_DART_METADATA: case SCE_DART_SYMBOL_IDENTIFIER: - if (!IsDartIdentifierChar(sc.ch) || 
(sc.ch == '$' && sc.state == SCE_DART_SIMPLE_IDENTIFIER)) { + if (!IsDartIdentifierChar(sc.ch) || (sc.ch == '$' && sc.state == SCE_DART_IDENTIFIER_STRING)) { if (sc.state == SCE_DART_METADATA || sc.state == SCE_DART_SYMBOL_IDENTIFIER) { if (sc.ch == '.') { const int state = sc.state; @@ -175,75 +264,27 @@ char s[128]; sc.GetCurrent(s, sizeof(s)); const int state = sc.state; - if (keywordLists[KeywordIndex_Keyword].InList(s)) { - sc.ChangeState(SCE_DART_WORD); - if (state == SCE_DART_SIMPLE_IDENTIFIER) { - kwType = KeywordType::None; - } else if (StrEqualsAny(s, "import", "part")) { + if (state == SCE_DART_IDENTIFIER_STRING) { + sc.SetState(escSeq.outerState); + continue; + } else if (keywordLists[KeywordIndex_Primary]->InList(s)) { + sc.ChangeState(SCE_DART_KW_PRIMARY); + if (strcmp(s, "import") == 0 || strcmp(s, "part") == 0) { if (visibleChars == sc.LengthCurrent()) { lineStateLineType = DartLineStateMaskImport; } - } else if (StrEqualsAny(s, "class", "extends", "implements", "new", "throw", "with", "as", "is", "on")) { - kwType = KeywordType::Class; - } else if (StrEqual(s, "enum")) { - kwType = KeywordType::Enum; - } else if (StrEqualsAny(s, "break", "continue")) { - kwType = KeywordType::Label; - } else if (StrEqualsAny(s, "return", "await", "yield")) { - kwType = KeywordType::Return; - } - if (kwType > KeywordType::None && kwType < KeywordType::Return) { - const int chNext = sc.GetLineNextChar(); - if (!IsDartIdentifierStart(chNext)) { - kwType = KeywordType::None; - } } - } else if (keywordLists[KeywordIndex_Type].InList(s)) { - sc.ChangeState(SCE_DART_WORD2); - } else if (keywordLists[KeywordIndex_Class].InList(s)) { - sc.ChangeState(SCE_DART_CLASS); - } else if (keywordLists[KeywordIndex_Enumeration].InList(s)) { - sc.ChangeState(SCE_DART_ENUM); + } else if (keywordLists[KeywordIndex_Secondary]->InList(s)) { + sc.ChangeState(SCE_DART_KW_SECONDARY); + } else if (keywordLists[KeywordIndex_Tertiary]->InList(s)) { + sc.ChangeState(SCE_DART_KW_TERTIARY); + } 
else if (keywordLists[KeywordIndex_Type]->InList(s)) { + sc.ChangeState(SCE_DART_KW_TYPE); } else if (state == SCE_DART_IDENTIFIER && sc.ch == ':') { if (chBefore == ',' || chBefore == '{' || chBefore == '(') { sc.ChangeState(SCE_DART_KEY); // map key or named parameter - } else if (IsJumpLabelPrevChar(chBefore)) { - sc.ChangeState(SCE_DART_LABEL); - } - } else if (state == SCE_DART_IDENTIFIER && sc.ch != '.') { - if (kwType > KeywordType::None && kwType < KeywordType::Return) { - sc.ChangeState(static_cast(kwType)); - } else { - const int chNext = sc.GetLineNextChar(sc.ch == '?'); - if (chNext == '(') { - // type method() - // type[] method() - // type method() - if (kwType != KeywordType::Return && (IsDartIdentifierChar(chBefore) || chBefore == ']')) { - sc.ChangeState(SCE_DART_FUNCTION_DEFINITION); - } else { - sc.ChangeState(SCE_DART_FUNCTION); - } - } else if ((chBeforeIdentifier == '<' && (chNext == '>' || chNext == '<')) - || IsDartIdentifierStart(chNext)) { - // type - // type - // type> - // type - // class type implements interface, interface {} - // type identifier - // type? 
identifier - sc.ChangeState(SCE_DART_CLASS); - } } } - if (sc.state != SCE_DART_WORD && sc.ch != '.') { - kwType = KeywordType::None; - } - if (state == SCE_DART_SIMPLE_IDENTIFIER) { - sc.SetState(escSeq.outerState); - continue; - } } sc.SetState(SCE_DART_DEFAULT); @@ -260,8 +301,6 @@ case SCE_DART_COMMENTLINEDOC: if (sc.atLineStart) { sc.SetState(SCE_DART_DEFAULT); - } else { - HighlightTaskMarker(sc, visibleChars, visibleCharsBefore, SCE_DART_TASKMARKER); } break; @@ -276,8 +315,6 @@ } else if (sc.Match('/', '*')) { sc.Forward(); ++commentLevel; - } else if (HighlightTaskMarker(sc, visibleChars, visibleCharsBefore, SCE_DART_TASKMARKER)) { - continue; } break; @@ -291,7 +328,7 @@ case SCE_DART_TRIPLE_RAWSTRING_DQ: if (sc.atLineStart && !IsTripleString(sc.state)) { sc.SetState(SCE_DART_DEFAULT); - } else if (sc.ch == '\\' && sc.state < SCE_DART_RAWSTRING_SQ) { + } else if (sc.ch == '\\' && !IsRaw(sc.state)) { if (escSeq.resetEscapeState(sc.state, sc.chNext)) { sc.SetState(SCE_DART_ESCAPECHAR); sc.Forward(); @@ -301,29 +338,24 @@ sc.Forward(); } } - } else if (sc.ch == '$' && sc.state < SCE_DART_RAWSTRING_SQ) { + } else if (sc.ch == '$' && !IsRaw(sc.state)) { escSeq.outerState = sc.state; - sc.SetState(SCE_DART_OPERATOR2); + sc.SetState(SCE_DART_OPERATOR_STRING); sc.Forward(); if (sc.ch == '{') { nestedState.push_back(escSeq.outerState); } else if (sc.ch != '$' && IsDartIdentifierStart(sc.ch)) { - sc.SetState(SCE_DART_SIMPLE_IDENTIFIER); + sc.SetState(SCE_DART_IDENTIFIER_STRING); } else { // error sc.SetState(escSeq.outerState); continue; } - } else if (sc.ch == GetStringQuote(sc.state) && (!IsTripleString(sc.state) || sc.MatchNext())) { + } else if (sc.ch == GetStringQuote(sc.state) && + (!IsTripleString(sc.state) || (sc.Match(IsDoubleQuoted(sc.state) ? 
R"(""")" : R"(''')")))) { if (IsTripleString(sc.state)) { sc.Forward(2); } sc.Forward(); - if (sc.state <= SCE_DART_STRING_DQ && (chBefore == ',' || chBefore == '{')) { - const int chNext = sc.GetLineNextChar(); - if (chNext == ':') { - sc.ChangeState(SCE_DART_KEY); - } - } sc.SetState(SCE_DART_DEFAULT); } break; @@ -341,13 +373,15 @@ if (sc.state == SCE_DART_DEFAULT) { if (sc.ch == '/' && (sc.chNext == '/' || sc.chNext == '*')) { - visibleCharsBefore = visibleChars; const int chNext = sc.chNext; sc.SetState((chNext == '/') ? SCE_DART_COMMENTLINE : SCE_DART_COMMENTBLOCK); sc.Forward(2); if (sc.ch == chNext && sc.chNext != chNext) { - static_assert(SCE_DART_COMMENTLINEDOC - SCE_DART_COMMENTLINE == SCE_DART_COMMENTBLOCKDOC - SCE_DART_COMMENTBLOCK); - sc.ChangeState(sc.state + SCE_DART_COMMENTLINEDOC - SCE_DART_COMMENTLINE); + if (sc.state == SCE_DART_COMMENTLINE) { + sc.ChangeState(SCE_DART_COMMENTLINEDOC); + } else { + sc.ChangeState(SCE_DART_COMMENTBLOCKDOC); + } } if (chNext == '/') { if (visibleChars == 0) { @@ -360,19 +394,31 @@ } if (sc.ch == 'r' && (sc.chNext == '\'' || sc.chNext == '"')) { sc.SetState((sc.chNext == '\'') ? SCE_DART_RAWSTRING_SQ : SCE_DART_RAWSTRING_DQ); - sc.Forward(); - if (sc.MatchNext()) { - static_assert(SCE_DART_TRIPLE_RAWSTRING_SQ - SCE_DART_RAWSTRING_SQ == SCE_DART_TRIPLE_RAWSTRING_DQ - SCE_DART_RAWSTRING_DQ); - sc.ChangeState(sc.state + SCE_DART_TRIPLE_RAWSTRING_SQ - SCE_DART_RAWSTRING_SQ); + sc.Forward(2); + if (sc.chPrev == '\'' && sc.Match('\'', '\'')) { + sc.ChangeState(SCE_DART_TRIPLE_RAWSTRING_SQ); + sc.Forward(2); + } else if (sc.chPrev == '"' && sc.Match('"', '"')) { + sc.ChangeState(SCE_DART_TRIPLE_RAWSTRING_DQ); sc.Forward(2); } - } else if (sc.ch == '\'' || sc.ch == '"') { - sc.SetState((sc.ch == '\'') ? 
SCE_DART_STRING_SQ : SCE_DART_STRING_DQ); - chBefore = chPrevNonWhite; - if (sc.MatchNext()) { - static_assert(SCE_DART_TRIPLE_STRING_SQ - SCE_DART_STRING_SQ == SCE_DART_TRIPLE_STRING_DQ - SCE_DART_STRING_DQ); - sc.ChangeState(sc.state + SCE_DART_TRIPLE_STRING_DQ - SCE_DART_STRING_DQ); + continue; + } + if (sc.ch == '"') { + if (sc.Match(R"(""")")) { + sc.SetState(SCE_DART_TRIPLE_STRING_DQ); + sc.Forward(2); + } else { + chBefore = chPrevNonWhite; + sc.SetState(SCE_DART_STRING_DQ); + } + } else if (sc.ch == '\'') { + if (sc.Match(R"(''')")) { + sc.SetState(SCE_DART_TRIPLE_STRING_SQ); sc.Forward(2); + } else { + chBefore = chPrevNonWhite; + sc.SetState(SCE_DART_STRING_SQ); } } else if (IsNumberStart(sc.ch, sc.chNext)) { sc.SetState(SCE_DART_NUMBER); @@ -380,22 +426,22 @@ sc.SetState((sc.ch == '@') ? SCE_DART_METADATA : SCE_DART_SYMBOL_IDENTIFIER); } else if (IsDartIdentifierStart(sc.ch)) { chBefore = chPrevNonWhite; - if (chPrevNonWhite != '.') { - chBeforeIdentifier = chPrevNonWhite; - } sc.SetState(SCE_DART_IDENTIFIER); } else if (sc.ch == '#' && IsDeclarableOperator(sc.chNext)) { sc.SetState(SCE_DART_SYMBOL_OPERATOR); } else if (IsAGraphic(sc.ch)) { sc.SetState(SCE_DART_OPERATOR); if (!nestedState.empty()) { - sc.ChangeState(SCE_DART_OPERATOR2); if (sc.ch == '{') { nestedState.push_back(SCE_DART_DEFAULT); } else if (sc.ch == '}') { - const int outerState = TakeAndPop(nestedState); - sc.ForwardSetState(outerState); - continue; + const int outerState = nestedState.back(); + nestedState.pop_back(); + if (outerState != SCE_DART_DEFAULT) { + sc.ChangeState(SCE_DART_OPERATOR_STRING); + sc.ForwardSetState(outerState); + continue; + } } } } @@ -415,8 +461,6 @@ styler.SetLineState(sc.currentLine, lineState); lineStateLineType = 0; visibleChars = 0; - visibleCharsBefore = 0; - kwType = KeywordType::None; } sc.Forward(); } @@ -433,29 +477,24 @@ } }; -void FoldDartDoc(Sci_PositionU startPos, Sci_Position lengthDoc, int initStyle, LexerWordList /*keywordLists*/, Accessor 
&styler) { +void FoldDartDoc(Sci_PositionU startPos, Sci_Position lengthDoc, int initStyle, WordList *[] /*keywordLists*/, Accessor &styler) { const Sci_PositionU endPos = startPos + lengthDoc; - Sci_Line lineCurrent = styler.GetLine(startPos); + Sci_Position lineCurrent = styler.GetLine(startPos); FoldLineState foldPrev(0); int levelCurrent = SC_FOLDLEVELBASE; if (lineCurrent > 0) { levelCurrent = styler.LevelAt(lineCurrent - 1) >> 16; foldPrev = FoldLineState(styler.GetLineState(lineCurrent - 1)); - const Sci_PositionU bracePos = CheckBraceOnNextLine(styler, lineCurrent - 1, SCE_DART_OPERATOR, SCE_DART_TASKMARKER); - if (bracePos) { - startPos = bracePos + 1; // skip the brace - } } int levelNext = levelCurrent; FoldLineState foldCurrent(styler.GetLineState(lineCurrent)); Sci_PositionU lineStartNext = styler.LineStart(lineCurrent + 1); - lineStartNext = sci::min(lineStartNext, endPos); + lineStartNext = std::min(lineStartNext, endPos); char chNext = styler[startPos]; int styleNext = styler.StyleAt(startPos); int style = initStyle; - int visibleChars = 0; while (startPos < endPos) { const char ch = chNext; @@ -489,7 +528,7 @@ break; case SCE_DART_OPERATOR: - case SCE_DART_OPERATOR2: + case SCE_DART_OPERATOR_STRING: if (ch == '{' || ch == '[' || ch == '(') { levelNext++; } else if (ch == '}' || ch == ']' || ch == ')') { @@ -498,25 +537,13 @@ break; } - if (visibleChars == 0 && !IsSpaceEquiv(style)) { - ++visibleChars; - } if (startPos == lineStartNext) { const FoldLineState foldNext(styler.GetLineState(lineCurrent + 1)); - levelNext = sci::max(levelNext, SC_FOLDLEVELBASE); + levelNext = std::max(levelNext, SC_FOLDLEVELBASE); if (foldCurrent.lineComment) { levelNext += foldNext.lineComment - foldPrev.lineComment; } else if (foldCurrent.packageImport) { levelNext += foldNext.packageImport - foldPrev.packageImport; - } else if (visibleChars) { - const Sci_PositionU bracePos = CheckBraceOnNextLine(styler, lineCurrent, SCE_DART_OPERATOR, SCE_DART_TASKMARKER); - if 
(bracePos) { - levelNext++; - startPos = bracePos + 1; // skip the brace - style = SCE_DART_OPERATOR; - chNext = styler[startPos]; - styleNext = styler.StyleAt(startPos); - } } const int levelUse = levelCurrent; @@ -528,15 +555,14 @@ lineCurrent++; lineStartNext = styler.LineStart(lineCurrent + 1); - lineStartNext = sci::min(lineStartNext, endPos); + lineStartNext = std::min(lineStartNext, endPos); levelCurrent = levelNext; foldPrev = foldCurrent; foldCurrent = foldNext; - visibleChars = 0; } } } -} +} // unnamed namespace end -extern const LexerModule lmDart(SCLEX_DART, ColouriseDartDoc, "dart", FoldDartDoc); +extern const LexerModule lmDart(SCLEX_DART, ColouriseDartDoc, "dart", FoldDartDoc, dartWordListDesc); ```

Please let me know your opinion and what you think should be modified. For more discussion and clarification of the above points, see https://github.com/zufuliu/notepad4/issues/806#issuecomment-2286862292 and below.

If the troff lexer gets merged first, the lexer ID in this patch will have to be modified.

Fixes #58.

zufuliu commented 2 months ago

You can replace the ugly PackLineState() and UnpackLineState() with my BacktrackToStart() (used by many lexers in Notepad4; the Bash, Perl and Ruby lexers in Lexilla also use backtracking), which makes the code more readable. The following is a diff against my current LexDart.cxx. The interpolatingStack code is taken from issue #94.

diff --git a/scintilla/lexers/LexDart.cxx b/scintilla/lexers/LexDart.cxx
index 087350a7..70a7c096 100644
--- a/scintilla/lexers/LexDart.cxx
+++ b/scintilla/lexers/LexDart.cxx
@@ -20,7 +20,6 @@
 #include "CharacterSet.h"
 #include "StringUtils.h"
 #include "LexerModule.h"
-#include "LexerUtils.h"

 using namespace Lexilla;

@@ -50,6 +49,7 @@ struct EscapeSequence {
 enum {
    DartLineStateMaskLineComment = 1,   // line comment
    DartLineStateMaskImport = (1 << 1), // import
+   DartLineStateMaskInterpolation = (1 << 2),
 };

 //KeywordIndex++Autogenerated -- start of section automatically generated
@@ -70,10 +70,32 @@ enum class KeywordType {
    While = 0x41,
 };

-static_assert(DefaultNestedStateBaseStyle + 1 == SCE_DART_STRING_SQ);
-static_assert(DefaultNestedStateBaseStyle + 2 == SCE_DART_STRING_DQ);
-static_assert(DefaultNestedStateBaseStyle + 3 == SCE_DART_TRIPLE_STRING_SQ);
-static_assert(DefaultNestedStateBaseStyle + 4 == SCE_DART_TRIPLE_STRING_DQ);
+// string interpolating state
+struct InterpolatingState {
+   int state;
+   int braceCount;
+};
+
+void BacktrackToStart(const LexAccessor &styler, int stateMask, Sci_PositionU &startPos, Sci_Position &lengthDoc, int &initStyle) noexcept {
+   const Sci_Line currentLine = styler.GetLine(startPos);
+   if (currentLine != 0) {
+       Sci_Line line = currentLine - 1;
+       int lineState = styler.GetLineState(line);
+       while ((lineState & stateMask) != 0 && line != 0) {
+           --line;
+           lineState = styler.GetLineState(line);
+       }
+       if ((lineState & stateMask) == 0) {
+           ++line;
+       }
+       if (line != currentLine) {
+           const Sci_PositionU endPos = startPos + lengthDoc;
+           startPos = (line == 0)? 0 : styler.LineStart(line);
+           lengthDoc = endPos - startPos;
+           initStyle = (startPos == 0)? 0 : styler.StyleAt(startPos - 1);
+       }
+   }
+}

 constexpr bool IsDartIdentifierStart(int ch) noexcept {
    return IsIdentifierStart(ch) || ch == '$';
@@ -112,7 +134,7 @@ void ColouriseDartDoc(Sci_PositionU startPos, Sci_Position lengthDoc, int initSt
    KeywordType kwType = KeywordType::None;
    int chBeforeIdentifier = 0;

-   std::vector<int> nestedState; // string interpolation "${}"
+   std::vector<InterpolatingState> interpolatingStack;

    int visibleChars = 0;
    int chBefore = 0;
@@ -120,20 +142,15 @@ void ColouriseDartDoc(Sci_PositionU startPos, Sci_Position lengthDoc, int initSt
    int chPrevNonWhite = 0;
    EscapeSequence escSeq;

+   if (startPos != 0) {
+       // backtrack to the line starts expression inside interpolated string.
+       BacktrackToStart(styler, DartLineStateMaskInterpolation, startPos, lengthDoc, initStyle);
+   }
+
    StyleContext sc(startPos, lengthDoc, initStyle, styler);
    if (sc.currentLine > 0) {
-       int lineState = styler.GetLineState(sc.currentLine - 1);
-       /*
-       2: lineStateLineType
-       6: commentLevel
-       3: nestedState count
-       3*4: nestedState
-       */
-       commentLevel = (lineState >> 2) & 0x3f;
-       lineState >>= 8;
-       if (lineState) {
-           UnpackLineState(lineState, nestedState);
-       }
+       const int lineState = styler.GetLineState(sc.currentLine - 1);
+       commentLevel = lineState >> 4;
    }
    if (startPos == 0) {
        if (sc.Match('#', '!')) {
@@ -306,7 +323,7 @@ void ColouriseDartDoc(Sci_PositionU startPos, Sci_Position lengthDoc, int initSt
                sc.SetState(SCE_DART_OPERATOR2);
                sc.Forward();
                if (sc.ch == '{') {
-                   nestedState.push_back(escSeq.outerState);
+                   interpolatingStack.push_back({sc.state, 1});
                } else if (sc.ch != '$' && IsDartIdentifierStart(sc.ch)) {
                    sc.SetState(SCE_DART_SIMPLE_IDENTIFIER);
                } else { // error
@@ -388,14 +405,18 @@ void ColouriseDartDoc(Sci_PositionU startPos, Sci_Position lengthDoc, int initSt
                sc.SetState(SCE_DART_SYMBOL_OPERATOR);
            } else if (IsAGraphic(sc.ch)) {
                sc.SetState(SCE_DART_OPERATOR);
-               if (!nestedState.empty()) {
-                   sc.ChangeState(SCE_DART_OPERATOR2);
+               if (!interpolatingStack.empty() && AnyOf(sc.ch, '{', '}')) {
+                   InterpolatingState &current = interpolatingStack.back();
                    if (sc.ch == '{') {
-                       nestedState.push_back(SCE_DART_DEFAULT);
-                   } else if (sc.ch == '}') {
-                       const int outerState = TakeAndPop(nestedState);
-                       sc.ForwardSetState(outerState);
-                       continue;
+                       current.braceCount += 1;
+                   } else {
+                       current.braceCount -= 1;
+                       if (current.braceCount == 0) {
+                           sc.ChangeState(SCE_DART_OPERATOR2);
+                           sc.ForwardSetState(current.state);
+                           interpolatingStack.pop_back();
+                           continue;
+                       }
                    }
                }
            }
@@ -408,9 +429,9 @@ void ColouriseDartDoc(Sci_PositionU startPos, Sci_Position lengthDoc, int initSt
            }
        }
        if (sc.atLineEnd) {
-           int lineState = (commentLevel << 2) | lineStateLineType;
-           if (!nestedState.empty()) {
-               lineState |= PackLineState(nestedState) << 8;
+           int lineState = (commentLevel << 4) | lineStateLineType;
+           if (!interpolatingStack.empty()) {
+               lineState |= DartLineStateMaskInterpolation;
            }
            styler.SetLineState(sc.currentLine, lineState);
            lineStateLineType = 0;
techee commented 2 months ago

You can replace the ugly PackLineState() and UnpackLineState() with my BacktrackToStart() (used by many lexers in Notepad4; the Bash, Perl and Ruby lexers in Lexilla also use backtracking), which makes the code more readable. The following is a diff against my current LexDart.cxx. The interpolatingStack code is taken from issue https://github.com/ScintillaOrg/lexilla/issues/94.

OK, thanks, will have a look at it.

techee commented 2 months ago

@zufuliu Looks good in principle, but I expect some problem in the implementation - for

var s = """This is also a
${foo(
"$bar"
)}
multi-line string.""";

which I hope is valid Dart code testing this case, I get

Screenshot 2024-08-21 at 20 36 44

where the last line isn't colorized as a string while with the previous version it was. I haven't investigated more yet (also, it's possible I made some mistake when porting your patch).

nyamatongwe commented 2 months ago

For portability, #include <algorithm> for std::min and std::max.

zufuliu commented 2 months ago

the last line isn't colorized as a string while with the previous version it was.

It's my copy paste error:

- interpolatingStack.push_back({sc.state, 1});
+ interpolatingStack.push_back({escSeq.outerState, 1});

It might be better to rename IsDeclarableOperator to IsDefinableOperator; the latter is the term currently used in the doc at https://github.com/dart-lang/sdk/blob/main/sdk/lib/core/symbol.dart#L31

techee commented 2 months ago

It's my copy paste error

@zufuliu Wonderful, thanks! (I'm getting really lazy with you around :-)

I've also added the code above to the unit test.

It might be better to rename IsDeclarableOperator to IsDefinableOperator; the latter is the term currently used in the doc at https://github.com/dart-lang/sdk/blob/main/sdk/lib/core/symbol.dart#L31

Done.

For portability, #include <algorithm> for std::min and std::max.

@nyamatongwe Done.

nyamatongwe commented 2 months ago

Some of the Notepad4 features are reasonable depending on implementation clarity. I haven't added some of the library functions from Notepad4 as their behaviour seemed too complex to explain to new lexer authors. Even including more functions and methods can make it more difficult to find the methods they really need. Many lexer authors have limited C++ experience.

folding has been modified to use simple folding like the rest of Scintilla lexers and not folding of the previous line based on brace presence on the next line like Notepad4

This may be an OK feature to add if it can be done cleanly since some people like this brace style and folding behaviour. However, it seems the Dart convention is braces on the keyword line so this won't matter.

"semi-syntactic" coloring of Notepad4 which colors words following a keyword (such as coloring Foo in "class Foo") has been removed as this is not performed in other Scintilla lexers

Some lexers have some of these features. LexPython knows that class and def are followed by class names and function names.

highlighting of tasks such as TODOs in comments has been removed as it isn't present in other Scintilla lexers

Again, some lexers like LexCPP do this.

It is not worthwhile making every lexer have all the features (and it is a lot of work) but it's OK to include these if the lexer author wants.

nyamatongwe commented 2 months ago
LexDart.cxx(159): warning C26447: The function is declared 'noexcept'
but calls function 'GetLine()' which may throw exceptions (f.6).
...
LexDart.cxx(185): warning C26447: The function is declared 'noexcept'
but calls function 'operator[]()' which may throw exceptions (f.6).

As there are non-ASCII characters "···" in the file, this should be communicated to SciTE (and thus Lexilla) with an initial comment:

// coding:utf-8
techee commented 2 months ago

This may be an OK feature to add if it can be done cleanly since some people like this brace style and folding behaviour. However, it seems the Dart convention is braces on the keyword line so this won't matter.

It's a matter of adding this function

https://github.com/zufuliu/notepad4/blob/fed47535dd5a2934fe17bc909c2087b9d50ac729/scintilla/lexlib/LexAccessor.cxx#L268

I eventually didn't include it because I'd have to study which of the cases are applicable to Dart and which are for other lexers and I think if such a feature is added, it should be applied to all the lexers for behavior consistency.

Some lexers have some of these features. LexPython knows that class and def are followed by class names and function names.

If desired, I can add it back - I think for function/class definitions it might be good. But the Notepad4 lexer also colorizes every function call (to be precise, anything followed by ()), which I found "too colorful" for my taste (of course one can map it to the default color, but I think other lexers really don't do such a thing).

Again, some lexers like LexCPP do this.

It would mean adding

https://github.com/zufuliu/notepad4/blob/fed47535dd5a2934fe17bc909c2087b9d50ac729/scintilla/lexlib/StyleContext.cxx#L126

(plus the two small functions above it) to both the Dart and Zig lexers which would mean some code duplication. Similarly to the folding case, if this feature is added, I believe the function should be usable by all the lexers.

techee commented 2 months ago

As there are non-ASCII characters "···" in the file, this should be communicated to SciTE (and thus Lexilla) with an initial comment:

Done. I guess it should be also added to the already merged TOML lexer test, right?

nyamatongwe commented 2 months ago

It would mean adding ... HighlightTaskMarker

That has a hard-coded list of task markers which seems very limited compared to the cpp lexer's user chosen task markers.

nyamatongwe commented 2 months ago

There were some potential character encoding problems but it looks like, while Dart source encoding is a bit under-specified, non-ASCII characters should only be in string literals and comments. That is, according to this issue.

nyamatongwe commented 2 months ago

There are inconsistent types for styles, mostly storing into an int (once unsigned) from a char value from LexAccessor::StyleAt. If int is to be the type for styles, then LexAccessor::StyleIndexAt should be called.

nyamatongwe commented 2 months ago

I guess it should be also added to the already merged TOML lexer test, right?

OK, 6ac5d7e.

zufuliu commented 2 months ago

Changes similar to the following can fix code folding for triple-quoted strings when an escape sequence or interpolation occurs at line start. A more complex change is required when SCE_DART_IDENTIFIER_STRING is classified (e.g. $this, $ClassName), so the bug remains in Notepad4.

case SCE_DART_TRIPLE_RAWSTRING_SQ:
case SCE_DART_TRIPLE_RAWSTRING_DQ:
case SCE_DART_TRIPLE_STRING_SQ:
case SCE_DART_TRIPLE_STRING_DQ:
    if (style != stylePrev && !AnyOf(stylePrev, SCE_DART_ESCAPECHAR, SCE_DART_OPERATOR_STRING, SCE_DART_IDENTIFIER_STRING)) {
        levelNext++;
    }
    if (style != styleNext && !AnyOf(stylePrev, SCE_DART_ESCAPECHAR, SCE_DART_OPERATOR_STRING, SCE_DART_IDENTIFIER_STRING)) {
        levelNext--;
    }
    break;
var s1 = """multi-line
\n
strings
""";

var s2 = """multi-line
$x
strings
""";

var s3 = """multi-line
${x}
strings
""";

The original code folding code comes from issue #132, so Ruby still has the same bug, but it's hard to fix.

%W(
#{1 + 1}
)
techee commented 2 months ago

There are inconsistent types for styles, mostly storing into an int (once unsigned) from a char value from LexAccessor::StyleAt. If int is to be the type for styles, then LexAccessor::StyleIndexAt should be called.

Fixed.

@nyamatongwe Should I squash the commits and rebase the branch on top of master to resolve the conflicts caused by previous troff lexer merge?

techee commented 2 months ago

Changes similar to the following can fix code folding for triple-quoted strings when an escape sequence or interpolation occurs at line start. A more complex change is required when SCE_DART_IDENTIFIER_STRING is classified (e.g. $this, $ClassName), so the bug remains in Notepad4.

That still produces incorrect output when you e.g. insert some text lines before the escape sequence:

var s1 = """multi-line
foo
bar
baz
\n
strings
""";

I think without some complex logic correctly taking care of interpolation nesting you'll always run into problems like this. So I kept the original version for simplicity.

nyamatongwe commented 2 months ago

Should I squash the commits and rebase the branch on top of master ...

While that can make things easier, I'll almost always make at least a couple more changes - a change log entry and add warning suppressions to cppcheck.suppress. So the pull request doesn't get applied directly and I commonly use WinMerge to apply a freshly checked out copy of the pull request against my main development directory.

zufuliu commented 2 months ago

That still produces incorrect output when you e.g. insert some text lines before the escape sequence:

- if (style != styleNext && !AnyOf(stylePrev,
+ if (style != styleNext && !AnyOf(styleNext,
case SCE_DART_TRIPLE_RAWSTRING_SQ:
case SCE_DART_TRIPLE_RAWSTRING_DQ:
case SCE_DART_TRIPLE_STRING_SQ:
case SCE_DART_TRIPLE_STRING_DQ:
    if (style != stylePrev && !AnyOf(stylePrev, SCE_DART_ESCAPECHAR, SCE_DART_OPERATOR_STRING, SCE_DART_IDENTIFIER_STRING)) {
        levelNext++;
    }
    if (style != styleNext && !AnyOf(styleNext, SCE_DART_ESCAPECHAR, SCE_DART_OPERATOR_STRING, SCE_DART_IDENTIFIER_STRING)) {
        levelNext--;
    }
    break;
techee commented 2 months ago

@zufuliu Thanks! I've just added a commit with this version.