siefkenj / unified-latex

Utilities for parsing and manipulating LaTeX ASTs with the Unified.js framework
MIT License
85 stars 20 forks source link

minted and lstlisting environment should be parsed as `verbatim` #77

Closed pddg closed 6 months ago

pddg commented 6 months ago

Environment

Background

minted and lstlisting are popular LaTeX environments for displaying source code. Their contents may include characters that are special to LaTeX, so they should not be parsed as normal LaTeX.

https://www.overleaf.com/learn/latex/Code_Highlighting_with_minted https://www.overleaf.com/learn/latex/Code_listing#Using_listings_to_highlight_code

In other words, they are verbatim environments.

Expected behavior

unified-latex parses them as verbatim environments.

Actual behavior

unified-latex parses them normally. The embedded source code is parsed as ordinary LaTeX whenever its syntax happens to match LaTeX syntax.

I use the following script to show the AST:

#!/usr/bin/env node

import * as fs from "fs";
import { getParser } from "@unified-latex/unified-latex-util-parse";

const content = fs.readFileSync("sample.tex");
const parser = getParser();
const parsedAst = parser.parse(content.toString());

console.log(JSON.stringify(parsedAst, undefined, "  "));

And sample.tex is as follows:

\begin{minted}{latex}
    \section{section title}

    this is an embedded source.
\end{minted}
AST of sample.tex ```json { "type": "root", "content": [ { "type": "environment", "env": "minted", "content": [ { "type": "group", "content": [ { "type": "string", "content": "latex", "position": { "start": { "offset": 15, "line": 1, "column": 16 }, "end": { "offset": 20, "line": 1, "column": 21 } } } ], "position": { "start": { "offset": 14, "line": 1, "column": 15 }, "end": { "offset": 21, "line": 1, "column": 22 } } }, { "type": "whitespace", "position": { "start": { "offset": 21, "line": 1, "column": 22 }, "end": { "offset": 26, "line": 2, "column": 5 } } }, { "type": "macro", "content": "section", "position": { "start": { "offset": 26, "line": 2, "column": 5 }, "end": { "offset": 34, "line": 2, "column": 13 } }, "_renderInfo": { "breakAround": true, "namedArguments": [ "starred", null, "tocTitle", "title" ] }, "args": [ { "type": "argument", "content": [], "openMark": "", "closeMark": "" }, { "type": "argument", "content": [], "openMark": "", "closeMark": "" }, { "type": "argument", "content": [], "openMark": "", "closeMark": "" }, { "type": "argument", "content": [ { "type": "string", "content": "section", "position": { "start": { "offset": 35, "line": 2, "column": 14 }, "end": { "offset": 42, "line": 2, "column": 21 } } }, { "type": "whitespace", "position": { "start": { "offset": 42, "line": 2, "column": 21 }, "end": { "offset": 43, "line": 2, "column": 22 } } }, { "type": "string", "content": "title", "position": { "start": { "offset": 43, "line": 2, "column": 22 }, "end": { "offset": 48, "line": 2, "column": 27 } } } ], "openMark": "{", "closeMark": "}" } ] }, { "type": "parbreak", "position": { "start": { "offset": 49, "line": 2, "column": 28 }, "end": { "offset": 55, "line": 4, "column": 5 } } }, { "type": "string", "content": "this", "position": { "start": { "offset": 55, "line": 4, "column": 5 }, "end": { "offset": 59, "line": 4, "column": 9 } } }, { "type": "whitespace", "position": { "start": { "offset": 59, "line": 4, "column": 9 }, "end": { 
"offset": 60, "line": 4, "column": 10 } } }, { "type": "string", "content": "is", "position": { "start": { "offset": 60, "line": 4, "column": 10 }, "end": { "offset": 62, "line": 4, "column": 12 } } }, { "type": "whitespace", "position": { "start": { "offset": 62, "line": 4, "column": 12 }, "end": { "offset": 63, "line": 4, "column": 13 } } }, { "type": "string", "content": "an", "position": { "start": { "offset": 63, "line": 4, "column": 13 }, "end": { "offset": 65, "line": 4, "column": 15 } } }, { "type": "whitespace", "position": { "start": { "offset": 65, "line": 4, "column": 15 }, "end": { "offset": 66, "line": 4, "column": 16 } } }, { "type": "string", "content": "embedded", "position": { "start": { "offset": 66, "line": 4, "column": 16 }, "end": { "offset": 74, "line": 4, "column": 24 } } }, { "type": "whitespace", "position": { "start": { "offset": 74, "line": 4, "column": 24 }, "end": { "offset": 75, "line": 4, "column": 25 } } }, { "type": "string", "content": "source", "position": { "start": { "offset": 75, "line": 4, "column": 25 }, "end": { "offset": 81, "line": 4, "column": 31 } } }, { "type": "string", "content": ".", "position": { "start": { "offset": 81, "line": 4, "column": 31 }, "end": { "offset": 82, "line": 4, "column": 32 } } } ], "position": { "start": { "offset": 0, "line": 1, "column": 1 }, "end": { "offset": 95, "line": 5, "column": 13 } } } ], "position": { "start": { "offset": 0, "line": 1, "column": 1 }, "end": { "offset": 96, "line": 6, "column": 1 } } } ```

#37 seems to have introduced support for the corresponding macros, but not for the environments.

siefkenj commented 6 months ago

Hmm. That is very strange. Are you using a current version or are you using the playground (I don't recall when I last updated the playground...)?

minted support was added a while ago to the grammar, so it should be working as a verbatim environment:

https://github.com/siefkenj/unified-latex/blob/9c7dc5e5d6f0310ad7366038fd2e341b20262b6c/packages/unified-latex-util-pegjs/grammars/latex.pegjs#L219

pddg commented 6 months ago

Oh. I will try to prepare a reproduction procedure.

pddg commented 6 months ago

@siefkenj I am using v1.6.0, which seems to be the latest. https://www.npmjs.com/package/@unified-latex/unified-latex-util-parse

I can reproduce it in my environment with the following procedure:

mkdir repro-issue77
cd repro-issue77

cat << 'EOF' > package.json
{
  "name": "unified-latex-issue77",
  "version": "1.0.0",
  "type": "module",
  "scripts": {},
  "bin": {
    "tex2ast": "./tex2ast.js"
  },
  "dependencies": {
    "@unified-latex/unified-latex-util-parse": "1.6.0"
  }
}
EOF

cat << 'EOF' > tex2ast.js
#!/usr/bin/env node

import * as fs from "fs";
import { getParser } from "@unified-latex/unified-latex-util-parse";

const content = fs.readFileSync("sample.tex");
const parser = getParser();
const parsedAst = parser.parse(content.toString());

console.log(JSON.stringify(parsedAst, undefined, "  "));
EOF

cat << 'EOF' > sample.tex
\begin{minted}{latex}
    \section{section title}

    this is an embedded source.
\end{minted}
EOF

# I use v20.11.0
node -v

npm install
npx tex2ast

I may have misunderstood how to initialize the parser. If so, please point it out.

siefkenj commented 6 months ago

It appears you're right! The code is in the grammar, but it was never tested. I will look into it.

siefkenj commented 6 months ago

Should be fixed in v1.6.1

FelixZY commented 3 months ago

@siefkenj I'm still having problems in 1.7.1 using prettier-plugin-latex:

Example document:

\begin{minted}{csharp}
using System;

public class Program
{
  int a = 3;
  bool b = true;
  public static void Main()
  {
    // Write "Hello World" to the console
    Console.WriteLine("Hello World");
  }
}
\end{minted}

Expected

No change

Actual

\begin{minted}
    {csharp} using System;

    public class Program { int a = 3; bool b = true; public static void Main() { // Write "Hello World" to the console Console.WriteLine("Hello World"); } }
\end{minted}

Note: I have explicitly locked @unified-latex/unified-latex-ctan and @unified-latex/unified-latex-prettier to 1.7.1 (as well as 1.4.0, 1.6.0 and 1.6.1) to work around #96. All of these versions seem affected and do not produce the expected output.