jgm / pandoc

Universal markup converter
https://pandoc.org
Other
34.39k stars 3.37k forks source link

Missing comments after converting pandoc json to docx file #8189

Open dawidreedsy opened 2 years ago

dawidreedsy commented 2 years ago

Explain the problem. I have this docx file that contains a comment and a reply to that comment, like this: Screenshot 2022-07-18 at 18 07 45

before.docx Using this command, I convert it to pandoc JSON:

pandoc before.docx --track-changes=all -f docx -t json > pandoc-result.json

I got this JSON:

{
  "pandoc-api-version": [
    1,
    22,
    1
  ],
  "meta": {},
  "blocks": [
    {
      "t": "Para",
      "c": [
        {
          "t": "Span",
          "c": [
            [
              "",
              [
                "comment-start"
              ],
              [
                [
                  "id",
                  "0"
                ],
                [
                  "author",
                  "Dawid Kisielewski"
                ],
                [
                  "date",
                  "2022-07-18T15:20:46Z"
                ]
              ]
            ],
            [
              {
                "t": "Str",
                "c": "Comment"
              }
            ]
          ]
        },
        {
          "t": "Span",
          "c": [
            [
              "",
              [
                "comment-start"
              ],
              [
                [
                  "id",
                  "1"
                ],
                [
                  "author",
                  "Dawid Kisielewski"
                ],
                [
                  "date",
                  "2022-07-18T15:20:50Z"
                ]
              ]
            ],
            [
              {
                "t": "Str",
                "c": "Reply"
              }
            ]
          ]
        },
        {
          "t": "Str",
          "c": "Start"
        },
        {
          "t": "Span",
          "c": [
            [
              "",
              [
                "comment-end"
              ],
              [
                [
                  "id",
                  "0"
                ]
              ]
            ],
            [
              {
                "t": "Span",
                "c": [
                  [
                    "",
                    [
                      "comment-end"
                    ],
                    [
                      [
                        "id",
                        "1"
                      ]
                    ]
                  ],
                  []
                ]
              }
            ]
          ]
        }
      ]
    }
  ]
}

As you can see, the JSON looks OK: it contains both the comment and the reply. But when I convert it back to a docx file using this command:

pandoc middle.json --track-changes=all -f json -t docx -o after.docx

This is the file I got: after.docx

It has only one comment with no reply: Screenshot 2022-07-18 at 18 10 27

I did a little bit of digging and it seems that the word/document.xml inside the docx is not converted properly from the pandoc json format. The original document.xml:

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:sl="http://schemas.openxmlformats.org/schemaLibrary/2006/main" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" xmlns:pic="http://schemas.openxmlformats.org/drawingml/2006/picture" xmlns:c="http://schemas.openxmlformats.org/drawingml/2006/chart" xmlns:lc="http://schemas.openxmlformats.org/drawingml/2006/lockedCanvas" xmlns:dgm="http://schemas.openxmlformats.org/drawingml/2006/diagram" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml">
  <w:body>
    <w:p w:rsidR="00000000" w:rsidDel="00000000" w:rsidP="00000000" w:rsidRDefault="00000000" w:rsidRPr="00000000" w14:paraId="00000001">
      <w:pPr>
        <w:keepNext w:val="0" />
        <w:keepLines w:val="0" />
        <w:pageBreakBefore w:val="0" />
        <w:widowControl w:val="1" />
        <w:pBdr>
          <w:top w:space="0" w:sz="0" w:val="nil" />
          <w:left w:space="0" w:sz="0" w:val="nil" />
          <w:bottom w:space="0" w:sz="0" w:val="nil" />
          <w:right w:space="0" w:sz="0" w:val="nil" />
          <w:between w:space="0" w:sz="0" w:val="nil" />
        </w:pBdr>
        <w:shd w:fill="auto" w:val="clear" />
        <w:spacing w:after="180" w:before="180" w:line="240" w:lineRule="auto" />
        <w:ind w:left="0" w:right="0" w:firstLine="0" />
        <w:jc w:val="left" />
        <w:rPr>
          <w:rFonts w:ascii="Cambria" w:cs="Cambria" w:eastAsia="Cambria" w:hAnsi="Cambria" />
          <w:b w:val="0" />
          <w:i w:val="0" />
          <w:smallCaps w:val="0" />
          <w:strike w:val="0" />
          <w:color w:val="000000" />
          <w:sz w:val="24" />
          <w:szCs w:val="24" />
          <w:u w:val="none" />
          <w:shd w:fill="auto" w:val="clear" />
          <w:vertAlign w:val="baseline" />
        </w:rPr>
      </w:pPr>
      <w:sdt>
        <w:sdtPr>
          <w:tag w:val="goog_rdk_0" />
        </w:sdtPr>
        <w:sdtContent>
          <w:commentRangeStart w:id="0" />
        </w:sdtContent>
      </w:sdt>
      <w:sdt>
        <w:sdtPr>
          <w:tag w:val="goog_rdk_1" />
        </w:sdtPr>
        <w:sdtContent>
          <w:commentRangeStart w:id="1" />
        </w:sdtContent>
      </w:sdt>
      <w:r w:rsidDel="00000000" w:rsidR="00000000" w:rsidRPr="00000000">
        <w:rPr>
          <w:rFonts w:ascii="Cambria" w:cs="Cambria" w:eastAsia="Cambria" w:hAnsi="Cambria" />
          <w:b w:val="0" />
          <w:i w:val="0" />
          <w:smallCaps w:val="0" />
          <w:strike w:val="0" />
          <w:color w:val="000000" />
          <w:sz w:val="24" />
          <w:szCs w:val="24" />
          <w:u w:val="none" />
          <w:shd w:fill="auto" w:val="clear" />
          <w:vertAlign w:val="baseline" />
          <w:rtl w:val="0" />
        </w:rPr>
        <w:t xml:space="preserve">Start</w:t>
      </w:r>
      <w:commentRangeEnd w:id="0" />
      <w:r w:rsidDel="00000000" w:rsidR="00000000" w:rsidRPr="00000000">
        <w:commentReference w:id="0" />
      </w:r>
      <w:commentRangeEnd w:id="1" />
      <w:r w:rsidDel="00000000" w:rsidR="00000000" w:rsidRPr="00000000">
        <w:commentReference w:id="1" />
      </w:r>
      <w:r w:rsidDel="00000000" w:rsidR="00000000" w:rsidRPr="00000000">
        <w:rPr>
          <w:rtl w:val="0" />
        </w:rPr>
      </w:r>
    </w:p>
    <w:sectPr>
      <w:pgSz w:h="15840" w:w="12240" w:orient="portrait" />
      <w:pgMar w:bottom="1440" w:top="1440" w:left="1440" w:right="1440" w:header="360" w:footer="360" />
      <w:pgNumType w:start="1" />
    </w:sectPr>
  </w:body>
</w:document>

The one generated by pandoc:

<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" xmlns:pic="http://schemas.openxmlformats.org/drawingml/2006/picture" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing">
  <w:body>
    <w:p>
      <w:pPr>
        <w:pStyle w:val="FirstParagraph" />
      </w:pPr>
      <w:commentRangeStart w:id="0" />
      <w:commentRangeStart w:id="1" />
      <w:r>
        <w:t xml:space="preserve">Start</w:t>
      </w:r>
      <w:commentRangeEnd w:id="0" />
      <w:r>
        <w:rPr>
          <w:rStyle w:val="CommentReference" />
        </w:rPr>
        <w:commentReference w:id="0" />
      </w:r>
    </w:p>
    <w:sectPr />
  </w:body>
</w:document>

It seems that the reference to the reply comment is missing; in the original file we have this:

      <w:commentRangeEnd w:id="1" />
      <w:r w:rsidDel="00000000" w:rsidR="00000000" w:rsidRPr="00000000">
        <w:commentReference w:id="1" />
      </w:r>

Pandoc version? Pandoc API version (taken from the JSON output): 1.22.1. Also reproduced with pandoc 2.18 (pandoc API version 1.22.2). OS version: macOS Monterey 12.4, M1 Mac

jgm commented 2 years ago

@jkr can you see what is going on here?

jgm commented 2 years ago

@dawidreedsy before we bother @jkr to look at this, could you try with pandoc 2.18, the latest version? I noticed that you're using an ancient version of pandoc.

dawidreedsy commented 2 years ago

@jgm Yeah sure, give me a few minutes. FYI, I took the version from the pandoc JSON:

  "pandoc-api-version": [
    1,
    22,
    1
  ],

but when I run pandoc --version I get

pandoc 2.17.1.1
Compiled with pandoc-types 1.22.1, texmath 0.12.4, skylighting 0.12.2,
citeproc 0.6.0.1, ipynb 0.2

dawidreedsy commented 2 years ago

@jgm Used the latest version:

pandoc 2.18
Compiled with pandoc-types 1.22.2, texmath 0.12.5, skylighting 0.12.3,
citeproc 0.7, ipynb 0.2, hslua 2.2.0
Scripting engine: Lua 5.4

and I get the same result.

alecgibson commented 2 years ago

I think a part of the issue here is that the comment-end spans are nested within one another.

If I manually rearrange the JSON so that they're serial (not nested), like the comment-start, then I get the correct output:

{
  "pandoc-api-version": [
    1,
    22,
    2
  ],
  "meta": {},
  "blocks": [
    {
      "t": "Para",
      "c": [
        {
          "t": "Span",
          "c": [
            [
              "",
              [
                "comment-start"
              ],
              [
                [
                  "id",
                  "0"
                ],
                [
                  "author",
                  "Dawid Kisielewski"
                ],
                [
                  "date",
                  "2022-07-18T15:20:46Z"
                ]
              ]
            ],
            [
              {
                "t": "Str",
                "c": "Comment"
              }
            ]
          ]
        },
        {
          "t": "Span",
          "c": [
            [
              "",
              [
                "comment-start"
              ],
              [
                [
                  "id",
                  "1"
                ],
                [
                  "author",
                  "Dawid Kisielewski"
                ],
                [
                  "date",
                  "2022-07-18T15:20:50Z"
                ]
              ]
            ],
            [
              {
                "t": "Str",
                "c": "Reply"
              }
            ]
          ]
        },
        {
          "t": "Str",
          "c": "Start"
        },
        {
          "t": "Span",
          "c": [
            [
              "",
              [
                "comment-end"
              ],
              [
                [
                  "id",
                  "0"
                ]
              ]
            ],
            []
          ]
        },
        {
          "t": "Span",
          "c": [
            [
              "",
              [
                "comment-end"
              ],
              [
                [
                  "id",
                  "1"
                ]
              ]
            ],
            []
          ]
        }
      ]
    }
  ]
}

Here's the resulting doc:

Screenshot 2022-08-16 at 11 30 06

after.docx

dawidreedsy commented 2 years ago

@alecgibson It is actually not the correct output, as there should be one comment with a reply, not two separate comments.