Support for NumPy-style docstrings

At my company, we actually use autodoc2 internally to build our documentation. While our full documentation build system is a little bespoke (relying, e.g., on a Bazel wrapper for Sphinx), we did early on decide that we want to have the ability to have all of our docs written using MyST-style Markdown while also using the (IMO more readable) NumpyDoc style for docstrings.

Despite the fact that NumpyDoc was designed with rST in mind, we are overall very happy with this decision, and so I got permission to share the wrapper code we use internally to produce these results.

Basically, the code below allows you to use both the standard NumpyDoc sections and the standard "parameter list" syntax from NumpyDoc, while preserving the ability to use MyST Markdown within the body of each parameter and within each section. It does this by using the NumpyDoc parser to convert the section titles and parameter lists into regular Markdown titles and lists before forwarding the string to autodoc's parser.

If I get a green light from whoever the maintainer is here, I'll happily clean this up to match any repo-specific style rules and send it in as a PR. In the meantime, these are the autodoc settings we use; feel free to use them directly until this (hopefully) gets officially incorporated. Note that there are some internal behaviors that you may want to exclude (such as the fact that our style guide forbids "Other Parameters" sections, and so we hackily use this section title to allow us to have an "Output Files" section instead without having to patch the NumpyDoc parser).

# In our `conf.py`.
# Packages to document: one path string per entry of `modules_to_autodoc`, resolved
# against `relative_workspace_root` (both names are defined elsewhere in `conf.py`).
autodoc2_packages = [
    str(relative_workspace_root / module) for module in modules_to_autodoc
]
# Render the generated API pages as MyST Markdown.
autodoc2_render_plugin = "myst"
# (regex, parser module) pairs. ".*" applies the hybrid NumpyDoc/MyST parser module
# shown below to every documented object.
autodoc2_docstring_parser_regexes = [
    (
        ".*",
        "digital_biology.building.python.sphinx.docstrings",
    ),
]

"""
Helpers for parsing and rendering docstrings in Sphinx.

This file is located at `digital_biology/building/python/sphinx/docstrings.py` internally.

Copyright 2023 Digital Biology, Inc.
SPDX-License-Identifier: Apache-2.0
"""
import logging
import re
import textwrap
from typing import Final

from docutils import nodes
from myst_parser.parsers.sphinx_ import MystParser
from numpydoc.docscrape import NumpyDocString, Parameter

# Module-level logger (not referenced by any of the code visible in this file).
logger = logging.getLogger(__name__)

# NumpyDoc section titles that `to_pure_markdown` renders as Markdown bullet lists
# via `render_parameter_section`, in the order they are emitted.
# NOTE: "Other Parameters" is hijacked to carry the "Output Files" section (see
# `replace_output_files_title`).
_PARAMETERS_SECTIONS: Final[tuple[str, ...]] = (
    "Parameters",
    "Returns",
    "Yields",
    "Receives",
    "Other Parameters",
    "Raises",
    "Warns",
    "Attributes",
)
# NumpyDoc section titles that `to_pure_markdown` renders as free-form text via
# `render_regular_section`, in the order they are emitted.
_REGULAR_SECTIONS: Final[tuple[str, ...]] = (
    "Warnings",
    "Notes",
    "References",
    "Examples",
)

# Pattern for one title line containing "Other Parameters" followed by an underline
# line of at least two dashes and a trailing newline.
# NOTE: compiled without `re.DOTALL`, so the leading `.*` never crosses a newline;
# the pattern has to be applied with `search` (or started on the title line itself)
# to find a title that is not on the very first line of the text.
_OTHER_PARAMETERS = re.compile(".*Other Parameters *\n *--+ *\n")
"""
Regular expression to check if the "Other Parameters" section has been used.

We cannot allow users to include "Other Parameters" section, because we hijack it to
easily render our "Output Files" section without having to patch the NumpyDoc parser.

Matches text that looks like
:::
    Other Parameters
    ----------------
:::
"""
# Splits a docstring around its "Output Files" title:
#   before  - everything up to the title's indentation (non-greedy, so the first
#             "Output Files" title in the text is the one targeted),
#   padding - the spaces indenting the title,
#   after   - everything following the dashes of the underline.
# `re.DOTALL` lets `before`/`after` span multiple lines. `re.MULTILINE` has no effect
# here because the pattern contains no `^` or `$` anchors.
_OUTPUT_SECTION_TITLE = re.compile(
    "(?P<before>.*?)(?P<padding> *)Output Files\n *-+(?P<after>.*)",
    re.MULTILINE | re.DOTALL,
)
"""
Regular expression to extract the "Output Files" section title in rST format.

We use this to pretend the "Output Files" section title is actually "Other Parameters"
before passing it to the NumpyDoc parser.

Matches text that looks like
:::
{before}
{padding}Output Files
    -------
{after}
:::
"""

# Type aliases for the section payloads consumed by the render helpers in this file.
# The "See Also" aliases nest as: section -> entries -> (references, relationship).

# A single referenced name paired with a second slot that `render_see_also` ignores.
SeeAlsoReference = tuple[str, None]
"""In all examples given, there is a "None" here, like (numpy.dot, None)."""
SeeAlsoRelationship = list[str]
"""The (optional) relationship is empty if not provided, else one str per line."""
SingleSeeAlso = tuple[list[SeeAlsoReference], SeeAlsoRelationship]
"""One "entry" in the See Also section."""
# NOTE(review): the class imported by this file is
# `numpydoc.docscrape.NumpyDocString`; the name in the string below looks truncated.
SeeAlsoSection = list[SingleSeeAlso]
"""The full "See Also" section, as returned by `numpydoc.docscrape.NumpyDoc`."""

# Payload of the free-form sections (and of the summary / extended summary).
RegularSection = list[str]
"""One list element per (unstripped) line of input."""

def render_see_also(see_also: SingleSeeAlso) -> str:
    """
    Render one "See Also" entry as a single line of text.

    The referenced names are joined with ", ". When the entry carries relationship
    lines, they are joined with single spaces and appended after ": ".
    """
    targets, description_lines = see_also
    names = [name for name, _unused in targets]
    rendered = ", ".join(names)
    if not description_lines:
        return rendered
    return rendered + ": " + " ".join(description_lines)

def render_see_also_section(section: SeeAlsoSection) -> str:
    """Render every "See Also" entry as its own Markdown bullet, one per line."""
    bullets = [f"- {render_see_also(entry)}" for entry in section]
    return "\n".join(bullets)

def render_regular_section(section: RegularSection) -> str:
    """
    Render a free-form section as text.

    The lines are joined with newlines and their common leading whitespace is
    removed, so the body can be handed to the Markdown parser as-is.
    """
    joined = "\n".join(section)
    return textwrap.dedent(joined)

def render_parameter(parameter: Parameter) -> str:
    """
    Render one NumpyDoc ``Parameter`` as a single line of Markdown.

    The result has the shape ``**name** (_type_): description``, where each part is
    only emitted when the corresponding field is non-empty. Asterisks in the name
    and underscores in the type are backslash-escaped so they are not read as
    emphasis markers; the description lines are joined with single spaces.
    """
    pieces = []
    if parameter.name:
        pieces.append("**" + parameter.name.replace("*", r"\*") + "**")
    if parameter.type:
        pieces.append(" (_" + parameter.type.replace("_", r"\_") + "_)")
    if parameter.desc:
        pieces.append(": " + " ".join(parameter.desc))
    return "".join(pieces)

def render_parameter_section(section: list[Parameter]) -> str:
    """Render each parsed parameter as its own Markdown bullet, one per line."""
    bullets = [f"- {render_parameter(entry)}" for entry in section]
    return "\n".join(bullets)

def _report_errors_in_docstring(doc: str, document: nodes.document) -> None:
    """
    Warn about parameter names that contain a colon.

    The docstring is parsed with NumpyDoc and every entry of every section listed in
    `_PARAMETERS_SECTIONS` is inspected. A colon inside the parsed name means no
    space was left before the colon, so the type annotation ended up in the name.

    Parameters
    ----------
    doc : str
        The docstring to check.
    document : nodes.document
        The docutils document whose reporter receives the warnings.
    """
    numpy_doc = NumpyDocString(doc)
    for title in _PARAMETERS_SECTIONS:
        for entry in numpy_doc[title]:
            if ":" not in entry.name:
                continue
            message = f"Found colon in parameter name ({entry.name}), please leave a space between the parameter name and the colon if you meant this to be a type annotation."  # NOQA: E501
            # If we don't explicitly pass "source" here, then
            # `docutils.utils.system_message` will try to find both the source and
            # line number. Unfortunately, `autodoc2.sphinx.docstring` patches the
            # `get_source_and_line` function, causing this to fail for our message
            # since we aren't tracking the line the same way `autodoc2` does. To
            # prevent all this mess, we just pass "source".
            document.reporter.warning(message, source=document.current_source)

def to_pure_markdown(doc: str, use_other_params_as_outputs: bool = True) -> str:
    """
    Convert a hybrid NumpyDoc/MyST docstring to pure Markdown.

    The docstring is parsed with `numpydoc.docscrape.NumpyDocString`. The summary and
    extended summary are emitted first, then every non-empty section as a level-1
    Markdown heading followed by its rendered body. Pieces are separated by one
    blank line, and no leading blank lines are produced when the summary is missing.

    Sections are emitted in this order: the titles in `_PARAMETERS_SECTIONS` (as
    bullet lists), "See Also" (as a bullet list), then the titles in
    `_REGULAR_SECTIONS` (as free-form text).

    Parameters
    ----------
    doc : str
        The docstring to convert.
    use_other_params_as_outputs : bool
        When true, the "Other Parameters" section is rendered under the heading
        "Output Files" instead of its own title.

    Returns
    -------
    str
        The Markdown text, ready to be handed to the MyST parser.
    """
    parsed = NumpyDocString(doc)
    blocks: list[str] = []
    if summary := parsed["Summary"]:
        blocks.append(render_regular_section(summary))  # *Can* be multiple lines.
    if extended_summary := parsed["Extended Summary"]:
        blocks.append(render_regular_section(extended_summary))
    for section_title in _PARAMETERS_SECTIONS:
        section = parsed[section_title]
        if not section:
            continue
        heading = section_title
        if section_title == "Other Parameters" and use_other_params_as_outputs:
            heading = "Output Files"
        blocks.append(f"# {heading}\n\n" + render_parameter_section(section))
    # "See Also" has its own parsed shape (see `SeeAlsoSection`), so it cannot go
    # through either generic loop. Without this branch the section is parsed but
    # silently dropped from the output.
    if see_also := parsed["See Also"]:
        blocks.append("# See Also\n\n" + render_see_also_section(see_also))
    for section_title in _REGULAR_SECTIONS:
        section = parsed[section_title]
        if not section:
            continue
        blocks.append(f"# {section_title}\n\n" + render_regular_section(section))
    return "\n\n".join(blocks)

def replace_output_files_title(doc: str, source: str | None) -> str:
    """
    Replace "Output Files" section name with "Other Parameters", as a parsing hack.

    As a side effect, this means we cannot allow users to use the "Other Parameters"
    section.

    Parameters
    ----------
    doc : str
        The docstring to cleanup.
    source : str | None
        A string describing the source file, if available. Otherwise '[UNKNOWN]' will be
        printed.

    Returns
    -------
    str
        A docstring ready to hand to `numpydoc.docscrape.NumpyDocString`. It is
        returned unchanged when it has no "Output Files" section title.

    Raises
    ------
    ValueError
        If the docstring already contains an "Other Parameters" section title.
    """
    # `search`, not `match`: the pattern is compiled without `re.DOTALL`, so a match
    # anchored at the start of the docstring could only ever see a title sitting on
    # the very first line and would miss every real "Other Parameters" section.
    if _OTHER_PARAMETERS.search(doc):
        source = source or "[UNKNOWN]"
        raise ValueError(
            f"Encountered illegal section title 'Other Parameters' when processing source file: {source}\n"  # NOQA: E501
            "At Digital Biology, we do not use this section. Put **all** parameters in the main 'Parameters' section."  # NOQA: E501
        )
    to_replace = _OUTPUT_SECTION_TITLE.match(doc)
    if to_replace:
        group_dict = to_replace.groupdict()
        before = group_dict["before"]
        padding = group_dict["padding"]
        after = group_dict["after"]
        # Re-emit the title and an underline of matching length at the original
        # indentation so NumpyDoc recognises the section.
        doc = f"{before}{padding}Other Parameters\n{padding}----------------\n{after}"
    return doc

class MystNumpyDocHybridParser(MystParser):
    """MyST parser for docstrings laid out in NumpyDoc style with Markdown bodies."""

    def parse(self, inputstring: str, document: nodes.document) -> None:
        """
        Convert the hybrid docstring to pure Markdown, then parse it as MyST.

        Parameters
        ----------
        inputstring : str
            The docstring to parse. Name intentionally chosen to match internal Sphinx
            usage.
        document : nodes.document
            The root docutils node to add AST elements to.
        """
        # 1. Disguise "Output Files" as "Other Parameters" so NumpyDoc accepts it.
        numpy_ready = replace_output_files_title(inputstring, document.source)
        # 2. Surface common authoring mistakes as warnings on the document.
        _report_errors_in_docstring(numpy_ready, document)
        # 3. Flatten the NumpyDoc structure into Markdown headings and lists.
        markdown = to_pure_markdown(numpy_ready, use_other_params_as_outputs=True)
        return super().parse(markdown, document)

# Module-level alias for the parser class. Presumably `Parser` is the name autodoc2
# looks up in a module listed in `autodoc2_docstring_parser_regexes` — confirm
# against the autodoc2 documentation.
Parser = MystNumpyDocHybridParser

@relativistic I was also searching for a Google-style parser, but without any results, so I took the interface from the @bruno-digitbio snippet and asked ChatGPT to generate the parser. Here it is:

import re

from docutils import nodes
from myst_parser.parsers.sphinx_ import MystParser

class GoogleStyleDocstringParser(MystParser):
    """
    MyST parser that first rewrites Google-style docstrings into MyST Markdown.

    Recognised sections are ``Args``, ``Returns``, ``Raises`` and ``Example(s)``.
    A section header must be alone on its line. ``Args``/``Returns``/``Raises``
    become reStructuredText field lists wrapped in ``eval-rst`` blocks; examples
    become a fenced code block. Everything before the first section header is kept
    as the description.

    Known limitations: argument types in parentheses (``name (type): ...``) are not
    supported, an ``Args``/``Returns``/``Raises`` body ends at its first blank line,
    and text in sections other than the four above is not rendered.
    """

    # Lookahead ending an ``Args``/``Returns``/``Raises`` body: the next unindented
    # "Word:" line, a blank line, or the end of the docstring.
    _BODY_END = r'(?=\n[A-Z][a-z]+:|\n\n|$)'
    # One "name: description" entry inside an ``Args``/``Raises`` body.
    _FIELD = r'\s*([\w_]+):\s*(.*?)\s*(?=\n\s*[\w_]+:|\n\n|$)'
    # Any section header: a capitalised word plus colon, alone on its line.
    _HEADER_LINE = r'^[ \t]*[A-Z][a-z]+:[ \t]*$'

    def parse(self, inputstring: str, document: nodes.document) -> None:
        """Rewrite ``inputstring`` to MyST Markdown and parse it into ``document``."""
        parsed_content = self._parse_google_style_docstring(inputstring)
        return super().parse(parsed_content, document)

    def _parse_google_style_docstring(self, docstring: str) -> str:
        """Return the MyST Markdown equivalent of a Google-style ``docstring``."""
        description, params, returns, raises, examples = self._extract_sections(docstring)
        blocks = [description]

        if params:
            fields = "".join(f":param {name}: {text}\n" for name, text in params)
            blocks.append("```{eval-rst}\n" + fields + "```")

        if returns:
            blocks.append("```{eval-rst}\n" + f":returns: {returns}\n" + "```")

        if raises:
            fields = "".join(f":raises {name}: {text}\n" for name, text in raises)
            blocks.append("```{eval-rst}\n" + fields + "```")

        if examples:
            blocks.append("**Examples**\n```\n" + examples + "\n```")

        # An empty description leaves leading blank lines behind; strip them.
        return "\n\n".join(blocks).strip()

    def _extract_sections(self, docstring: str):
        """Return ``(description, params, returns, raises, examples)``."""
        description = self._extract_description(docstring)
        params = self._extract_params(docstring)
        returns = self._extract_returns(docstring)
        raises = self._extract_raises(docstring)
        examples = self._extract_examples(docstring)
        return description, params, returns, raises, examples

    def _find_section(self, header: str, docstring: str):
        """
        Return the raw body of the section titled ``header``, or None if absent.

        ``header`` is a regex fragment. It must start its line, so "Args" does not
        match inside "Kwargs:", and the colon must end the line.
        """
        found = re.search(
            r'(?:^|\n)[ \t]*' + header + r':[ \t]*\n(.*?)' + self._BODY_END,
            docstring,
            re.S,
        )
        return found.group(1) if found else None

    def _extract_fields(self, header: str, docstring: str):
        """Return the ``(name, description)`` pairs listed in section ``header``."""
        body = self._find_section(header, docstring)
        if body is None:
            return []
        return re.findall(self._FIELD, body, re.S)

    def _extract_description(self, docstring: str) -> str:
        """
        Return all text that precedes the first section header.

        Cutting at the first header, not at the first blank line, keeps
        multi-paragraph descriptions intact.
        """
        header = re.search(self._HEADER_LINE, docstring, re.M)
        end = header.start() if header else len(docstring)
        return docstring[:end].strip()

    def _extract_params(self, docstring: str):
        """Return the ``(name, description)`` pairs of the ``Args`` section."""
        return self._extract_fields('Args', docstring)

    def _extract_returns(self, docstring: str):
        """Return the stripped ``Returns`` text, or None if missing or empty."""
        body = self._find_section('Returns', docstring)
        if body is None:
            return None
        # The body never contains a blank line (see `_BODY_END`), so stripping it
        # yields the whole return description.
        return body.strip() or None

    def _extract_raises(self, docstring: str):
        """Return the ``(exception, description)`` pairs of the ``Raises`` section."""
        return self._extract_fields('Raises', docstring)

    def _extract_examples(self, docstring: str) -> str:
        """
        Return the stripped body of the ``Example``/``Examples`` section.

        Blank lines are allowed inside examples, so the body runs until the next
        ``Args``/``Returns``/``Raises`` header or the end of the docstring. This
        stops it swallowing sections that are rendered separately.
        """
        found = re.search(
            r'(?:^|\n)[ \t]*Examples?:[ \t]*\n(.*?)'
            r'(?=\n[ \t]*(?:Args|Returns|Raises):[ \t]*(?:\n|$)|$)',
            docstring,
            re.S,
        )
        return found.group(1).strip() if found else ''

# Module-level alias for the parser class. Presumably `Parser` is the name autodoc2
# looks up in a module listed in `autodoc2_docstring_parser_regexes` — confirm
# against the autodoc2 documentation.
Parser = GoogleStyleDocstringParser

You can put the parser in a `docstrings_parser.py` file in the root directory and then reference it in `conf.py`:

# (regex, parser module) pairs. ".*" applies the `docstrings_parser` module to every
# documented object.
autodoc2_docstring_parser_regexes = [
    (
        ".*",
        "docstrings_parser",
    ),
]

Important: this version doesn't support arguments with types (e.g., `minimum (int): A port value greater or equal to 1024.`), as the type should already be specified in the function signature. So the supported format is:

  def connect_to_next_port(self, minimum: int) -> int:
    """Connects to the next available port.

    Args:
      minimum: A port value greater or equal to 1024.

    Returns:
      The new minimum port.

    Raises:
      ConnectionError: If no available port is found.
    """

Result

sphinx-extensions2 / sphinx-autodoc2

Support for NumPy-style docstrings #33